Copy disabled (too large)
Download .txt
Showing preview only (14,972K chars total). Download the full file to get everything.
Repository: AIGC-Audio/AudioGPT
Branch: main
Commit: a674543c537b
Files: 362
Total size: 14.2 MB
Directory structure:
gitextract_mez1vc95/
├── .gitignore
├── LICENSE
├── NeuralSeq/
│ ├── LICENSE
│ ├── README.md
│ ├── configs/
│ │ ├── config_base.yaml
│ │ ├── singing/
│ │ │ ├── base.yaml
│ │ │ └── fs2.yaml
│ │ └── tts/
│ │ ├── base.yaml
│ │ ├── base_zh.yaml
│ │ ├── emotion/
│ │ │ ├── base_text2mel.yaml
│ │ │ └── pre_align.py
│ │ ├── fs2.yaml
│ │ ├── hifigan.yaml
│ │ ├── libritts/
│ │ │ ├── base_text2mel.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── pre_align.py
│ │ │ └── pwg.yaml
│ │ ├── lj/
│ │ │ ├── base_mel2wav.yaml
│ │ │ ├── base_text2mel.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── hifigan.yaml
│ │ │ └── pwg.yaml
│ │ └── pwg.yaml
│ ├── data_gen/
│ │ └── tts/
│ │ ├── base_binarizer.py
│ │ ├── base_binarizer_emotion.py
│ │ ├── base_preprocess.py
│ │ ├── binarizer_zh.py
│ │ ├── data_gen_utils.py
│ │ ├── emotion/
│ │ │ ├── audio.py
│ │ │ ├── inference.py
│ │ │ ├── model.py
│ │ │ ├── params_data.py
│ │ │ ├── params_model.py
│ │ │ └── test_emotion.py
│ │ ├── txt_processors/
│ │ │ ├── __init__.py
│ │ │ ├── base_text_processor.py
│ │ │ ├── en.py
│ │ │ ├── zh.py
│ │ │ └── zh_g2pM.py
│ │ └── wav_processors/
│ │ ├── __init__.py
│ │ ├── base_processor.py
│ │ └── common_processors.py
│ ├── egs/
│ │ ├── datasets/
│ │ │ └── audio/
│ │ │ ├── emotion/
│ │ │ │ ├── base_text2mel.yaml
│ │ │ │ └── pre_align.py
│ │ │ ├── libritts/
│ │ │ │ ├── base_text2mel.yaml
│ │ │ │ ├── fs2.yaml
│ │ │ │ ├── pre_align.py
│ │ │ │ └── pwg.yaml
│ │ │ ├── lj/
│ │ │ │ ├── base_mel2wav.yaml
│ │ │ │ ├── preprocess.py
│ │ │ │ └── pwg.yaml
│ │ │ └── vctk/
│ │ │ ├── base_mel2wav.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── pre_align.py
│ │ │ └── pwg.yaml
│ │ └── egs_bases/
│ │ ├── config_base.yaml
│ │ ├── svs/
│ │ │ ├── base.yaml
│ │ │ ├── lj_ds_beta6.yaml
│ │ │ ├── midi/
│ │ │ │ ├── cascade/
│ │ │ │ │ └── opencs/
│ │ │ │ │ ├── aux_rel.yaml
│ │ │ │ │ ├── ds60_rel.yaml
│ │ │ │ │ └── opencpop_statis.yaml
│ │ │ │ ├── e2e/
│ │ │ │ │ ├── opencpop/
│ │ │ │ │ │ ├── ds1000-10dil.yaml
│ │ │ │ │ │ ├── ds1000.yaml
│ │ │ │ │ │ └── ds100_adj_rel.yaml
│ │ │ │ │ └── popcs/
│ │ │ │ │ └── ds100_adj_rel.yaml
│ │ │ │ └── pe.yaml
│ │ │ ├── popcs_ds_beta6.yaml
│ │ │ ├── popcs_ds_beta6_offline.yaml
│ │ │ └── popcs_fs2.yaml
│ │ └── tts/
│ │ ├── base.yaml
│ │ ├── base_zh.yaml
│ │ ├── fs2.yaml
│ │ ├── fs2_adv.yaml
│ │ ├── ps.yaml
│ │ ├── ps_flow.yaml
│ │ ├── ps_flow_small.yaml
│ │ └── vocoder/
│ │ ├── base.yaml
│ │ ├── hifigan.yaml
│ │ └── pwg.yaml
│ ├── gitattributes
│ ├── inference/
│ │ ├── svs/
│ │ │ ├── base_svs_infer.py
│ │ │ ├── ds_cascade.py
│ │ │ ├── ds_e2e.py
│ │ │ └── opencpop/
│ │ │ ├── cpop_pinyin2ph.txt
│ │ │ └── map.py
│ │ └── tts/
│ │ ├── GenerSpeech.py
│ │ ├── PortaSpeech.py
│ │ └── base_tts_infer.py
│ ├── modules/
│ │ ├── GenerSpeech/
│ │ │ ├── config/
│ │ │ │ └── generspeech.yaml
│ │ │ ├── model/
│ │ │ │ ├── generspeech.py
│ │ │ │ ├── glow_modules.py
│ │ │ │ ├── mixstyle.py
│ │ │ │ ├── prosody_util.py
│ │ │ │ └── wavenet.py
│ │ │ └── task/
│ │ │ ├── dataset.py
│ │ │ └── generspeech.py
│ │ ├── __init__.py
│ │ ├── commons/
│ │ │ ├── align_ops.py
│ │ │ ├── common_layers.py
│ │ │ ├── conv.py
│ │ │ ├── espnet_positional_embedding.py
│ │ │ ├── normalizing_flow/
│ │ │ │ ├── glow_modules.py
│ │ │ │ ├── res_flow.py
│ │ │ │ └── utils.py
│ │ │ ├── rel_transformer.py
│ │ │ ├── ssim.py
│ │ │ ├── transformer.py
│ │ │ └── wavenet.py
│ │ ├── diff/
│ │ │ ├── candidate_decoder.py
│ │ │ ├── diffusion.py
│ │ │ ├── net.py
│ │ │ └── shallow_diffusion_tts.py
│ │ ├── diffsinger_midi/
│ │ │ └── fs2.py
│ │ ├── fastspeech/
│ │ │ ├── fs2.py
│ │ │ ├── pe.py
│ │ │ └── tts_modules.py
│ │ ├── hifigan/
│ │ │ ├── hifigan.py
│ │ │ └── mel_utils.py
│ │ ├── parallel_wavegan/
│ │ │ ├── __init__.py
│ │ │ ├── layers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── causal_conv.py
│ │ │ │ ├── pqmf.py
│ │ │ │ ├── residual_block.py
│ │ │ │ ├── residual_stack.py
│ │ │ │ ├── tf_layers.py
│ │ │ │ └── upsample.py
│ │ │ ├── losses/
│ │ │ │ ├── __init__.py
│ │ │ │ └── stft_loss.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── melgan.py
│ │ │ │ ├── parallel_wavegan.py
│ │ │ │ └── source.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ └── radam.py
│ │ │ ├── stft_loss.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ └── syntaspeech/
│ │ ├── multi_window_disc.py
│ │ ├── syntactic_graph_buider.py
│ │ ├── syntactic_graph_encoder.py
│ │ └── syntaspeech.py
│ ├── tasks/
│ │ ├── base_task.py
│ │ ├── run.py
│ │ ├── svs/
│ │ │ ├── __init__.py
│ │ │ ├── diffsinger_task.py
│ │ │ ├── diffspeech_task.py
│ │ │ └── task.py
│ │ ├── tts/
│ │ │ ├── dataset_utils.py
│ │ │ ├── fs2.py
│ │ │ ├── fs2_adv.py
│ │ │ ├── fs2_utils.py
│ │ │ ├── pe.py
│ │ │ ├── ps.py
│ │ │ ├── ps_adv.py
│ │ │ ├── ps_flow.py
│ │ │ ├── synta.py
│ │ │ ├── tts.py
│ │ │ ├── tts_base.py
│ │ │ └── tts_utils.py
│ │ └── vocoder/
│ │ ├── dataset_utils.py
│ │ └── vocoder_base.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── ckpt_utils.py
│ │ ├── cwt.py
│ │ ├── dtw.py
│ │ ├── hparams.py
│ │ ├── indexed_datasets.py
│ │ ├── multiprocess_utils.py
│ │ ├── os_utils.py
│ │ ├── pitch_utils.py
│ │ ├── pl_utils.py
│ │ ├── plot.py
│ │ ├── text_encoder.py
│ │ ├── text_norm.py
│ │ ├── training_utils.py
│ │ └── tts_utils.py
│ └── vocoders/
│ ├── __init__.py
│ ├── base_vocoder.py
│ ├── hifigan.py
│ ├── pwg.py
│ └── vocoder_utils.py
├── README.md
├── assets/
│ └── README.md
├── audio-chatgpt.py
├── audio_detection/
│ ├── __init__.py
│ ├── audio_infer/
│ │ ├── __init__.py
│ │ ├── metadata/
│ │ │ ├── black_list/
│ │ │ │ ├── groundtruth_weak_label_evaluation_set.csv
│ │ │ │ └── groundtruth_weak_label_testing_set.csv
│ │ │ └── class_labels_indices.csv
│ │ ├── pytorch/
│ │ │ ├── evaluate.py
│ │ │ ├── finetune_template.py
│ │ │ ├── inference.py
│ │ │ ├── losses.py
│ │ │ ├── main.py
│ │ │ ├── models.py
│ │ │ └── pytorch_utils.py
│ │ └── utils/
│ │ ├── config.py
│ │ ├── crash.py
│ │ ├── create_black_list.py
│ │ ├── create_indexes.py
│ │ ├── data_generator.py
│ │ ├── dataset.py
│ │ ├── plot_for_paper.py
│ │ ├── plot_statistics.py
│ │ └── utilities.py
│ └── target_sound_detection/
│ └── src/
│ ├── models.py
│ └── utils.py
├── audio_to_text/
│ ├── __init__.py
│ ├── captioning/
│ │ ├── __init__.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── decoder.py
│ │ │ ├── encoder.py
│ │ │ ├── transformer_model.py
│ │ │ └── utils.py
│ │ └── utils/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── bert/
│ │ │ ├── create_sent_embedding.py
│ │ │ └── create_word_embedding.py
│ │ ├── build_vocab.py
│ │ ├── build_vocab_ltp.py
│ │ ├── build_vocab_spacy.py
│ │ ├── eval_round_robin.py
│ │ ├── fasttext/
│ │ │ └── create_word_embedding.py
│ │ ├── lr_scheduler.py
│ │ ├── model_eval_diff.py
│ │ ├── predict_nn.py
│ │ ├── remove_optimizer.py
│ │ ├── report_results.py
│ │ ├── tokenize_caption.py
│ │ ├── train_util.py
│ │ └── word2vec/
│ │ └── create_word_embedding.py
│ └── inference_waveform.py
├── download.sh
├── mono2binaural/
│ └── src/
│ ├── models.py
│ ├── utils.py
│ └── warping.py
├── requirements.txt
├── run.md
├── sound_extraction/
│ ├── model/
│ │ ├── LASSNet.py
│ │ ├── film.py
│ │ ├── modules.py
│ │ ├── resunet_film.py
│ │ └── text_encoder.py
│ └── utils/
│ ├── create_mixtures.py
│ ├── stft.py
│ └── wav_io.py
└── text_to_audio/
└── Make_An_Audio/
├── configs/
│ ├── img_to_audio/
│ │ └── img2audio_args.yaml
│ ├── inpaint/
│ │ └── txt2audio_args.yaml
│ └── text_to_audio/
│ ├── clap_args.yaml
│ ├── hifigan_args.yaml
│ └── txt2audio_args.yaml
├── ldm/
│ ├── data/
│ │ └── extract_mel_spectrogram.py
│ ├── lr_scheduler.py
│ ├── models/
│ │ ├── autoencoder.py
│ │ ├── autoencoder_multi.py
│ │ └── diffusion/
│ │ ├── __init__.py
│ │ ├── classifier.py
│ │ ├── ddim.py
│ │ ├── ddpm.py
│ │ ├── ddpm_audio.py
│ │ ├── ddpm_audio_inpaint.py
│ │ └── plms.py
│ ├── modules/
│ │ ├── attention.py
│ │ ├── diffusionmodules/
│ │ │ ├── __init__.py
│ │ │ ├── custom_openaimodel.py
│ │ │ ├── model.py
│ │ │ ├── openaimodel.py
│ │ │ └── util.py
│ │ ├── discriminator/
│ │ │ ├── model.py
│ │ │ └── multi_window_disc.py
│ │ ├── distributions/
│ │ │ ├── __init__.py
│ │ │ └── distributions.py
│ │ ├── ema.py
│ │ ├── encoders/
│ │ │ ├── CLAP/
│ │ │ │ ├── CLAPWrapper.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio.py
│ │ │ │ ├── clap.py
│ │ │ │ ├── config.yml
│ │ │ │ └── utils.py
│ │ │ ├── __init__.py
│ │ │ ├── modules.py
│ │ │ └── open_clap/
│ │ │ ├── __init__.py
│ │ │ ├── bert.py
│ │ │ ├── factory.py
│ │ │ ├── feature_fusion.py
│ │ │ ├── htsat.py
│ │ │ ├── linear_probe.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── model_configs/
│ │ │ │ ├── HTSAT-base.json
│ │ │ │ ├── HTSAT-large.json
│ │ │ │ ├── HTSAT-tiny-win-1536.json
│ │ │ │ ├── HTSAT-tiny.json
│ │ │ │ ├── PANN-10.json
│ │ │ │ ├── PANN-14-fmax-18k.json
│ │ │ │ ├── PANN-14-fmax-8k-20s.json
│ │ │ │ ├── PANN-14-tiny-transformer.json
│ │ │ │ ├── PANN-14-win-1536.json
│ │ │ │ ├── PANN-14.json
│ │ │ │ ├── PANN-6.json
│ │ │ │ ├── RN101-quickgelu.json
│ │ │ │ ├── RN101.json
│ │ │ │ ├── RN50-quickgelu.json
│ │ │ │ ├── RN50.json
│ │ │ │ ├── RN50x16.json
│ │ │ │ ├── RN50x4.json
│ │ │ │ ├── ViT-B-16.json
│ │ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ │ ├── ViT-B-32.json
│ │ │ │ └── ViT-L-14.json
│ │ │ ├── openai.py
│ │ │ ├── pann_model.py
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ ├── utils.py
│ │ │ └── version.py
│ │ ├── image_degradation/
│ │ │ ├── __init__.py
│ │ │ ├── bsrgan.py
│ │ │ ├── bsrgan_light.py
│ │ │ └── utils_image.py
│ │ ├── losses_audio/
│ │ │ ├── __init__.py
│ │ │ ├── contperceptual.py
│ │ │ ├── contperceptual_dis.py
│ │ │ ├── lpaps.py
│ │ │ ├── vggishish/
│ │ │ │ ├── config/
│ │ │ │ │ ├── melception.yaml
│ │ │ │ │ └── vggish.yaml
│ │ │ │ ├── data/
│ │ │ │ │ ├── train_means_stds_melspec_10s_22050hz.txt
│ │ │ │ │ ├── vggsound.csv
│ │ │ │ │ ├── vggsound_test.txt
│ │ │ │ │ ├── vggsound_train.txt
│ │ │ │ │ └── vggsound_valid.txt
│ │ │ │ ├── dataset.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── metrics.py
│ │ │ │ ├── model.py
│ │ │ │ ├── predict.py
│ │ │ │ ├── train_melception.py
│ │ │ │ ├── train_vggishish.py
│ │ │ │ └── transforms.py
│ │ │ └── vqperceptual.py
│ │ └── x_transformer.py
│ └── util.py
├── useful_ckpts/
│ └── CLAP/
│ └── config.yml
├── vocoder/
│ ├── bigvgan/
│ │ ├── __init__.py
│ │ ├── activations.py
│ │ ├── alias_free_torch/
│ │ │ ├── __init__.py
│ │ │ ├── act.py
│ │ │ ├── filter.py
│ │ │ └── resample.py
│ │ └── models.py
│ ├── hifigan/
│ │ └── modules.py
│ └── logs/
│ └── hifi_0127/
│ └── args.yml
└── wav_evaluation/
└── models/
├── CLAPWrapper.py
├── __init__.py
├── audio.py
├── clap.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# JetBrains PyCharm IDE
.idea/
.github/
.circleci/
# Byte-compiled / optimized / DLL files
*__pycache__/
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# macOS dir files
.DS_Store
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Checkpoints
checkpoints
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# Generated files
/fairseq/temporal_convolution_tbc
/fairseq/modules/*_layer/*_forward.cu
/fairseq/modules/*_layer/*_backward.cu
/fairseq/version.py
# data
data-bin/
# reranking
/examples/reranking/rerank_data
# Cython-generated C++ source files
/fairseq/data/data_utils_fast.cpp
/fairseq/data/token_block_utils_fast.cpp
# VSCODE
.vscode/ftp-sync.json
.vscode/settings.json
# Experimental Folder
experimental/*
# Weights and Biases logs
wandb/
# Hydra artifacts
nohup.out
multirun
outputs
================================================
FILE: LICENSE
================================================
================================================
FILE: NeuralSeq/LICENSE
================================================
MIT License
Copyright (c) 2021 Jinglin Liu
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: NeuralSeq/README.md
================================================
In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, DiffSinger
================================================
FILE: NeuralSeq/configs/config_base.yaml
================================================
# task
binary_data_dir: ''
work_dir: '' # experiment directory.
infer: false # infer
seed: 1234
debug: false
save_codes:
- configs
- modules
- tasks
- utils
- usr
#############
# dataset
#############
ds_workers: 1
test_num: 100
valid_num: 100
endless_ds: false
sort_by_len: true
#########
# train and eval
#########
load_ckpt: ''
save_ckpt: true
save_best: false
num_ckpt_keep: 3
clip_grad_norm: 0
accumulate_grad_batches: 1
log_interval: 100
num_sanity_val_steps: 5 # steps of validation at the beginning
check_val_every_n_epoch: 10
val_check_interval: 2000
max_epochs: 1000
max_updates: 160000
max_tokens: 31250
max_sentences: 100000
max_eval_tokens: -1
max_eval_sentences: -1
test_input_dir: ''
================================================
FILE: NeuralSeq/configs/singing/base.yaml
================================================
base_config:
- configs/tts/base.yaml
- configs/tts/base_zh.yaml
datasets: []
test_prefixes: []
test_num: 0
valid_num: 0
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
binarizer_cls: data_gen.singing.binarize.SingingBinarizer
pre_align_args:
use_tone: false # for ZH
forced_align: mfa
use_sox: true
hop_size: 128 # Hop size.
fft_size: 512 # FFT size.
win_size: 512 # Window size.
max_frames: 8000
fmin: 50 # Minimum freq in mel basis calculation.
fmax: 11025 # Maximum frequency in mel basis calculation.
pitch_type: frame
hidden_size: 256
mel_loss: "ssim:0.5|l1:0.5"
lambda_f0: 0.0
lambda_uv: 0.0
lambda_energy: 0.0
lambda_ph_dur: 0.0
lambda_sent_dur: 0.0
lambda_word_dur: 0.0
predictor_grad: 0.0
use_spk_embed: true
use_spk_id: false
max_tokens: 20000
max_updates: 400000
num_spk: 100
save_f0: true
use_gt_dur: true
use_gt_f0: true
================================================
FILE: NeuralSeq/configs/singing/fs2.yaml
================================================
base_config:
- configs/tts/fs2.yaml
- configs/singing/base.yaml
================================================
FILE: NeuralSeq/configs/tts/base.yaml
================================================
# task
base_config: configs/config_base.yaml
task_cls: ''
#############
# dataset
#############
raw_data_dir: ''
processed_data_dir: ''
binary_data_dir: ''
dict_dir: ''
pre_align_cls: ''
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
pre_align_args:
use_tone: true # for ZH
forced_align: mfa
use_sox: false
txt_processor: en
allow_no_txt: false
denoise: false
binarization_args:
shuffle: false
with_txt: true
with_wav: false
with_align: true
with_spk_embed: true
with_f0: true
with_f0cwt: true
loud_norm: false
endless_ds: true
reset_phone_dict: true
test_num: 100
valid_num: 100
max_frames: 1550
max_input_tokens: 1550
audio_num_mel_bins: 80
audio_sample_rate: 22050
hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
fmax: 7600 # To be increased/reduced depending on data.
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
min_level_db: -100
num_spk: 1
mel_vmin: -6
mel_vmax: 1.5
ds_workers: 4
#########
# model
#########
dropout: 0.1
enc_layers: 4
dec_layers: 4
hidden_size: 384
num_heads: 2
prenet_dropout: 0.5
prenet_hidden_size: 256
stop_token_weight: 5.0
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
ffn_act: gelu
ffn_padding: 'SAME'
###########
# optimization
###########
lr: 2.0
warmup_updates: 8000
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
weight_decay: 0
clip_grad_norm: 1
###########
# train and eval
###########
max_tokens: 30000
max_sentences: 100000
max_eval_sentences: 1
max_eval_tokens: 60000
train_set_name: 'train'
valid_set_name: 'valid'
test_set_name: 'test'
vocoder: pwg
vocoder_ckpt: ''
profile_infer: false
out_wav_norm: false
save_gt: false
save_f0: false
gen_dir_name: ''
use_denoise: false
================================================
FILE: NeuralSeq/configs/tts/base_zh.yaml
================================================
pre_align_args:
txt_processor: zh_g2pM
binarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer
================================================
FILE: NeuralSeq/configs/tts/emotion/base_text2mel.yaml
================================================
raw_data_dir: 'data/raw/ESD'
processed_data_dir: 'data/processed/emotion'
binary_data_dir: 'data/binary/emotion'
pre_align_cls: configs.tts.emotion.pre_align.EmoPreAlign
audio_sample_rate: 16000
binarization_args:
shuffle: true
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
use_spk_id: true
test_num: 200
num_spk: 10
pitch_type: frame
min_frames: 128
num_test_samples: 30
mel_loss: "ssim:0.5|l1:0.5"
vocoder_ckpt: ''
use_emotion: true
================================================
FILE: NeuralSeq/configs/tts/emotion/pre_align.py
================================================
import os
from data_gen.tts.base_preprocess import BasePreprocessor
import glob
import re
class EmoPreAlign(BasePreprocessor):
    """Metadata reader for an emotion-labelled speech corpus stored under ``self.raw_data_dir``.

    The corpus is read speaker by speaker from a fixed list of speaker ids;
    each speaker folder must contain a ``<spk>.txt`` transcript index.
    """

    def meta_data(self):
        """Yield ``(item_name, wav_fn, txt, spk, emotion)`` for every transcript line.

        Each line of ``<raw_data_dir>/<spk>/<spk>.txt`` has its whitespace runs
        (tabs, newlines, spaces) collapsed to single spaces and is then split
        on spaces. The first field is the item name, the second-to-last field
        is the emotion label, and everything in between is the text.

        NOTE(review): taking the emotion from ``[-2]`` relies on each line
        ending with whitespace (normally the newline), which produces a
        trailing empty field; a final line without a newline would be
        parsed differently. Confirm against the dataset files.
        """
        spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
        pattern = re.compile('[\t\n ]+')
        for spk in spks:
            # Open the transcript index in a context manager so the handle is
            # released as soon as the speaker has been processed.
            with open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r') as transcript:
                for line in transcript:
                    line = pattern.sub(' ', line)
                    if line == ' ':
                        # Blank line (only whitespace): nothing to emit.
                        continue
                    split_ = line.split(' ')
                    txt = ' '.join(split_[1: -2])
                    item_name = split_[0]
                    emotion = split_[-2]
                    wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
                    yield item_name, wav_fn, txt, spk, emotion
# Script entry point: when this file is executed directly, build an
# EmoPreAlign instance and call its process() method.
if __name__ == "__main__":
    EmoPreAlign().process()
================================================
FILE: NeuralSeq/configs/tts/fs2.yaml
================================================
base_config: configs/tts/base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task
# model
hidden_size: 256
dropout: 0.1
encoder_type: fft # fft|tacotron|tacotron2|conformer
encoder_K: 8 # for tacotron encoder
decoder_type: fft # fft|rnn|conv|conformer
use_pos_embed: true
# duration
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5
# pitch and energy
use_pitch_embed: true
pitch_type: ph # frame|ph|cwt
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8
pitch_ar: false
#pitch_embed_type: 0q
pitch_loss: 'l1' # l1|l2|ssim
pitch_norm: log
use_energy_embed: false
# reference encoder and speaker embedding
use_spk_id: false
use_split_spk_id: false
use_spk_embed: false
use_var_enc: false
lambda_commit: 0.25
ref_norm_layer: bn
pitch_enc_hidden_stride_kernel:
- 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
- 0,2,5
- 0,2,5
dur_enc_hidden_stride_kernel:
- 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
- 0,2,3
- 0,1,3
# mel
mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
# loss lambda
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
# train and eval
pretrain_fs_ckpt: ''
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_eval_sentences: 1
max_updates: 120000
num_valid_plots: 5
num_test_samples: 0
test_ids: []
use_gt_dur: false
use_gt_f0: false
# exp
dur_loss: mse # huber|mol
norm_type: gn
================================================
FILE: NeuralSeq/configs/tts/hifigan.yaml
================================================
base_config: configs/tts/pwg.yaml
task_cls: tasks.vocoder.hifigan.HifiGanTask
resblock: "1"
adam_b1: 0.8
adam_b2: 0.99
upsample_rates: [ 8,8,2,2 ]
upsample_kernel_sizes: [ 16,16,4,4 ]
upsample_initial_channel: 128
resblock_kernel_sizes: [ 3,7,11 ]
resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
lambda_mel: 45.0
max_samples: 8192
max_sentences: 16
generator_params:
lr: 0.0002 # Generator's learning rate.
aux_context_window: 0 # Context window size for auxiliary feature.
discriminator_optimizer_params:
lr: 0.0002 # Discriminator's learning rate.
================================================
FILE: NeuralSeq/configs/tts/libritts/base_text2mel.yaml
================================================
raw_data_dir: 'data/raw/LibriTTS'
processed_data_dir: 'data/processed/libritts'
binary_data_dir: 'data/binary/libritts'
pre_align_cls: configs.tts.libritts.pre_align.LibrittsPreAlign
binarization_args:
shuffle: true
use_spk_id: true
test_num: 200
num_spk: 2320
pitch_type: frame
min_frames: 128
num_test_samples: 30
mel_loss: "ssim:0.5|l1:0.5"
vocoder_ckpt: ''
================================================
FILE: NeuralSeq/configs/tts/libritts/fs2.yaml
================================================
base_config:
- configs/tts/fs2.yaml
- ./base_text2mel.yaml
================================================
FILE: NeuralSeq/configs/tts/libritts/pre_align.py
================================================
import os
from data_gen.tts.base_preprocess import BasePreprocessor
import glob
class LibrittsPreAlign(BasePreprocessor):
    """Metadata reader that pairs each ``.wav`` under ``self.raw_data_dir`` with its transcript."""

    def meta_data(self):
        """Yield one metadata dict per wav file found two directory levels below ``raw_data_dir``.

        For every ``<raw_data_dir>/*/*/*.wav`` (in sorted order) the sibling
        ``<name>.normalized.txt`` file is opened and its first line is used as
        the text (the trailing newline, if any, is kept as in the file). The
        speaker name is the part of the file name before the first ``_``.

        An empty transcript file yields an empty ``txt`` string.

        Yields:
            dict with keys ``item_name``, ``wav_fn``, ``txt`` and ``spk_name``.

        NOTE(review): the example path below has three directory levels under
        ``LibriTTS`` while the glob only descends two — confirm which
        directory ``raw_data_dir`` is expected to point at.
        """
        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav'))
        for wav_fn in wav_fns:
            item_name = os.path.basename(wav_fn)[:-4]
            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
            # Only the first line is needed; the context manager closes the
            # file, so no explicit close() call is required.
            with open(txt_fn, 'r') as f:
                txt = f.readline()
            spk = item_name.split("_")[0]
            # Example:
            #
            # 'item_name': '103_1241_000000_000001'
            # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav'
            # 'txt': 'matthew Cuthbert is surprised'
            # 'spk_name': '103'
            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt, 'spk_name': spk}
# Script entry point: when this file is executed directly, build a
# LibrittsPreAlign instance and call its process() method.
if __name__ == "__main__":
    LibrittsPreAlign().process()
================================================
FILE: NeuralSeq/configs/tts/libritts/pwg.yaml
================================================
base_config: egs/egs_bases/tts/vocoder/pwg.yaml
raw_data_dir: 'data/raw/LibriTTS'
processed_data_dir: 'data/processed/libritts'
binary_data_dir: 'data/binary/libritts_wav'
generator_params:
kernel_size: 5
num_spk: 400
max_samples: 20480
================================================
FILE: NeuralSeq/configs/tts/lj/base_mel2wav.yaml
================================================
raw_data_dir: 'data/raw/LJSpeech-1.1'
processed_data_dir: 'data/processed/ljspeech'
binary_data_dir: 'data/binary/ljspeech_wav'
================================================
FILE: NeuralSeq/configs/tts/lj/base_text2mel.yaml
================================================
raw_data_dir: 'data/raw/LJSpeech-1.1'
processed_data_dir: 'data/processed/ljspeech'
binary_data_dir: 'data/binary/ljspeech'
pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
pitch_type: cwt
mel_loss: l1
num_test_samples: 20
test_ids: [ 68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
use_energy_embed: false
test_num: 523
valid_num: 348
================================================
FILE: NeuralSeq/configs/tts/lj/fs2.yaml
================================================
base_config:
- configs/tts/fs2.yaml
- configs/tts/lj/base_text2mel.yaml
================================================
FILE: NeuralSeq/configs/tts/lj/hifigan.yaml
================================================
base_config:
- configs/tts/hifigan.yaml
- configs/tts/lj/base_mel2wav.yaml
================================================
FILE: NeuralSeq/configs/tts/lj/pwg.yaml
================================================
base_config:
- configs/tts/pwg.yaml
- configs/tts/lj/base_mel2wav.yaml
================================================
FILE: NeuralSeq/configs/tts/pwg.yaml
================================================
base_config: configs/tts/base.yaml
task_cls: tasks.vocoder.pwg.PwgTask
binarization_args:
with_wav: true
with_spk_embed: false
with_align: false
test_input_dir: ''
###########
# train and eval
###########
max_samples: 25600
max_sentences: 5
max_eval_sentences: 1
max_updates: 1000000
val_check_interval: 2000
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
sampling_rate: 22050 # Sampling rate.
fft_size: 1024 # FFT size.
hop_size: 256 # Hop size.
win_length: null # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of dilated convolution.
layers: 30 # Number of residual block layers.
stacks: 3 # Number of stacks i.e., dilation cycles.
residual_channels: 64 # Number of channels in residual conv.
gate_channels: 128 # Number of channels in gated conv.
skip_channels: 64 # Number of channels in skip conv.
aux_channels: 80 # Number of channels for auxiliary feature conv.
# Must be the same as num_mels.
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
upsample_params: # Upsampling network parameters.
upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
use_pitch_embed: false
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
negative_slope: 0.2 # Alpha in LeakyReLU.
###########################################################
# STFT LOSS SETTING #
###########################################################
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann_window" # Window function for STFT-based loss
use_mel_loss: false
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
lr: 0.0001 # Generator's learning rate.
eps: 1.0e-6 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
step_size: 200000 # Generator's scheduler step size.
gamma: 0.5 # Generator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
lr: 0.00005 # Discriminator's learning rate.
eps: 1.0e-6 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
step_size: 200000 # Discriminator's scheduler step size.
gamma: 0.5 # Discriminator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
disc_start_steps: 40000 # Number of steps to start to train discriminator.
================================================
FILE: NeuralSeq/data_gen/tts/base_binarizer.py
================================================
import os
os.environ["OMP_NUM_THREADS"] = "1"
from utils.multiprocess_utils import chunked_multiprocess_run
import random
import traceback
import json
from resemblyzer import VoiceEncoder
from tqdm import tqdm
from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder
from utils.hparams import set_hparams, hparams
import numpy as np
from utils.indexed_datasets import IndexedDatasetBuilder
from vocoders.base_vocoder import VOCODERS
import pandas as pd
class BinarizationError(Exception):
    """Exception type declared for the binarization code; adds nothing to ``Exception``."""
class BaseBinarizer:
def __init__(self, processed_data_dir=None):
    """Load per-item metadata from one or more processed-data directories.

    Args:
        processed_data_dir: comma-separated list of directories, each holding
            a ``metadata_phone.csv``. Defaults to ``hparams['processed_data_dir']``.

    Fills the ``item2txt`` / ``item2ph`` / ``item2wavfn`` / ``item2tgfn`` /
    ``item2spk`` lookup tables and the sorted (optionally shuffled with a
    fixed seed) ``item_names`` list. When several directories are given,
    item and speaker names are prefixed with ``ds<index>_`` to keep them
    distinct.
    """
    if processed_data_dir is None:
        processed_data_dir = hparams['processed_data_dir']
    self.processed_data_dirs = processed_data_dir.split(",")
    self.binarization_args = hparams['binarization_args']
    self.pre_align_args = hparams['pre_align_args']
    self.forced_align = self.pre_align_args['forced_align']

    # Sub-directory holding the aligner's TextGrid output, if an aligner is configured.
    tg_dir = None
    for aligner, out_dir in (('mfa', 'mfa_outputs'), ('kaldi', 'kaldi_outputs')):
        if self.forced_align == aligner:
            tg_dir = out_dir

    self.item2txt = {}
    self.item2ph = {}
    self.item2wavfn = {}
    self.item2tgfn = {}
    self.item2spk = {}

    multi_dataset = len(self.processed_data_dirs) > 1
    for ds_id, data_dir in enumerate(self.processed_data_dirs):
        # Everything is read as strings; self.meta_df keeps the last frame loaded.
        self.meta_df = pd.read_csv(f"{data_dir}/metadata_phone.csv", dtype=str)
        for _, row in self.meta_df.iterrows():
            raw_item_name = row['item_name']
            item_name = f'ds{ds_id}_{raw_item_name}' if multi_dataset else raw_item_name
            self.item2txt[item_name] = row['txt']
            self.item2ph[item_name] = row['ph']
            # The wav path is rebuilt under <raw_data_dir>/wavs from the second
            # '_'-separated piece of the original file name.
            self.item2wavfn[item_name] = os.path.join(
                hparams['raw_data_dir'],
                'wavs',
                os.path.basename(row['wav_fn']).split('_')[1],
            )
            spk = row.get('spk', 'SPK1')
            self.item2spk[item_name] = f"ds{ds_id}_{spk}" if multi_dataset else spk
            if tg_dir is not None:
                self.item2tgfn[item_name] = f"{data_dir}/{tg_dir}/{raw_item_name}.TextGrid"

    self.item_names = sorted(self.item2txt)
    if self.binarization_args['shuffle']:
        # Fixed seed keeps the shuffled order reproducible between runs.
        random.seed(1234)
        random.shuffle(self.item_names)
@property
def train_item_names(self):
return self.item_names[hparams['test_num']+hparams['valid_num']:]
@property
def valid_item_names(self):
return self.item_names[0: hparams['test_num']+hparams['valid_num']] #
@property
def test_item_names(self):
return self.item_names[0: hparams['test_num']] # Audios for MOS testing are in 'test_ids'
def build_spk_map(self):
spk_map = set()
for item_name in self.item_names:
spk_name = self.item2spk[item_name]
spk_map.add(spk_name)
spk_map = {x: i for i, x in enumerate(sorted(list(spk_map)))}
assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
return spk_map
def item_name2spk_id(self, item_name):
return self.spk_map[self.item2spk[item_name]]
def _phone_encoder(self):
ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
ph_set = []
if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
for processed_data_dir in self.processed_data_dirs:
ph_set += [x.split(' ')[0] for x in open(f'{processed_data_dir}/dict.txt').readlines()]
ph_set = sorted(set(ph_set))
json.dump(ph_set, open(ph_set_fn, 'w'))
else:
ph_set = json.load(open(ph_set_fn, 'r'))
print("| phone set: ", ph_set)
return build_phone_encoder(hparams['binary_data_dir'])
def meta_data(self, prefix):
if prefix == 'valid':
item_names = self.valid_item_names
elif prefix == 'test':
item_names = self.test_item_names
else:
item_names = self.train_item_names
for item_name in item_names:
ph = self.item2ph[item_name]
txt = self.item2txt[item_name]
tg_fn = self.item2tgfn.get(item_name)
wav_fn = self.item2wavfn[item_name]
spk_id = self.item_name2spk_id(item_name)
yield item_name, ph, txt, tg_fn, wav_fn, spk_id
def process(self):
os.makedirs(hparams['binary_data_dir'], exist_ok=True)
self.spk_map = self.build_spk_map()
print("| spk_map: ", self.spk_map)
spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
json.dump(self.spk_map, open(spk_map_fn, 'w'))
self.phone_encoder = self._phone_encoder()
self.process_data('valid')
self.process_data('test')
self.process_data('train')
def process_data(self, prefix):
data_dir = hparams['binary_data_dir']
args = []
builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
lengths = []
f0s = []
total_sec = 0
if self.binarization_args['with_spk_embed']:
voice_encoder = VoiceEncoder().cuda()
meta_data = list(self.meta_data(prefix))
for m in meta_data:
args.append(list(m) + [self.phone_encoder, self.binarization_args])
num_workers = int(os.getenv('N_PROC', os.cpu_count() // 3))
for f_id, (_, item) in enumerate(
zip(tqdm(meta_data), chunked_multiprocess_run(self.process_item, args, num_workers=num_workers))):
if item is None:
continue
item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
if self.binarization_args['with_spk_embed'] else None
if not self.binarization_args['with_wav'] and 'wav' in item:
print("del wav")
del item['wav']
builder.add_item(item)
lengths.append(item['len'])
total_sec += item['sec']
if item.get('f0') is not None:
f0s.append(item['f0'])
builder.finalize()
np.save(f'{data_dir}/{prefix}_lengths.npy', lengths)
if len(f0s) > 0:
f0s = np.concatenate(f0s, 0)
f0s = f0s[f0s != 0]
np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
print(f"| {prefix} total duration: {total_sec:.3f}s")
@classmethod
def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
if hparams['vocoder'] in VOCODERS:
wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
else:
wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
res = {
'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
}
try:
if binarization_args['with_f0']:
cls.get_pitch(wav, mel, res)
if binarization_args['with_f0cwt']:
cls.get_f0cwt(res['f0'], res)
if binarization_args['with_txt']:
try:
phone_encoded = res['phone'] = encoder.encode(ph)
except:
traceback.print_exc()
raise BinarizationError(f"Empty phoneme")
if binarization_args['with_align']:
cls.get_align(tg_fn, ph, mel, phone_encoded, res)
except BinarizationError as e:
print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
return None
return res
@staticmethod
def get_align(tg_fn, ph, mel, phone_encoded, res):
if tg_fn is not None and os.path.exists(tg_fn):
mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
else:
raise BinarizationError(f"Align not found")
if mel2ph.max() - 1 >= len(phone_encoded):
raise BinarizationError(
f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
res['mel2ph'] = mel2ph
res['dur'] = dur
@staticmethod
def get_pitch(wav, mel, res):
f0, pitch_coarse = get_pitch(wav, mel, hparams)
if sum(f0) == 0:
raise BinarizationError("Empty f0")
res['f0'] = f0
res['pitch'] = pitch_coarse
@staticmethod
def get_f0cwt(f0, res):
from utils.cwt import get_cont_lf0, get_lf0_cwt
uv, cont_lf0_lpf = get_cont_lf0(f0)
logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
if np.any(np.isnan(Wavelet_lf0)):
raise BinarizationError("NaN CWT")
res['cwt_spec'] = Wavelet_lf0
res['cwt_scales'] = scales
res['f0_mean'] = logf0s_mean_org
res['f0_std'] = logf0s_std_org
if __name__ == "__main__":
    # Script entry point: load the hyper-parameters first, then binarize.
    set_hparams()
    binarizer = BaseBinarizer()
    binarizer.process()
================================================
FILE: NeuralSeq/data_gen/tts/base_binarizer_emotion.py
================================================
import os
os.environ["OMP_NUM_THREADS"] = "1"
import torch
from collections import Counter
from utils.text_encoder import TokenTextEncoder
from data_gen.tts.emotion import inference as EmotionEncoder
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
from data_gen.tts.emotion.inference import preprocess_wav
from utils.multiprocess_utils import chunked_multiprocess_run
import random
import traceback
import json
from resemblyzer import VoiceEncoder
from tqdm import tqdm
from data_gen.tts.data_gen_utils import get_mel2ph, get_pitch, build_phone_encoder, is_sil_phoneme
from utils.hparams import hparams, set_hparams
import numpy as np
from utils.indexed_datasets import IndexedDatasetBuilder
from vocoders.base_vocoder import get_vocoder_cls
import pandas as pd
class BinarizationError(Exception):
    """Signals that a single item cannot be binarized and has to be skipped."""
class EmotionBinarizer:
    """Binarize TTS data together with speaker ids, emotion ids and emotion embeddings.

    :meth:`process` reads ``metadata_phone.csv`` of every processed data
    directory and writes the ``valid``, ``test`` and ``train`` splits to
    ``hparams['binary_data_dir']``.
    """

    def __init__(self, processed_data_dir=None):
        """Store the configuration; metadata is loaded later by :meth:`load_meta_data`.

        :param processed_data_dir: comma-separated processed data directories;
            defaults to ``hparams['processed_data_dir']``.
        """
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2tgfn = {}
        self.item2spk = {}
        self.item2emo = {}

    def load_meta_data(self):
        """Fill the ``item2*`` lookup tables and the (optionally shuffled) item list."""
        # With several datasets, item and speaker names get a ``ds<idx>_`` prefix.
        multi_ds = len(self.processed_data_dirs) > 1
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            self.meta_df = pd.read_csv(f"{processed_data_dir}/metadata_phone.csv", dtype=str)
            for r_idx, r in tqdm(self.meta_df.iterrows(), desc='Loading meta data.'):
                item_name = raw_item_name = r['item_name']
                if multi_ds:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = r['txt']
                self.item2ph[item_name] = r['ph']
                self.item2wavfn[item_name] = r['wav_fn']
                self.item2spk[item_name] = r.get('spk_name', 'SPK1') \
                    if self.binarization_args['with_spk_id'] else 'SPK1'
                if multi_ds:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                self.item2tgfn[item_name] = f"{processed_data_dir}/mfa_outputs/{raw_item_name}.TextGrid"
                # The emotion label comes from the 'others' column. The default
                # keeps its embedded double quotes (presumably matching how that
                # column is serialized -- verify against the pre-processing step).
                self.item2emo[item_name] = r.get('others', '"Neutral"')
        self.item_names = sorted(self.item2txt)
        if self.binarization_args['shuffle']:
            # Fixed seed keeps the split reproducible.
            random.seed(1234)
            random.shuffle(self.item_names)

    @property
    def train_item_names(self):
        """Items after the first ``test_num`` entries."""
        return self.item_names[hparams['test_num']:]

    @property
    def valid_item_names(self):
        """The first ``test_num`` items."""
        return self.item_names[:hparams['test_num']]

    @property
    def test_item_names(self):
        """Same items as the validation split."""
        return self.valid_item_names

    def build_spk_map(self):
        """Return ``{speaker name: index}`` with indices following sorted name order.

        :raises AssertionError: if there are more speakers than ``hparams['num_spk']``.
        """
        spk_names = {self.item2spk[item_name] for item_name in self.item_names}
        spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
        print("| #Spk: ", len(spk_map))
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        return spk_map

    def build_emo_map(self):
        """Return ``{emotion name: index}`` with indices following sorted name order."""
        emo_names = {self.item2emo[item_name] for item_name in self.item_names}
        emo_map = {x: i for i, x in enumerate(sorted(emo_names))}
        print("| #Emo: ", len(emo_map))
        return emo_map

    def item_name2spk_id(self, item_name):
        """Return the speaker index of ``item_name`` (needs ``self.spk_map``)."""
        return self.spk_map[self.item2spk[item_name]]

    def item_name2emo_id(self, item_name):
        """Return the emotion index of ``item_name`` (needs ``self.emo_map``)."""
        return self.emo_map[self.item2emo[item_name]]

    def _phone_encoder(self):
        """Build (or reuse) ``phone_set.json`` and return the phone encoder.

        The set is rebuilt from the phoneme strings of all items when
        ``reset_phone_dict`` is set or the file is missing.
        """
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if self.binarization_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            # Close the file explicitly: build_phone_encoder() below is given the
            # directory this file lives in, so the content must be flushed first.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f)
            print("| Build phone set: ", ph_set)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    def _word_encoder(self):
        """Build (or reuse) ``word_set.json`` and return the word encoder.

        The vocabulary holds the ``hparams['word_size']`` most frequent words.
        It is rebuilt when ``reset_word_dict`` is set or, like the phone set,
        when the file does not exist yet.
        """
        fn = f"{hparams['binary_data_dir']}/word_set.json"
        word_set = []
        if self.binarization_args['reset_word_dict'] or not os.path.exists(fn):
            for word_sent in self.item2txt.values():
                word_set += [x for x in word_sent.split(' ') if x != '']
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            word_set = word_set.most_common(hparams['word_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = [x[0] for x in word_set]
            with open(fn, 'w') as f:
                json.dump(word_set, f)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            with open(fn, 'r') as f:
                word_set = json.load(f)
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return TokenTextEncoder(None, vocab_list=word_set, replace_oov='<UNK>')

    def meta_data(self, prefix):
        """Yield ``(item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion)`` for one split.

        :param prefix: ``'valid'`` or ``'test'``; any other value selects the
            training split.
        """
        if prefix == 'valid':
            item_names = self.valid_item_names
        elif prefix == 'test':
            item_names = self.test_item_names
        else:
            item_names = self.train_item_names
        for item_name in item_names:
            ph = self.item2ph[item_name]
            txt = self.item2txt[item_name]
            tg_fn = self.item2tgfn.get(item_name)
            wav_fn = self.item2wavfn[item_name]
            spk_id = self.item_name2spk_id(item_name)
            emotion = self.item_name2emo_id(item_name)
            yield item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion

    def process(self):
        """Load the metadata, write the maps/dictionaries and binarize all splits."""
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        with open(spk_map_fn, 'w') as f:
            json.dump(self.spk_map, f)

        self.emo_map = self.build_emo_map()
        print("| emo_map: ", self.emo_map)
        emo_map_fn = f"{hparams['binary_data_dir']}/emo_map.json"
        with open(emo_map_fn, 'w') as f:
            json.dump(self.emo_map, f)

        self.phone_encoder = self._phone_encoder()
        self.word_encoder = None
        EmotionEncoder.load_model(hparams['emotion_encoder_path'])

        if self.binarization_args['with_word']:
            self.word_encoder = self._word_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def process_data(self, prefix):
        """Binarize one split and store it together with its length/f0 statistics.

        :param prefix: split name, used for :meth:`meta_data` and as file prefix.
        """
        data_dir = hparams['binary_data_dir']
        args = []
        builder = IndexedDatasetBuilder(f'{data_dir}/{prefix}')
        ph_lengths = []
        mel_lengths = []
        f0s = []
        total_sec = 0
        if self.binarization_args['with_spk_embed']:
            voice_encoder = VoiceEncoder().cuda()

        meta_data = list(self.meta_data(prefix))
        for m in meta_data:
            args.append(list(m) + [(self.phone_encoder, self.word_encoder), self.binarization_args])
        num_workers = self.num_workers
        # tqdm over meta_data only drives the progress bar; the items come from the workers.
        for _, (_, item) in zip(
                tqdm(meta_data),
                chunked_multiprocess_run(self.process_item, args, num_workers=num_workers)):
            if item is None:
                continue
            item['spk_embed'] = voice_encoder.embed_utterance(item['wav']) \
                if self.binarization_args['with_spk_embed'] else None
            # The emotion embedding is computed in this process from the wav file.
            processed_wav = preprocess_wav(item['wav_fn'])
            item['emo_embed'] = Embed_utterance(processed_wav)
            if not self.binarization_args['with_wav'] and 'wav' in item:
                del item['wav']
            builder.add_item(item)
            mel_lengths.append(item['len'])
            if 'ph_len' in item:
                ph_lengths.append(item['ph_len'])
            total_sec += item['sec']
            if item.get('f0') is not None:
                f0s.append(item['f0'])
        builder.finalize()
        np.save(f'{data_dir}/{prefix}_lengths.npy', mel_lengths)
        if len(ph_lengths) > 0:
            np.save(f'{data_dir}/{prefix}_ph_lengths.npy', ph_lengths)
        if len(f0s) > 0:
            f0s = np.concatenate(f0s, 0)
            # Zero f0 values are excluded from the statistics.
            f0s = f0s[f0s != 0]
            np.save(f'{data_dir}/{prefix}_f0s_mean_std.npy', [np.mean(f0s).item(), np.std(f0s).item()])
        print(f"| {prefix} total duration: {total_sec:.3f}s")

    @staticmethod
    def _trim_edges(seq, head, tail):
        """Return ``seq`` without its first ``head`` and last ``tail`` entries.

        An explicit end index is used because ``seq[head:-tail]`` is empty when
        ``tail`` is 0.
        """
        return seq[head:max(len(seq) - tail, 0)]

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emotion, encoder, binarization_args):
        """Compute all features of one item.

        :param encoder: tuple ``(phone_encoder, word_encoder)``
        :param binarization_args: dict with the ``with_*`` / ``trim_eos_bos`` switches
        :return: the feature dict, or ``None`` when processing failed and the
            item is skipped
        """
        res = {'item_name': item_name, 'txt': txt, 'ph': ph, 'wav_fn': wav_fn, 'spk_id': spk_id, 'emotion': emotion}
        if binarization_args['with_linear']:
            wav, mel, linear_stft = get_vocoder_cls(hparams).wav2spec(wav_fn)  # , return_linear=True
            res['linear'] = linear_stft
        else:
            wav, mel = get_vocoder_cls(hparams).wav2spec(wav_fn)
        wav = wav.astype(np.float16)
        res.update({'mel': mel, 'wav': wav,
                    'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]})
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(res)
                if binarization_args['with_f0cwt']:
                    cls.get_f0cwt(res)
            if binarization_args['with_txt']:
                ph_encoder, word_encoder = encoder
                try:
                    res['phone'] = ph_encoder.encode(ph)
                    res['ph_len'] = len(res['phone'])
                except Exception as err:
                    # ``Exception`` rather than a bare except, so that Ctrl-C and
                    # SystemExit are not turned into a skipped item.
                    traceback.print_exc()
                    raise BinarizationError("Empty phoneme") from err
                if binarization_args['with_align']:
                    cls.get_align(tg_fn, res)
                    if binarization_args['trim_eos_bos']:
                        # Cut the frames of the first and last duration entry
                        # (BOS/EOS) from every frame-level feature.
                        bos_dur = res['dur'][0]
                        eos_dur = res['dur'][-1]
                        res['mel'] = cls._trim_edges(mel, bos_dur, eos_dur)
                        res['f0'] = cls._trim_edges(res['f0'], bos_dur, eos_dur)
                        res['pitch'] = cls._trim_edges(res['pitch'], bos_dur, eos_dur)
                        res['mel2ph'] = cls._trim_edges(res['mel2ph'], bos_dur, eos_dur)
                        hop_size = hparams['hop_size']
                        res['wav'] = cls._trim_edges(wav, bos_dur * hop_size, eos_dur * hop_size)
                        res['dur'] = res['dur'][1:-1]
                        res['len'] = res['mel'].shape[0]
                if binarization_args['with_word']:
                    cls.get_word(res, word_encoder)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        except Exception as e:
            traceback.print_exc()
            print(f"| Skip item. item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res

    @staticmethod
    def get_align(tg_fn, res):
        """Store ``mel2ph`` and ``dur`` derived from the TextGrid in ``res``.

        :raises BinarizationError: if the TextGrid is missing or refers to more
            phones than ``res['phone']`` contains.
        """
        ph = res['ph']
        mel = res['mel']
        phone_encoded = res['phone']
        if tg_fn is not None and os.path.exists(tg_fn):
            mel2ph, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        else:
            raise BinarizationError("Align not found")
        # mel2ph entries are 1-based phone indices.
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(
                f"Align does not match: mel2ph.max() - 1: {mel2ph.max() - 1}, len(phone_encoded): {len(phone_encoded)}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur

    @staticmethod
    def get_pitch(res):
        """Store ``f0`` and the coarse ``pitch`` computed from ``res['wav']``/``res['mel']``.

        :raises BinarizationError: if the extracted f0 sums to zero.
        """
        wav, mel = res['wav'], res['mel']
        # Inside this method ``get_pitch`` resolves to the module-level helper
        # imported from data_gen_utils, not to this static method.
        f0, pitch_coarse = get_pitch(wav, mel, hparams)
        if sum(f0) == 0:
            raise BinarizationError("Empty f0")
        res['f0'] = f0
        res['pitch'] = pitch_coarse

    @staticmethod
    def get_f0cwt(res):
        """Store the wavelet transform of the normalized continuous log-f0 in ``res``.

        Reads ``res['f0']``; writes ``cwt_spec``, ``cwt_scales``, ``f0_mean``
        and ``f0_std``.

        :raises BinarizationError: if the transform contains NaN values.
        """
        from utils.cwt import get_cont_lf0, get_lf0_cwt
        f0 = res['f0']
        uv, cont_lf0_lpf = get_cont_lf0(f0)
        logf0s_mean_org, logf0s_std_org = np.mean(cont_lf0_lpf), np.std(cont_lf0_lpf)
        cont_lf0_lpf_norm = (cont_lf0_lpf - logf0s_mean_org) / logf0s_std_org
        Wavelet_lf0, scales = get_lf0_cwt(cont_lf0_lpf_norm)
        if np.any(np.isnan(Wavelet_lf0)):
            raise BinarizationError("NaN CWT")
        res['cwt_spec'] = Wavelet_lf0
        res['cwt_scales'] = scales
        res['f0_mean'] = logf0s_mean_org
        res['f0_std'] = logf0s_std_org

    @staticmethod
    def get_word(res, word_encoder):
        """Add word-level groupings of the phones and mel frames to ``res``.

        Writes ``ph_words``, ``ph2word``, ``mel2word``, ``dur_word``, ``words``
        and ``word_tokens``. Reads ``res['ph']``, ``res['mel2ph']`` and
        ``res['txt']``.

        :raises AssertionError: if the number of words in the text differs from
            the number of phone groups.
        """
        ph_split = res['ph'].split(" ")
        # ph side mapping to word
        ph_words = []  # ['<BOS>', 'N_AW1_', ',', 'AE1_Z_|', 'AO1_L_|', 'B_UH1_K_S_|', 'N_AA1_T_|', ....]
        ph2word = np.zeros([len(ph_split)], dtype=int)
        last_ph_idx_for_word = []  # [2, 11, ...]
        for i, ph in enumerate(ph_split):
            if ph == '|':
                # '|' closes the current word.
                last_ph_idx_for_word.append(i)
            elif not ph[0].isalnum():
                # A token starting with a non-alphanumeric character forms a word
                # of its own; except for '<BOS>' it also closes the word before it.
                if ph not in ['<BOS>']:
                    last_ph_idx_for_word.append(i - 1)
                last_ph_idx_for_word.append(i)
        start_ph_idx_for_word = [0] + [i + 1 for i in last_ph_idx_for_word[:-1]]
        for i, (s_w, e_w) in enumerate(zip(start_ph_idx_for_word, last_ph_idx_for_word)):
            ph_words.append(ph_split[s_w:e_w + 1])
            ph2word[s_w:e_w + 1] = i
        ph2word = ph2word.tolist()
        ph_words = ["_".join(w) for w in ph_words]

        # mel side mapping to word
        mel2word = []
        dur_word = [0 for _ in range(len(ph_words))]
        for m2p in res['mel2ph']:
            # mel2ph is 1-based, ph2word is still 0-based at this point.
            word_idx = ph2word[m2p - 1]
            mel2word.append(word_idx)
            dur_word[word_idx] += 1
        ph2word = [x + 1 for x in ph2word]  # 0 is reserved for padding
        mel2word = [x + 1 for x in mel2word]  # 0 is reserved for padding
        res['ph_words'] = ph_words  # [T_word]
        res['ph2word'] = ph2word  # [T_ph]
        res['mel2word'] = mel2word  # [T_mel]
        res['dur_word'] = dur_word  # [T_word]
        words = [x for x in res['txt'].split(" ") if x != '']
        # Strip silence tokens from both ends before adding the sentence markers.
        while len(words) > 0 and is_sil_phoneme(words[0]):
            words = words[1:]
        while len(words) > 0 and is_sil_phoneme(words[-1]):
            words = words[:-1]
        words = ['<BOS>'] + words + ['<EOS>']
        word_tokens = word_encoder.encode(" ".join(words))
        res['words'] = words
        res['word_tokens'] = word_tokens
        assert len(words) == len(ph_words), [words, ph_words]

    @property
    def num_workers(self):
        """Worker count: ``$N_PROC``, else ``hparams['N_PROC']``, else the CPU count."""
        return int(os.getenv('N_PROC', hparams.get('N_PROC', os.cpu_count())))
if __name__ == "__main__":
    # Script entry point: load the hyper-parameters first, then binarize.
    set_hparams()
    binarizer = EmotionBinarizer()
    binarizer.process()
================================================
FILE: NeuralSeq/data_gen/tts/base_preprocess.py
================================================
import json
import os
import random
import re
import traceback
from collections import Counter
from functools import partial
import pandas as pd
import librosa
from tqdm import tqdm
from data_gen.tts.txt_processors.base_text_processor import get_txt_processor_cls
from data_gen.tts.wav_processors.base_processor import get_wav_processor_cls
from utils.hparams import hparams
from utils.multiprocess_utils import multiprocess_run_tqdm
from utils.os_utils import link_file, move_file, remove_file
from data_gen.tts.data_gen_utils import is_sil_phoneme, build_token_encoder
class BasePreprocessor:
    """Base class of the dataset pre-processing pipeline.

    Subclasses implement :meth:`meta_data`; :meth:`process` then converts text
    to phonemes, runs the wav processors, builds the phone/word/speaker
    dictionaries, optionally prepares MFA inputs and writes the item list as
    JSON into ``hparams['processed_data_dir']``.
    """

    def __init__(self):
        self.preprocess_args = hparams['preprocess_args']
        txt_processor = self.preprocess_args['txt_processor']
        self.txt_processor = get_txt_processor_cls(txt_processor)
        self.raw_data_dir = hparams['raw_data_dir']
        self.processed_dir = hparams['processed_data_dir']
        self.spk_map_fn = f"{self.processed_dir}/spk_map.json"

    def meta_data(self):
        """
        :return: {'item_name': Str, 'wav_fn': Str, 'txt': Str, 'spk_name': Str, 'txt_loader': None or Func}
        """
        raise NotImplementedError

    def process(self):
        """Run the complete pre-processing and write ``<meta_csv_filename>.json``."""
        processed_dir = self.processed_dir
        wav_processed_tmp_dir = f'{processed_dir}/processed_tmp'
        remove_file(wav_processed_tmp_dir)
        os.makedirs(wav_processed_tmp_dir, exist_ok=True)
        wav_processed_dir = f'{processed_dir}/{self.wav_processed_dirname}'
        remove_file(wav_processed_dir)
        os.makedirs(wav_processed_dir, exist_ok=True)

        meta_data = list(tqdm(self.meta_data(), desc='Load meta data'))
        item_names = [d['item_name'] for d in meta_data]
        assert len(item_names) == len(set(item_names)), 'Key `item_name` should be Unique.'

        # preprocess data
        phone_list = []
        word_list = []
        spk_names = set()
        process_item = partial(self.preprocess_first_pass,
                               txt_processor=self.txt_processor,
                               wav_processed_dir=wav_processed_dir,
                               wav_processed_tmp=wav_processed_tmp_dir,
                               preprocess_args=self.preprocess_args)
        items = []
        args = [{
            'item_name': item_raw['item_name'],
            'txt_raw': item_raw['txt'],
            'wav_fn': item_raw['wav_fn'],
            'txt_loader': item_raw.get('txt_loader'),
            'others': item_raw.get('others', None)
        } for item_raw in meta_data]
        for item_, (item_id, item) in zip(meta_data, multiprocess_run_tqdm(process_item, args, desc='Preprocess')):
            # Items whose first pass failed come back as None and are dropped.
            if item is not None:
                item_.update(item)
                item = item_
                if 'txt_loader' in item:
                    del item['txt_loader']
                item['id'] = item_id
                item['spk_name'] = item.get('spk_name', '<SINGLE_SPK>')
                item['others'] = item.get('others', None)
                phone_list += item['ph'].split(" ")
                word_list += item['word'].split(" ")
                spk_names.add(item['spk_name'])
                items.append(item)

        # add encoded tokens
        ph_encoder, word_encoder = self._phone_encoder(phone_list), self._word_encoder(word_list)
        spk_map = self.build_spk_map(spk_names)
        args = [{
            'ph': item['ph'], 'word': item['word'], 'spk_name': item['spk_name'],
            'word_encoder': word_encoder, 'ph_encoder': ph_encoder, 'spk_map': spk_map
        } for item in items]
        for idx, item_new_kv in multiprocess_run_tqdm(self.preprocess_second_pass, args, desc='Add encoded tokens'):
            items[idx].update(item_new_kv)

        # build mfa data
        if self.preprocess_args['use_mfa']:
            mfa_dict = set()
            mfa_input_dir = f'{processed_dir}/mfa_inputs'
            remove_file(mfa_input_dir)
            # group MFA inputs for better parallelism
            mfa_groups = [i // self.preprocess_args['nsample_per_mfa_group'] for i in range(len(items))]
            if self.preprocess_args['mfa_group_shuffle']:
                random.seed(hparams['seed'])
                random.shuffle(mfa_groups)
            args = [{
                'item': item, 'mfa_input_dir': mfa_input_dir,
                'mfa_group': mfa_group, 'wav_processed_tmp': wav_processed_tmp_dir,
                'preprocess_args': self.preprocess_args
            } for item, mfa_group in zip(items, mfa_groups)]
            for i, (ph_gb_word_nosil, new_wav_align_fn) in multiprocess_run_tqdm(
                    self.build_mfa_inputs, args, desc='Build MFA data'):
                items[i]['wav_align_fn'] = new_wav_align_fn
                # One dictionary line per word: "<P1_P2_...> <P1 P2 ...>".
                for w in ph_gb_word_nosil.split(" "):
                    mfa_dict.add(f"{w} {w.replace('_', ' ')}")
            mfa_dict = sorted(mfa_dict)
            with open(f'{processed_dir}/mfa_dict.txt', 'w') as f:
                f.writelines([f'{l}\n' for l in mfa_dict])
        with open(f"{processed_dir}/{self.meta_csv_filename}.json", 'w') as f:
            # The regex pulls digits and closing brackets back onto the previous
            # line, which keeps the indented JSON compact.
            f.write(re.sub(r'\n\s+([\d+\]])', r'\1', json.dumps(items, ensure_ascii=False, sort_keys=False, indent=1)))
        remove_file(wav_processed_tmp_dir)

    @classmethod
    def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
                              wav_fn, wav_processed_dir, wav_processed_tmp,
                              preprocess_args, txt_loader=None, others=None):
        """Convert the text to phonemes and process/link the wav of one item.

        :return: dict with the new item fields, or ``None`` if any step failed
            (the traceback is printed).
        """
        try:
            if txt_loader is not None:
                txt_raw = txt_loader(txt_raw)
            ph, txt, word, ph2word, ph_gb_word = cls.txt_to_ph(txt_processor, txt_raw, preprocess_args)
            wav_fn, wav_align_fn = cls.process_wav(
                item_name, wav_fn,
                hparams['processed_data_dir'],
                wav_processed_tmp, preprocess_args)
            # wav for binarization
            ext = os.path.splitext(wav_fn)[1]
            os.makedirs(wav_processed_dir, exist_ok=True)
            new_wav_fn = f"{wav_processed_dir}/{item_name}{ext}"
            # Files produced in the temporary dir are moved, original files are linked.
            move_link_func = move_file if os.path.dirname(wav_fn) == wav_processed_tmp else link_file
            move_link_func(wav_fn, new_wav_fn)
            return {
                'txt': txt, 'txt_raw': txt_raw, 'ph': ph,
                'word': word, 'ph2word': ph2word, 'ph_gb_word': ph_gb_word,
                'wav_fn': new_wav_fn, 'wav_align_fn': wav_align_fn,
                'others': others
            }
        except Exception:
            # ``Exception`` rather than a bare except, so that Ctrl-C and
            # SystemExit still stop the worker.
            traceback.print_exc()
            print(f"| Error is caught. item_name: {item_name}.")
            return None

    @staticmethod
    def txt_to_ph(txt_processor, txt_raw, preprocess_args):
        """Run the text processor and flatten its word/phoneme structure.

        :return: ``(ph, txt, word, ph2word, ph_gb_word)`` where ``ph`` and
            ``word`` are space-joined strings, ``ph2word`` maps every phoneme to
            its 1-based word index and ``ph_gb_word`` joins the phonemes of each
            word with ``_``.
        """
        txt_struct, txt = txt_processor.process(txt_raw, preprocess_args)
        ph = [p for w in txt_struct for p in w[1]]
        ph_gb_word = ["_".join(w[1]) for w in txt_struct]
        words = [w[0] for w in txt_struct]
        # word_id=0 is reserved for padding
        ph2word = [w_id + 1 for w_id, w in enumerate(txt_struct) for _ in range(len(w[1]))]
        return " ".join(ph), txt, " ".join(words), ph2word, " ".join(ph_gb_word)

    @staticmethod
    def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, preprocess_args):
        """Apply the configured wav processors in sequence.

        :return: ``(wav_fn, wav_align_fn)``; both are the untouched input path
            when no processor is configured, and ``wav_align_fn`` equals
            ``wav_fn`` unless a processor returned a separate file for alignment.
        """
        processors = [get_wav_processor_cls(v) for v in preprocess_args['wav_processors']]
        processors = [k() for k in processors if k is not None]
        if len(processors) >= 1:
            sr_file = librosa.core.get_samplerate(wav_fn)
            output_fn_for_align = None
            ext = os.path.splitext(wav_fn)[1]
            input_fn = f"{wav_processed_tmp}/{item_name}{ext}"
            link_file(wav_fn, input_fn)
            for p in processors:
                outputs = p.process(input_fn, sr_file, wav_processed_tmp, processed_dir, item_name, preprocess_args)
                # A processor may additionally return a file to use for alignment.
                if len(outputs) == 3:
                    input_fn, sr, output_fn_for_align = outputs
                else:
                    input_fn, sr = outputs
            if output_fn_for_align is None:
                return input_fn, input_fn
            else:
                return input_fn, output_fn_for_align
        else:
            return wav_fn, wav_fn

    def _phone_encoder(self, ph_set):
        """Build (or reuse) ``phone_set.json`` and return the phone encoder.

        The file is rebuilt from ``ph_set`` when ``reset_phone_dict`` is set or
        the file is missing.
        """
        ph_set_fn = f"{self.processed_dir}/phone_set.json"
        if self.preprocess_args['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            ph_set = sorted(set(ph_set))
            # Close the file explicitly so build_token_encoder() reads complete content.
            with open(ph_set_fn, 'w') as f:
                json.dump(ph_set, f, ensure_ascii=False)
            print("| Build phone set: ", ph_set)
        else:
            with open(ph_set_fn, 'r') as f:
                ph_set = json.load(f)
            print("| Load phone set: ", ph_set)
        return build_token_encoder(ph_set_fn)

    def _word_encoder(self, word_set):
        """Build (or reuse) ``word_set.json`` and return the word encoder.

        The vocabulary holds ``<BOS>``, ``<EOS>`` and the
        ``hparams['word_dict_size']`` most frequent words. It is rebuilt when
        ``reset_word_dict`` is set or, like the phone set, when the file does
        not exist yet.
        """
        word_set_fn = f"{self.processed_dir}/word_set.json"
        if self.preprocess_args['reset_word_dict'] or not os.path.exists(word_set_fn):
            word_set = Counter(word_set)
            total_words = sum(word_set.values())
            word_set = word_set.most_common(hparams['word_dict_size'])
            num_unk_words = total_words - sum([x[1] for x in word_set])
            word_set = ['<BOS>', '<EOS>'] + [x[0] for x in word_set]
            word_set = sorted(set(word_set))
            with open(word_set_fn, 'w') as f:
                json.dump(word_set, f, ensure_ascii=False)
            print(f"| Build word set. Size: {len(word_set)}, #total words: {total_words},"
                  f" #unk_words: {num_unk_words}, word_set[:10]:, {word_set[:10]}.")
        else:
            with open(word_set_fn, 'r') as f:
                word_set = json.load(f)
            print("| Load word set. Size: ", len(word_set), word_set[:10])
        return build_token_encoder(word_set_fn)

    @classmethod
    def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_encoder, spk_map):
        """Encode the words and phonemes of one item and look up its speaker id."""
        word_token = word_encoder.encode(word)
        ph_token = ph_encoder.encode(ph)
        spk_id = spk_map[spk_name]
        return {'word_token': word_token, 'ph_token': ph_token, 'spk_id': spk_id}

    def build_spk_map(self, spk_names):
        """Return ``{speaker name: index}`` (sorted name order) and save it as JSON.

        :raises AssertionError: if there are more speakers than ``hparams['num_spk']``.
        """
        spk_map = {x: i for i, x in enumerate(sorted(spk_names))}
        assert len(spk_map) == 0 or len(spk_map) <= hparams['num_spk'], len(spk_map)
        print(f"| Number of spks: {len(spk_map)}, spk_map: {spk_map}")
        with open(self.spk_map_fn, 'w') as f:
            json.dump(spk_map, f, ensure_ascii=False)
        return spk_map

    @classmethod
    def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processed_tmp, preprocess_args):
        """Place the wav and a ``.lab`` transcript of one item into its MFA group dir.

        :return: ``(ph_gb_word_nosil, new_wav_align_fn)``: the transcript with
            silence phonemes removed and the path of the wav inside the group dir.
        """
        item_name = item['item_name']
        wav_align_fn = item['wav_align_fn']
        ph_gb_word = item['ph_gb_word']
        ext = os.path.splitext(wav_align_fn)[1]
        mfa_input_group_dir = f'{mfa_input_dir}/{mfa_group}'
        os.makedirs(mfa_input_group_dir, exist_ok=True)
        new_wav_align_fn = f"{mfa_input_group_dir}/{item_name}{ext}"
        # Files produced in the temporary dir are moved, original files are linked.
        move_link_func = move_file if os.path.dirname(wav_align_fn) == wav_processed_tmp else link_file
        move_link_func(wav_align_fn, new_wav_align_fn)
        # Drop silence words entirely and silence phonemes inside the other words.
        ph_gb_word_nosil = " ".join(["_".join([p for p in w.split("_") if not is_sil_phoneme(p)])
                                     for w in ph_gb_word.split(" ") if not is_sil_phoneme(w)])
        with open(f'{mfa_input_group_dir}/{item_name}.lab', 'w') as f_txt:
            f_txt.write(ph_gb_word_nosil)
        return ph_gb_word_nosil, new_wav_align_fn

    def load_spk_map(self, base_dir):
        """Load ``spk_map.json`` from ``base_dir``."""
        spk_map_fn = f"{base_dir}/spk_map.json"
        with open(spk_map_fn, 'r') as f:
            spk_map = json.load(f)
        return spk_map

    def load_dict(self, base_dir):
        """Return ``(ph_encoder, word_encoder)`` built from the JSON files in ``base_dir``."""
        ph_encoder = build_token_encoder(f'{base_dir}/phone_set.json')
        word_encoder = build_token_encoder(f'{base_dir}/word_set.json')
        return ph_encoder, word_encoder

    @property
    def meta_csv_filename(self):
        """Base name (without extension) of the metadata file written by :meth:`process`."""
        return 'metadata'

    @property
    def wav_processed_dirname(self):
        """Name of the sub-directory that receives the processed wav files."""
        return 'wav_processed'
================================================
FILE: NeuralSeq/data_gen/tts/binarizer_zh.py
================================================
import os
os.environ["OMP_NUM_THREADS"] = "1"
from data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.data_gen_utils import get_mel2ph
from utils.hparams import set_hparams, hparams
import numpy as np
class ZhBinarizer(BaseBinarizer):
    """Binarizer variant that post-processes the phone durations for Mandarin."""

    @staticmethod
    def get_align(tg_fn, ph, mel, phone_encoded, res):
        """Store adjusted ``dur`` and the ``mel2ph`` rebuilt from it in ``res``.

        Durations come from the TextGrid; separator tokens give frames to the
        preceding phone, and each initial/final pair is split evenly.

        :raises BinarizationError: if the TextGrid is missing or the rebuilt
            ``mel2ph`` refers to more phones than ``phone_encoded`` contains.
        """
        if tg_fn is None or not os.path.exists(tg_fn):
            raise BinarizationError(f"Align not found")
        _, dur = get_mel2ph(tg_fn, ph, mel, hparams)
        phones = ph.split(" ")
        assert len(dur) == len(phones)

        # The duration of a separator is handed over to the final before it.
        frame_bounds = np.pad(np.cumsum(dur), [1, 0], mode='constant', constant_values=0)
        for idx in range(len(dur)):
            first_char = phones[idx][0]
            if first_char == '<' or first_char.isalpha():
                continue
            # Count the leading frames of the separator that still carry pitch.
            unvoiced = res['f0'][frame_bounds[idx]:frame_bounds[idx + 1]] == 0
            n_voiced = 0
            while n_voiced < len(unvoiced) and not unvoiced[n_voiced]:
                n_voiced += 1
            dur[idx - 1] += n_voiced
            dur[idx] -= n_voiced
            # A remainder shorter than 100 frames is merged as well.
            if dur[idx] < 100:
                dur[idx - 1] += dur[idx]
                dur[idx] = 0

        # Initial and final get the same length.
        for idx in range(len(dur)):
            phone = phones[idx]
            if phone not in ALL_SHENMU:
                continue
            following = phones[idx + 1]
            splittable = dur[idx] > 0 and following[0].isalpha() and following not in ALL_SHENMU
            if not splittable:
                print(f"assert dur[i] > 0 and p_next[0].isalpha() and p_next not in ALL_SHENMU, "
                      f"dur[i]: {dur[idx]}, p: {phone}, p_next: {following}.")
                continue
            pair_total = dur[idx + 1] + dur[idx]
            dur[idx] = pair_total // 2
            dur[idx + 1] = pair_total - dur[idx]

        # Rebuild the 1-based frame-to-phone mapping from the adjusted durations.
        frame2phone = []
        for idx, n_frames in enumerate(dur):
            frame2phone.extend([idx + 1] * n_frames)
        mel2ph = np.array(frame2phone)
        if mel2ph.max() - 1 >= len(phone_encoded):
            raise BinarizationError(f"| Align does not match: {(mel2ph.max() - 1, len(phone_encoded))}")
        res['mel2ph'] = mel2ph
        res['dur'] = dur
if __name__ == "__main__":
    # Script entry point: load the hyper-parameters first, then binarize.
    set_hparams()
    binarizer = ZhBinarizer()
    binarizer.process()
================================================
FILE: NeuralSeq/data_gen/tts/data_gen_utils.py
================================================
import warnings
warnings.filterwarnings("ignore")
import parselmouth
import os
import torch
from skimage.transform import resize
from utils.text_encoder import TokenTextEncoder
from utils.pitch_utils import f0_to_coarse
import struct
import webrtcvad
from scipy.ndimage.morphology import binary_dilation
import librosa
import numpy as np
from utils import audio
import pyloudnorm as pyln
import re
import json
from collections import OrderedDict
# Punctuation characters (their use is outside this part of the file;
# presumably the marks treated as punctuation tokens -- confirm at the call sites).
PUNCS = '!,.?;:'
# Largest signed 16-bit value; scales float waveforms to int16 PCM for the VAD.
int16_max = (2 ** 15) - 1
def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
    """
    Load an audio file and detect its voiced regions with WebRTC VAD, so that
    segments without voice remain no longer than a threshold determined by the
    VAD parameters below.
    :param path: audio file to load
    :param sr: sample rate used for loading (forwarded to ``librosa.core.load``)
    :param return_raw_wav: return the complete waveform instead of the masked one
    :param norm: normalize the loudness to -20.0 and rescale if the peak exceeds 1.0
    :param vad_max_silence_length: Maximum number of consecutive silent frames a segment can have.
    :return: ``(wav, audio_mask, sr)``; ``wav`` is the loaded waveform restricted to
        ``audio_mask`` (length <= original wav length) unless ``return_raw_wav`` is set,
        ``audio_mask`` is a boolean mask with one entry per sample of the loaded waveform
    """

    ## Voice Activation Detection
    # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
    # This sets the granularity of the VAD. Should not need to be changed.
    sampling_rate = 16000
    wav_raw, sr = librosa.core.load(path, sr=sr)

    if norm:
        meter = pyln.Meter(sr)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav_raw)
        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
        if np.abs(wav_raw).max() > 1.0:
            wav_raw = wav_raw / np.abs(wav_raw).max()

    # The VAD runs on a 16 kHz copy. Keyword arguments are used because newer
    # librosa releases no longer accept the sample rates positionally.
    wav = librosa.resample(wav_raw, orig_sr=sr, target_sr=sampling_rate, res_type='kaiser_best')

    vad_window_length = 30  # In milliseconds
    # Number of frames to average together when performing the moving average smoothing.
    # The larger this value, the larger the VAD variations must be to not get smoothed out.
    vad_moving_average_width = 8

    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Byte offsets are twice the sample offsets (2 bytes per int16 sample).
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # The builtin ``bool`` replaces the ``np.bool`` alias, which NumPy has removed.
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    # Stretch the 16 kHz mask to the length of the waveform at the loading rate.
    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
    if return_raw_wav:
        return wav_raw, audio_mask, sr
    return wav_raw[audio_mask], audio_mask, sr
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-6,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg'):
    """Compute a log10-mel spectrogram (and optionally a linear one) for an utterance.

    :param wav_path: path to an audio file (str), or an already loaded waveform.
    :param fft_size: FFT size passed to ``librosa.stft`` and the mel filterbank.
    :param hop_size: hop length in samples.
    :param win_length: analysis window length in samples.
    :param window: window name passed to ``librosa.stft``.
    :param num_mels: number of mel bands.
    :param fmin: lowest mel frequency; ``-1`` means 0.
    :param fmax: highest mel frequency; ``-1`` means ``sample_rate / 2``.
    :param eps: floor applied to the mel energies before ``log10``.
    :param sample_rate: sampling rate used for loading and for the mel filterbank.
    :param loud_norm: if True, loudness-normalise to -22 and rescale if the peak exceeds 1.
    :param min_level_db: passed to ``audio.normalize`` for the linear spectrogram.
    :param return_linear: if True, also return the normalised linear spectrogram.
    :param trim_long_sil: if True (and ``wav_path`` is a str), load via ``trim_long_silences``.
    :param vocoder: only ``'pwg'`` is supported.
    :return: ``(wav, mel)`` or ``(wav, mel, spc)``; ``mel`` is ``(n_mel_bins, T)`` and
        ``wav`` is padded and cropped to ``T * hop_size`` samples.
    :raises AssertionError: if ``vocoder`` is not ``'pwg'``.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        if np.abs(wav).max() > 1:
            wav = wav / np.abs(wav).max()

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments work on both old and new librosa; recent releases
    # reject the positional form.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    if vocoder != 'pwg':
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError(f'"{vocoder}" is not in ["pwg"].')
    mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)

    # Pad the waveform the same way the STFT did, then crop it so that it
    # holds exactly hop_size samples per mel frame.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel

    spc = audio.amp_to_db(spc)
    spc = audio.normalize(spc, {'min_level_db': min_level_db})
    return wav, mel, spc
def get_pitch(wav_data, mel, hparams):
    """Extract an f0 contour with Praat (parselmouth) and align it to the mel frames.

    :param wav_data: [T]
    :param mel: [T, 80]
    :param hparams: dict providing ``hop_size`` (128 or 256) and ``audio_sample_rate``
    :return: ``(f0, pitch_coarse)`` where ``f0`` has ``len(mel)`` entries and
        ``pitch_coarse`` is ``f0_to_coarse(f0)``
    :raises AssertionError: for an unsupported hop size, or if the f0 and mel
        lengths differ by more than 8 frames after padding
    """
    time_step = hparams['hop_size'] / hparams['audio_sample_rate'] * 1000
    f0_min = 80
    f0_max = 750

    if hparams['hop_size'] == 128:
        pad_size = 4
    elif hparams['hop_size'] == 256:
        pad_size = 2
    else:
        assert False

    f0 = parselmouth.Sound(wav_data, hparams['audio_sample_rate']).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
    lpad = pad_size * 2
    # mel and f0 are extracted by 2 different libraries, so their lengths can
    # disagree. Some library versions return more f0 frames than expected, which
    # would make the right padding negative and crash np.pad; clamp it to zero
    # and let the truncation below drop the surplus frames.
    rpad = max(len(mel) - len(f0) - lpad, 0)
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')

    # Force f0 to have exactly the same length as mel; a large mismatch
    # indicates inconsistent extraction settings, so fail loudly.
    delta_l = len(mel) - len(f0)
    assert np.abs(delta_l) <= 8
    if delta_l > 0:
        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
    f0 = f0[:len(mel)]
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse
def remove_empty_lines(text):
    """Strip every line and drop all lines that are empty afterwards.

    :param text: non-empty list of strings (lines)
    :return: new list of stripped, non-empty lines
    :raises AssertionError: if ``text`` is empty or is not a list
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # list.remove("") only deletes the first match; filter so that *all*
    # blank lines are removed, as the function name promises.
    return [line for line in (t.strip() for t in text) if line != ""]
class TextGrid(object):
    """Line-by-line parser for a TextGrid given as a list of text lines.

    The header and every tier are read sequentially with regular expressions;
    ``line_count`` is the cursor into ``text``. Only ``IntervalTier`` tiers are
    supported. All extracted values except ``size`` are kept as strings.
    """

    def __init__(self, text):
        self.text = remove_empty_lines(text)
        self.line_count = 0
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Match ``pattern`` against the current line and advance the cursor.

        Parameters
        ----------
        pattern : regex whose first group is the value to extract
        inc : how many lines to advance after a successful match

        Returns
        -------
        group : the text captured by the first group

        Raises
        ------
        ValueError : if the current line does not match ``pattern``
        """
        current = self.text[self.line_count]
        found = re.match(pattern, current)
        if found is None:
            raise ValueError("File format error at line %d:%s" % (self.line_count, current))
        value = found.group(1)
        self.line_count += inc
        return value

    def _get_type(self):
        # Advance by 2 to also step over the line that follows the file type.
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        # Advance by 2 to also step over the line that follows xmax.
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        # Advance by 2 to also step over the line that follows the size.
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Only supports IntervalTier currently"""
        for _ in range(self.size):
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError("Only IntervalTier class is supported currently")
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)

            intervals = []
            for _ in range(int(tier_size)):
                interval = OrderedDict()
                interval["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
                interval["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
                interval["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
                interval["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
                intervals.append(interval)

            self.tier_list.append(OrderedDict([
                ("idx", tier_idx),
                ("class", tier_class),
                ("name", tier_name),
                ("xmin", tier_xmin),
                ("xmax", tier_xmax),
                ("size", tier_size),
                ("items", intervals),
            ]))

    def toJson(self):
        """Serialise the parsed header and tiers to an indented JSON string."""
        payload = OrderedDict([
            ("file_type", self.file_type),
            ("xmin", self.xmin),
            ("xmax", self.xmax),
            ("size", self.size),
            ("tiers", self.tier_list),
        ])
        return json.dumps(payload, ensure_ascii=False, indent=2)
def get_mel2ph(tg_fn, ph, mel, hparams):
    """Derive a frame-to-phoneme alignment and phoneme durations from a TextGrid.

    :param tg_fn: path to the TextGrid file; its last tier is used as the alignment
    :param ph: phoneme sequence as a single space-separated string
    :param mel: array whose first dimension is the number of frames
    :param hparams: dict providing ``audio_sample_rate`` and ``hop_size``
    :return: ``(mel2ph, dur)``; ``mel2ph[t]`` is the 1-based index of the phoneme
        covering frame ``t`` (0 where no phoneme was assigned) and ``dur[i]`` is
        the number of frames assigned to phoneme ``i``
    :raises AssertionError: if the TextGrid intervals and the phoneme list
        cannot be matched up
    """
    ph_list = ph.split(" ")
    with open(tg_fn, "r") as f:
        tg = f.readlines()
    tg = remove_empty_lines(tg)
    tg = TextGrid(tg)
    tg = json.loads(tg.toJson())
    # split[i] is the start time (seconds) of phoneme i; -1 marks "not yet set".
    # NOTE: the removed NumPy alias np.float is replaced by the builtin float.
    split = np.ones(len(ph_list) + 1, float) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = [x for x in tg['tiers'][-1]['items']]
    tg_align_ = []
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
            x['text'] = ''
            # Merge consecutive silence intervals into one.
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, tg_fn)
    # Walk both sequences in lockstep, letting silences on either side be skipped.
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            # Trailing silence phonemes with no interval left: push them to the end.
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        if x['text'] == '' and ph_idx == len(ph_list):
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (tg_len, ph_len, tg_align, ph_list, tg_fn)
        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                   or x['text'].lower() == ph.lower() \
                   or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            # A preceding silence phoneme that got no interval starts where this one starts.
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(ph_list[ph_idx - 1]):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (ph_idx, ph_list, len(ph_list), [x['text'] for x in tg_align], tg_fn)
    # np.int was removed from NumPy; int64 is also the index dtype that
    # torch's scatter_add requires below.
    mel2ph = np.zeros([mel.shape[0]], np.int64)
    split[0] = 0
    split[-1] = 1e8
    for i in range(len(split) - 1):
        assert split[i] != -1 and split[i] <= split[i + 1], (split[:-1],)
    # Convert boundary times (seconds) to frame indices, rounding to nearest.
    split = [int(s * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5) for s in split]
    for ph_idx in range(len(ph_list)):
        mel2ph[split[ph_idx]:split[ph_idx + 1]] = ph_idx + 1
    mel2ph_torch = torch.from_numpy(mel2ph)
    T_t = len(ph_list)
    # Count frames per phoneme index; slot 0 collects unassigned frames and is dropped.
    dur = mel2ph_torch.new_zeros([T_t + 1]).scatter_add(0, mel2ph_torch, torch.ones_like(mel2ph_torch))
    dur = dur[1:].numpy()
    return mel2ph, dur
def build_phone_encoder(data_dir):
    """Build a ``TokenTextEncoder`` from ``<data_dir>/phone_set.json``.

    Out-of-vocabulary tokens are replaced by ``','``.
    """
    phone_list_file = os.path.join(data_dir, 'phone_set.json')
    # Use a context manager so the file handle is closed deterministically.
    with open(phone_list_file) as f:
        phone_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
def build_word_encoder(data_dir):
    """Build a ``TokenTextEncoder`` from ``<data_dir>/word_set.json``.

    Out-of-vocabulary tokens are replaced by ``','``.
    """
    word_list_file = os.path.join(data_dir, 'word_set.json')
    # Use a context manager so the file handle is closed deterministically.
    with open(word_list_file) as f:
        word_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=word_list, replace_oov=',')
def is_sil_phoneme(p):
    """Return True if ``p`` is a silence/special token rather than a phoneme.

    A token counts as silence when it is empty or its first character is not
    alphabetic (e.g. ``'|'``, ``'<BOS>'``, punctuation).
    """
    # The empty string (e.g. produced by splitting on doubled spaces) used to
    # raise IndexError; treat it as silence instead.
    return p == '' or not p[0].isalpha()
def build_token_encoder(token_list_file):
    """Build a ``TokenTextEncoder`` from a JSON vocabulary file.

    Out-of-vocabulary tokens are replaced by ``'<UNK>'``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(token_list_file) as f:
        token_list = json.load(f)
    return TokenTextEncoder(None, vocab_list=token_list, replace_oov='<UNK>')
================================================
FILE: NeuralSeq/data_gen/tts/emotion/audio.py
================================================
from scipy.ndimage.morphology import binary_dilation
from data_gen.tts.emotion.params_data import *
from pathlib import Path
from typing import Optional, Union
import numpy as np
import webrtcvad
import librosa
import struct
int16_max = (2 ** 15) - 1
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored. If None for an in-memory waveform, no resampling is done.
    :return: the volume-normalized waveform with long silences trimmed
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed. Keyword arguments work on both old and new
    # librosa; recent releases reject the positional form.
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    wav = trim_long_silences(wav)

    return wav
def wav_to_mel_spectrogram(wav):
    """
    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.

    :param wav: the preprocessed waveform as a numpy array of floats
    :return: float32 array with frames along the first axis (the librosa output, transposed)
    """
    # Pass the signal and rate by keyword: recent librosa releases no longer
    # accept them positionally, while older ones accept both forms.
    frames = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=int(sampling_rate * mel_window_length / 1000),
        hop_length=int(sampling_rate * mel_window_step / 1000),
        n_mels=mel_n_channels
    )
    return frames.astype(np.float32).T
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params_data.py.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection; each sample is 2 bytes in pcm_wave
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # np.bool was removed from NumPy; the builtin bool is the equivalent dtype.
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    # Expand the per-window mask to a per-sample mask
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """Scale ``wav`` so that its mean power matches ``target_dBFS``.

    :param wav: the waveform as a numpy array of floats
    :param target_dBFS: desired level, compared against ``10 * log10(mean(wav ** 2))``
    :param increase_only: if True, return ``wav`` unchanged when it would be attenuated
    :param decrease_only: if True, return ``wav`` unchanged when it would be amplified
    :return: the rescaled waveform, or ``wav`` itself when no change is applied
    :raises ValueError: if both ``increase_only`` and ``decrease_only`` are set
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    # An empty or all-zero signal has no defined level: log10(0) is -inf and
    # the resulting gain would turn every sample into NaN. Leave it untouched.
    if np.size(wav) == 0:
        return wav
    power = np.mean(wav ** 2)
    if not power > 0:
        return wav
    dBFS_change = target_dBFS - 10 * np.log10(power)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    return wav * (10 ** (dBFS_change / 20))
================================================
FILE: NeuralSeq/data_gen/tts/emotion/inference.py
================================================
from data_gen.tts.emotion.params_data import *
from data_gen.tts.emotion.model import EmotionEncoder
from data_gen.tts.emotion.audio import preprocess_wav # We want to expose this function from here
from matplotlib import cm
from data_gen.tts.emotion import audio
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
_model = None # type: EmotionEncoder
_device = None # type: torch.device
def load_model(weights_fpath: Path, device=None):
    """
    Loads the model in memory and stores it, together with its device, in the
    module-level ``_model`` / ``_device`` globals. It must be called before
    embed_frames_batch(), which raises if no model has been loaded.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
    model will be loaded and will run on this device. Outputs will however always be on the cpu.
    If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    #   was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        # A torch.device instance was previously ignored, leaving _device unset.
        _device = device
    _model = EmotionEncoder(_device, torch.device("cpu"))
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    checkpoint = torch.load(weights_fpath, map_location=_device)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    print("Loaded encoder trained to step %d" % (checkpoint["step"]))
def is_loaded():
    """Tell whether a model is currently stored in the module-level ``_model``."""
    return not (_model is None)
def embed_frames_batch(frames_batch):
    """
    Run the loaded model on a batch of mel spectrograms.

    :param frames_batch: a batch of mel spectrograms as a numpy array of float32 of shape
    (batch_size, n_frames, n_channels)
    :return: the output of ``_model.inference`` for the batch, moved to the CPU and
    converted to a numpy array
    :raises Exception: if load_model() has not been called yet
    """
    if _model is None:
        raise Exception("Model was not loaded. Call load_model() before inference.")

    batch_tensor = torch.from_numpy(frames_batch).to(_device)
    model_output = _model.inference(batch_tensor)
    return model_output.detach().cpu().numpy()
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Work out how to cut an utterance into fixed-size partial utterances, returning matching
    slices for the waveform and for its mel spectrogram. The mel parameters are taken from
    params_data.py.

    The returned ranges may index past the end of the waveform; pad the waveform with zeros
    up to wave_slices[-1].stop before slicing.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: minimum fraction of the last partial that must be covered by real
    audio for it to be kept. The last partial is never dropped when it is the only one, so at
    least one slice is always returned.
    :param overlap: fraction by which consecutive partials overlap; 0 gives disjoint partials.
    :return: (wav_slices, mel_slices), two lists of slice objects of equal length
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    hop_samples = int((sampling_rate * mel_window_step / 1000))
    total_frames = int(np.ceil((n_samples + 1) / hop_samples))
    stride = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Enumerate the start frame of every partial
    mel_slices = []
    wav_slices = []
    limit = max(1, total_frames - partial_utterance_n_frames + stride + 1)
    for first_frame in range(0, limit, stride):
        frame_bounds = np.array([first_frame, first_frame + partial_utterance_n_frames])
        sample_bounds = frame_bounds * hop_samples
        mel_slices.append(slice(frame_bounds[0], frame_bounds[1]))
        wav_slices.append(slice(sample_bounds[0], sample_bounds[1]))

    # Drop the final partial when too little of it is backed by real audio
    tail = wav_slices[-1]
    tail_coverage = (n_samples - tail.start) / (tail.stop - tail.start)
    if len(mel_slices) > 1 and tail_coverage < min_pad_coverage:
        mel_slices.pop()
        wav_slices.pop()

    return wav_slices, mel_slices
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Compute the embedding of one utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, the utterance is cut into partial utterances, each is
    embedded, and the result is the L2-normalized mean of the partial embeddings. If False,
    the whole spectrogram is fed to the network at once.
    :param return_partials: if True, also return the partial embeddings and the wav slices
    they were computed from.
    :param kwargs: additional arguments forwarded to compute_partial_slices()
    :return: the embedding as a numpy array. If <return_partials> is True, a tuple
    (embedding, partial embeddings, wav slices) is returned instead; the last two are None
    when <using_partials> is False.
    """
    if not using_partials:
        # Whole-utterance path: a batch of one spectrogram
        full_frames = audio.wav_to_mel_spectrogram(wav)
        whole_embed = embed_frames_batch(full_frames[None, ...])[0]
        return (whole_embed, None, None) if return_partials else whole_embed

    # Decide where to cut, and zero-pad the waveform so the last cut fits
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    needed_length = wave_slices[-1].stop
    if needed_length >= len(wav):
        wav = np.pad(wav, (0, needed_length - len(wav)), "constant")

    # Embed every partial in one batch
    all_frames = audio.wav_to_mel_spectrogram(wav)
    partial_batch = np.array([all_frames[piece] for piece in mel_slices])
    partial_embeds = embed_frames_batch(partial_batch)

    # Average the partial embeddings and L2-normalize the mean
    mean_embed = np.mean(partial_embeds, axis=0)
    embed = mean_embed / np.linalg.norm(mean_embed, 2)

    if not return_partials:
        return embed
    return embed, partial_embeds, wave_slices
def embed_speaker(wavs, **kwargs):
    """Placeholder for speaker-level embedding; not implemented.

    :raises NotImplementedError: always
    """
    # ``NotImplemented`` is a non-callable constant, so ``raise NotImplemented()``
    # failed with a TypeError; NotImplementedError is the intended exception.
    raise NotImplementedError()
def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    """Draw an embedding vector as a 2-D heatmap with a colorbar.

    :param embed: the embedding as a numpy array
    :param ax: matplotlib axes to draw on; defaults to the current axes
    :param title: title set on the axes
    :param shape: shape the embedding is reshaped to; by default
        ``(int(sqrt(len(embed))), -1)``
    :param color_range: ``(vmin, vmax)`` limits of the color scale
    """
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    # plt.get_cmap() returns the default colormap and, unlike cm.get_cmap(),
    # is available in current Matplotlib releases.
    cmap = plt.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    # Colorbar.set_clim was removed from Matplotlib; set the limits on the
    # image itself so the colorbar picks them up.
    mappable.set_clim(*color_range)
    plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
================================================
FILE: NeuralSeq/data_gen/tts/emotion/model.py
================================================
from data_gen.tts.emotion.params_model import *
from data_gen.tts.emotion.params_data import *
from torch.nn.utils import clip_grad_norm_
from scipy.optimize import brentq
from torch import nn
import numpy as np
import torch
class EmotionEncoder(nn.Module):
    """LSTM encoder that maps mel spectrograms to fixed-size embeddings.

    Layer sizes come from params_model.py / params_data.py.
    """

    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # Network definition: LSTM stack -> linear projection -> ReLU
        lstm = nn.LSTM(input_size=mel_n_channels,
                       hidden_size=model_hidden_size,
                       num_layers=model_num_layers,
                       batch_first=True)
        self.lstm = lstm.to(device)
        projection = nn.Linear(in_features=model_hidden_size,
                               out_features=model_embedding_size)
        self.linear = projection.to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        # NOTE(review): .to() may hand back a plain tensor instead of the Parameter
        # when loss_device differs from the creation device - confirm before touching.
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        """Scale the similarity gradients by 0.01, then clip all gradients to an L2 norm of 3."""
        for scaled in (self.similarity_weight, self.similarity_bias):
            scaled.grad *= 0.01
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Embed a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state passed to the LSTM, or None
        :return: the L2-normalized embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Only the final hidden state of the last LSTM layer is used
        _, (hidden, _) = self.lstm(utterances, hidden_init)
        last_layer_state = hidden[-1]

        projected = self.relu(self.linear(last_layer_state))

        # L2-normalize each embedding
        norms = torch.norm(projected, dim=1, keepdim=True)
        return projected / norms

    def inference(self, utterances, hidden_init=None):
        """
        Return the raw final hidden state of the last LSTM layer for a batch of spectrograms
        (no projection, ReLU or normalization is applied).

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state passed to the LSTM, or None
        :return: the last layer's final hidden state, one row per utterance
        """
        _, (hidden, _) = self.lstm(utterances, hidden_init)
        return hidden[-1]
================================================
FILE: NeuralSeq/data_gen/tts/emotion/params_data.py
================================================
## Mel-filterbank
mel_window_length = 25 # In milliseconds; used to derive the FFT size
mel_window_step = 10 # In milliseconds; used to derive the hop length
mel_n_channels = 40 # Number of mel bands
## Audio
sampling_rate = 16000 # In Hz; audio is resampled to this rate before processing
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160 # 1600 ms at a 10 ms step
# Number of spectrogram frames at inference
inference_n_frames = 80 # 800 ms at a 10 ms step
## Voice Activity Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30 # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6
## Audio volume normalization
# Target level handed to normalize_volume() during preprocessing.
audio_norm_target_dBFS = -30
================================================
FILE: NeuralSeq/data_gen/tts/emotion/params_model.py
================================================
## Model parameters
# Hidden size of the encoder LSTM and input size of its linear projection.
model_hidden_size = 256
# Output size of the linear projection, i.e. the embedding dimension.
model_embedding_size = 256
# Number of stacked LSTM layers.
model_num_layers = 3
## Training parameters
# NOTE(review): not referenced by the code visible here - presumably consumed
# by a training script; confirm before changing.
learning_rate_init = 1e-4
speakers_per_batch = 6
utterances_per_speaker = 20
================================================
FILE: NeuralSeq/data_gen/tts/emotion/test_emotion.py
================================================
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Run inference for pre-processed data with a trained model.
"""
import logging
import math
import numpy, math, pdb, sys, random
import time, os, itertools, shutil, importlib
import argparse
import os
import sys
import glob
from sklearn import metrics
import soundfile as sf
#import sentencepiece as spm
import torch
import inference as encoder
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
from resemblyzer import VoiceEncoder, preprocess_wav
def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
    """Pick decision thresholds from a ROC curve and compute the equal error rate.

    :param scores: trial scores
    :param labels: ground-truth labels, with 1 as the positive class
    :param target_fa: iterable of target false-acceptance rates, in percent
    :param target_fr: optional iterable of target false-rejection rates, in percent
    :return: ``(tunedThreshold, eer, fpr, fnr)``; each tuned entry is
        ``[threshold, fpr, fnr]`` at the operating point closest to a target, with
        the ``target_fr`` entries first. Rates are in percent.
    """
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
    fnr = (1 - tpr) * 100
    fpr = fpr * 100

    def operating_point(target, rates):
        # Index of the ROC point whose rate is nearest to the requested target
        nearest = numpy.nanargmin(numpy.absolute((target - rates)))
        return [thresholds[nearest], fpr[nearest], fnr[nearest]]

    tuned = []
    if target_fr:
        tuned.extend(operating_point(tfr, fnr) for tfr in target_fr)
    tuned.extend(operating_point(tfa, fpr) for tfa in target_fa)

    # EER: the point where false-rejection and false-acceptance rates are closest
    crossing = numpy.nanargmin(numpy.absolute((fnr - fpr)))
    eer = max(fpr[crossing], fnr[crossing])

    return (tuned, eer, fpr, fnr)
def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
    """Read an audio file and cut it into fixed-length segments.

    :param filename: path of the audio file, read with ``soundfile``
    :param max_frames: segment length in frames; a segment holds
        ``max_frames * 160 + 240`` samples
    :param evalmode: if True, take ``num_eval`` evenly spaced segments; otherwise
        take a single segment at a random offset
    :param num_eval: number of segments in evaluation mode
    :return: a ``torch.FloatTensor`` with the segments stacked along the first axis. When
        ``evalmode`` is True and ``max_frames`` is 0, the whole (possibly padded)
        audio is returned as a single segment.
    """
    # Maximum audio length
    max_audio = max_frames * 160 + 240

    # Read wav file (the sample rate it reports is not needed here)
    audio, _ = sf.read(filename)

    audiosize = audio.shape[0]

    # Zero-pad short recordings on both sides so one full segment fits
    if audiosize <= max_audio:
        shortage = math.floor((max_audio - audiosize + 1) / 2)
        audio = numpy.pad(audio, (shortage, shortage), 'constant', constant_values=0)
        audiosize = audio.shape[0]

    if evalmode:
        startframe = numpy.linspace(0, audiosize - max_audio, num=num_eval)
    else:
        startframe = numpy.array([numpy.int64(random.random() * (audiosize - max_audio))])

    feats = []
    if evalmode and max_frames == 0:
        feats.append(audio)
    else:
        for asf in startframe:
            feats.append(audio[int(asf):int(asf) + max_audio])

    feat = numpy.stack(feats, axis=0)
    feat = torch.FloatTensor(feat)

    return feat
def evaluateFromList(listfilename, print_interval=100, test_path='', multi=False):
    """Embed every file named in a trial list and score each trial pair.

    Each line of the list is ``label file_a file_b``; when the label is missing a
    random 0/1 label is substituted. Embeddings come from the module-level
    ``voice_encoder`` and scoring runs on CUDA.

    :param listfilename: path of the trial list
    :param print_interval: progress is printed every this many items
    :param test_path: directory prepended to every file name
    :param multi: unused
    :return: ``(all_scores, all_labels, all_trials)``
    """
    trial_lines = []
    wav_names = []
    feats = {}
    tstart = time.time()

    ## Read all lines
    with open(listfilename) as listfile:
        for line in iter(listfile.readline, ''):
            fields = line.split()

            ## Append random label if missing
            if len(fields) == 2:
                fields = [random.randint(0,1)] + fields

            wav_names.extend((fields[1], fields[2]))
            trial_lines.append(line)

    unique_names = sorted(set(wav_names))

    ## Compute one embedding per distinct file
    for idx, wav_name in enumerate(unique_names):
        processed_wav = preprocess_wav(os.path.join(test_path,wav_name))
        embed = voice_encoder.embed_utterance(processed_wav)

        torch.cuda.empty_cache()
        ref_feat = torch.from_numpy(embed).unsqueeze(0)

        feats[wav_name] = ref_feat

        telapsed = time.time() - tstart

        if idx % print_interval == 0:
            sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d"%(idx,len(unique_names),idx/telapsed,ref_feat.size()[1]));

    print('')
    all_scores = []
    all_labels = []
    all_trials = []
    tstart = time.time()

    ## Score every trial
    for idx, line in enumerate(trial_lines):
        fields = line.split()

        ## Append random label if missing
        if len(fields) == 2:
            fields = [random.randint(0,1)] + fields

        ref_feat = feats[fields[1]].cuda()
        com_feat = feats[fields[2]].cuda()

        # normalize feats
        ref_feat = F.normalize(ref_feat, p=2, dim=1)
        com_feat = F.normalize(com_feat, p=2, dim=1)

        dist = F.pairwise_distance(ref_feat.unsqueeze(-1), com_feat.unsqueeze(-1)).detach().cpu().numpy()

        # Negate so that a smaller distance gives a higher score
        all_scores.append(-1 * numpy.mean(dist))
        all_labels.append(int(fields[0]))
        all_trials.append(fields[1]+" "+fields[2])

        if idx % print_interval == 0:
            telapsed = time.time() - tstart
            sys.stdout.write("\rComputing %d of %d: %.2f Hz"%(idx,len(trial_lines),idx/telapsed));
            sys.stdout.flush()

    print('\n')

    return (all_scores, all_labels, all_trials)
if __name__ == '__main__':
    # Command-line entry point: score a trial list and print the equal error rate.
    parser = argparse.ArgumentParser("baseline")
    for flag, help_text in (("--data_root", ""),
                            ("--list", ""),
                            ("--model_dir", "model parameters for AudioEncoder")):
        parser.add_argument(flag, type=str, help=help_text, required=True)
    args = parser.parse_args()

    # Load the models one by one.
    print("Preparing the encoder...")
    # NOTE: --model_dir is parsed but unused; loading the local encoder is disabled
    # and the resemblyzer VoiceEncoder is used instead.
    print("Insert the wav file name...")
    # evaluateFromList() reads this module-level name.
    voice_encoder = VoiceEncoder().cuda()

    sc, lab, trials = evaluateFromList(args.list, print_interval=100, test_path=args.data_root)
    result = tuneThresholdfromScore(sc, lab, [1, 0.1])
    print('EER %2.4f'%result[1])
================================================
FILE: NeuralSeq/data_gen/tts/txt_processors/__init__.py
================================================
from . import en
================================================
FILE: NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py
================================================
from data_gen.tts.data_gen_utils import is_sil_phoneme
REGISTERED_TEXT_PROCESSORS = {}
def register_txt_processors(name):
    """Class decorator factory: store the decorated class under ``name``.

    The class is recorded in ``REGISTERED_TEXT_PROCESSORS`` and returned unchanged.
    """
    def decorator(processor_cls):
        REGISTERED_TEXT_PROCESSORS[name] = processor_cls
        return processor_cls

    return decorator
def get_txt_processor_cls(name):
    """Look up the text processor class registered under ``name``; None if absent."""
    return REGISTERED_TEXT_PROCESSORS.get(name)
class BaseTxtProcessor:
    """Base class for text processors.

    A ``txt_struct`` is a list of ``[word, [phonemes...]]`` entries.
    """

    @staticmethod
    def sp_phonemes():
        """Return the list of special phoneme symbols."""
        return ['|']

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert raw text; must be provided by subclasses."""
        raise NotImplementedError

    @classmethod
    def postprocess(cls, txt_struct, preprocess_args):
        """Trim silence entries at both ends, then optionally add separators and BOS/EOS.

        :param txt_struct: list of ``[word, [phonemes...]]`` entries
        :param preprocess_args: dict with the boolean keys ``with_phsep`` and ``add_eos_bos``
        :return: the processed structure
        """
        # Locate the first and last entries whose word is not a silence token
        start = 0
        stop = len(txt_struct)
        while start < stop and is_sil_phoneme(txt_struct[start][0]):
            start += 1
        while stop > start and is_sil_phoneme(txt_struct[stop - 1][0]):
            stop -= 1
        if start > 0 or stop < len(txt_struct):
            txt_struct = txt_struct[start:stop]

        if preprocess_args['with_phsep']:
            txt_struct = cls.add_bdr(txt_struct)
        if preprocess_args['add_eos_bos']:
            bos_entry = ["<BOS>", ["<BOS>"]]
            eos_entry = ["<EOS>", ["<EOS>"]]
            txt_struct = [bos_entry] + txt_struct + [eos_entry]
        return txt_struct

    @classmethod
    def add_bdr(cls, txt_struct):
        """Insert a ``'|'`` entry between every two adjacent non-silence entries."""
        with_boundaries = []
        last_pos = len(txt_struct) - 1
        for pos, entry in enumerate(txt_struct):
            with_boundaries.append(entry)
            if pos == last_pos:
                continue
            if is_sil_phoneme(entry[0]) or is_sil_phoneme(txt_struct[pos + 1][0]):
                continue
            with_boundaries.append(['|', ['|']])
        return with_boundaries
================================================
FILE: NeuralSeq/data_gen/tts/txt_processors/en.py
================================================
import re
import unicodedata
from g2p_en import G2p
from g2p_en.expand import normalize_numbers
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor, register_txt_processors
from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
class EnG2p(G2p):
    """English grapheme-to-phoneme converter that tokenizes with ``TweetTokenizer``."""

    word_tokenize = TweetTokenizer().tokenize

    def __call__(self, text):
        """Convert ``text`` to a flat list of phonemes, with ``" "`` between words.

        Tokens containing no lowercase letter are passed through unchanged.
        """
        # preprocessing: tokenize, then tag -> tuples of (word, tag)
        tagged = pos_tag(EnG2p.word_tokenize(text))

        phonemes = []
        for token, tag in tagged:
            if re.search("[a-z]", token) is None:
                # nothing to pronounce: keep the token itself
                pron = [token]
            elif token in self.homograph2features:
                # homograph: choose the pronunciation from the POS tag
                first_pron, second_pron, first_pos = self.homograph2features[token]
                pron = first_pron if tag.startswith(first_pos) else second_pron
            elif token in self.cmu:
                # lookup CMU dict
                pron = self.cmu[token][0]
            else:
                # predict for oov
                pron = self.predict(token)
            phonemes.extend(pron)
            phonemes.append(" ")

        # drop the separator added after the last word
        return phonemes[:-1]
@register_txt_processors('en')
class TxtProcessor(BaseTxtProcessor):
    """English text processor, registered under the name ``'en'``."""

    g2p = EnG2p()

    @staticmethod
    def preprocess_text(text):
        """Normalize raw English text before grapheme-to-phoneme conversion.

        Expands numbers, strips accents, lowercases, removes characters other than
        ``a-z``, space and ``PUNCS``, and leaves every punctuation mark surrounded by
        single spaces.
        """
        text = normalize_numbers(text)
        # Strip accents: decompose, then drop the combining marks
        decomposed = unicodedata.normalize('NFD', text)
        text = ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')
        text = text.lower()

        # Remove quotes and brackets, turn hyphens into spaces, drop everything else
        text = re.sub("[\'\"()]+", "", text)
        text = re.sub("[-]+", " ", text)
        text = re.sub(f"[^ a-z{PUNCS}]", "", text)
        # Remove spaces around punctuation, then collapse repeated punctuation: !! -> !
        text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)
        text = re.sub(f"([{PUNCS}])+", r"\1", text)

        # Expand abbreviations (a second identical "i.e." replacement would be a no-op)
        text = text.replace("i.e.", "that is")
        text = text.replace("etc.", "etc")

        # Surround punctuation with spaces, then squeeze whitespace runs
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        text = re.sub(rf"\s+", r" ", text)
        return text

    @classmethod
    def process(cls, txt, preprocess_args):
        """Convert text to a ``[word, [phonemes...]]`` structure.

        :return: ``(txt_struct, txt)`` where ``txt`` is the normalized text
        """
        txt = cls.preprocess_text(txt).strip()
        phoneme_stream = cls.g2p(txt)

        # One entry per space-separated word; a " " in the stream moves to the next word
        txt_struct = [[word, []] for word in txt.split(" ")]
        word_pos = 0
        for symbol in phoneme_stream:
            if symbol != ' ':
                txt_struct[word_pos][1].append(symbol)
            else:
                word_pos += 1

        txt_struct = cls.postprocess(txt_struct, preprocess_args)
        return txt_struct, txt
================================================
FILE: NeuralSeq/data_gen/tts/txt_processors/zh.py
================================================
import re
import jieba
from pypinyin import pinyin, Style
from data_gen.tts.data_gen_utils import PUNCS
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
from utils.text_norm import NSWNormalizer
class TxtProcessor(BaseTxtProcessor):
    """Chinese text processor producing initial/final phoneme sequences."""

    # Translation table from the characters of the first string to the
    # characters at the same position in the second string.
    table = str.maketrans(
        u':,。!?【】()%#@&1234567890',
        u':,.!?[]()%#@&1234567890')

    @staticmethod
    def preprocess_text(text):
        """Normalize ``text``; every run of Latin letters becomes a single ``$``."""
        text = text.translate(TxtProcessor.table)
        text = NSWNormalizer(text).normalize(remove_punc=False)
        substitutions = (
            ("[\'\"()]+", ""),
            ("[-]+", " "),
            (f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", ""),
            (f"([{PUNCS}])+", r"\1"),  # !! -> !
            (f"([{PUNCS}])", r" \1 "),
            (rf"\s+", r""),
            (rf"[A-Za-z]+", r"$"),
        )
        for regex, repl in substitutions:
            text = re.sub(regex, repl, text)
        return text

    @classmethod
    def process(cls, txt, pre_align_args):
        """Return ``(phs, txt)``: the phoneme list and the normalized text.

        ``phs`` starts with ``"|"`` and a ``"|"`` follows each converted unit.
        """
        txt = cls.preprocess_text(txt)
        # https://blog.csdn.net/zhoulei124/article/details/89055403
        initials = pinyin(txt, style=Style.INITIALS)
        plain_finals = pinyin(txt, style=Style.FINALS)
        toned_finals = pinyin(txt, style=Style.FINALS_TONE3)
        if pre_align_args['use_tone']:
            finals = []
            for plain, toned in zip(plain_finals, toned_finals):
                # A toned final identical to the plain one carries no tone
                # digit; mark it with '5'.
                if toned[0] == plain[0]:
                    finals.append([toned[0] + '5'])
                else:
                    finals.append(toned)
        else:
            finals = plain_finals
        assert len(initials) == len(finals)
        phs = ["|"]
        for initial, final, plain in zip(initials, finals, plain_finals):
            phs.append(initial[0])
            # When initial and plain final coincide, emit the unit only once.
            if initial[0] != plain[0]:
                phs.append(final[0])
            phs.append("|")
        return phs, txt
================================================
FILE: NeuralSeq/data_gen/tts/txt_processors/zh_g2pM.py
================================================
import re
import jieba
from pypinyin import pinyin, Style
from data_gen.tts.data_gen_utils import PUNCS
from data_gen.tts.txt_processors import zh
from g2pM import G2pM
# Syllable prefixes ("shenmu") tried in list order by TxtProcessor.process when
# splitting a syllable; the two-letter entries come first so they are matched
# before the one-letter entries they start with.
ALL_SHENMU = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j',
'q', 'x', 'r', 'z', 'c', 's', 'y', 'w']
# Syllable remainders ("yunmu"). NOTE(review): not referenced by the code
# visible in this module — confirm whether other modules import it.
ALL_YUNMU = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia', 'ian',
'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong', 'ou',
'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've', 'vn']
class TxtProcessor(zh.TxtProcessor):
    """Chinese text processor using G2pM for pronunciation and jieba for words.

    The phoneme list produced by ``process`` uses ``'#'`` before the first
    character of a new jieba segment and ``'|'`` before every other character.
    """

    model = G2pM()

    @staticmethod
    def sp_phonemes():
        """Return the special boundary symbols used in the phoneme list."""
        return ['|', '#']

    @classmethod
    def process(cls, txt, pre_align_args):
        """Convert ``txt`` to a phoneme list with boundary markers.

        Args:
            txt: raw input text.
            pre_align_args: mapping; ``pre_align_args['use_tone']`` selects
                toned output.

        Returns:
            ``(ph_list, txt)`` where ``txt`` is the preprocessed text.

        Raises:
            AssertionError: if G2pM and jieba disagree on the character count.
        """
        txt = cls.preprocess_text(txt)
        ph_list = cls.model(txt, tone=pre_align_args['use_tone'], char_split=True)
        seg_list = '#'.join(jieba.cut(txt))
        assert len(ph_list) == len([s for s in seg_list if s != '#']), (ph_list, seg_list)

        # Insert the word-boundary marker '#'.
        ph_list_ = []
        seg_idx = 0
        for p in ph_list:
            p = p.replace("u:", "v")
            if seg_list[seg_idx] == '#':
                ph_list_.append('#')
                seg_idx += 1  # step over the '#' separator itself
            else:
                ph_list_.append("|")
            seg_idx += 1  # step over the character that belongs to `p`
            # The model left a Chinese character unconverted: use pypinyin.
            if re.findall('[\u4e00-\u9fff]', p):
                if pre_align_args['use_tone']:
                    p = pinyin(p, style=Style.TONE3, strict=True)[0][0]
                    if p[-1] not in ['1', '2', '3', '4', '5']:
                        p = p + '5'
                else:
                    p = pinyin(p, style=Style.NORMAL, strict=True)[0][0]

            finished = False
            # The original tested len([c.isalpha() for c in p]) > 1, which is
            # simply the length of p; spelled out here without the dead call.
            if len(p) > 1:
                for shenmu in ALL_SHENMU:
                    if not p.startswith(shenmu):
                        continue
                    # Slice the prefix off. str.lstrip(shenmu) would strip any
                    # leading characters from the *set* of letters in shenmu.
                    rest = p[len(shenmu):]
                    if not rest.isnumeric():
                        ph_list_ += [shenmu, rest]
                        finished = True
                        break
            if not finished:
                ph_list_.append(p)

        ph_list = ph_list_

        # Drop word-boundary markers next to silence symbols,
        # e.g. [..., '#', ',', '#', ...].
        sil_phonemes = list(PUNCS) + TxtProcessor.sp_phonemes()
        ph_list_ = []
        for i, ph in enumerate(ph_list):
            # A '#' is always followed by a phoneme and never comes first in the
            # list built above, so i - 1 and i + 1 stay in range.
            if ph != '#' or (ph_list[i - 1] not in sil_phonemes and ph_list[i + 1] not in sil_phonemes):
                ph_list_.append(ph)
        ph_list = ph_list_
        return ph_list, txt
if __name__ == '__main__':
    # Small manual demo: print the phoneme list for one sample sentence.
    demo_args = {'use_tone': True}
    result = TxtProcessor.process('他来到了,网易杭研大厦', demo_args)
    print(result[0])
================================================
FILE: NeuralSeq/data_gen/tts/wav_processors/__init__.py
================================================
from . import base_processor
from . import common_processors
================================================
FILE: NeuralSeq/data_gen/tts/wav_processors/base_processor.py
================================================
# Registry mapping a processor name to the class registered under it.
REGISTERED_WAV_PROCESSORS = {}


def register_wav_processors(name):
    """Return a class decorator that registers the class under ``name``.

    The decorated class is stored in ``REGISTERED_WAV_PROCESSORS`` and returned
    unchanged; a later registration with the same name replaces the earlier one.
    """

    def _register(processor_cls):
        REGISTERED_WAV_PROCESSORS.update({name: processor_cls})
        return processor_cls

    return _register


def get_wav_processor_cls(name):
    """Look up the class registered under ``name``; ``None`` if there is none."""
    if name in REGISTERED_WAV_PROCESSORS:
        return REGISTERED_WAV_PROCESSORS[name]
    return None
class BaseWavProcessor:
    """Base class for wav processors.

    Subclasses provide ``name`` and ``process``.
    """

    @property
    def name(self):
        """Short tag used in output file names; must be overridden."""
        raise NotImplementedError

    def output_fn(self, input_fn):
        """Return the output path for ``input_fn``: ``<stem>_<name>.wav``.

        The extension is removed with ``os.path.splitext`` rather than by
        cutting the last four characters, so extensions that are not exactly
        three letters long (e.g. ``.flac``) are handled correctly.
        """
        import os  # local import: this module has no top-level import block

        stem, _ = os.path.splitext(input_fn)
        return f'{stem}_{self.name}.wav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Process ``input_fn``; must be overridden by subclasses."""
        raise NotImplementedError
================================================
FILE: NeuralSeq/data_gen/tts/wav_processors/common_processors.py
================================================
import os
import subprocess
import librosa
import numpy as np
from data_gen.tts.wav_processors.base_processor import BaseWavProcessor, register_wav_processors
from data_gen.tts.data_gen_utils import trim_long_silences
from utils.audio import save_wav, rnnoise
from utils.hparams import hparams
@register_wav_processors(name='sox_to_wav')
class ConvertToWavProcessor(BaseWavProcessor):
    """Convert a non-wav input file to wav with the external ``sox`` tool."""

    @property
    def name(self):
        return 'ToWav'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(path, sr)``; ``path`` is ``input_fn`` if it already ends in ``.wav``.

        Raises:
            subprocess.CalledProcessError: if ``sox`` exits with a non-zero status.
        """
        if input_fn[-4:] == '.wav':
            return input_fn, sr
        output_fn = self.output_fn(input_fn)
        # Pass an argument list instead of a shell string so that file names
        # containing quotes, '$' or other shell metacharacters are handled
        # verbatim.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, '-t', 'wav', output_fn])
        return output_fn, sr
@register_wav_processors(name='sox_resample')
class ResampleProcessor(BaseWavProcessor):
    """Resample a file to the requested rate when its own rate differs."""

    @property
    def name(self):
        return 'Resample'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Return ``(path, sr)``; ``path`` is ``input_fn`` when no resampling is needed.

        Raises:
            subprocess.CalledProcessError: if ``sox`` exits with a non-zero status.
        """
        output_fn = self.output_fn(input_fn)
        sr_file = librosa.core.get_samplerate(input_fn)
        if sr == sr_file:
            return input_fn, sr
        # Argument list instead of a shell string: file names with quotes or
        # other shell metacharacters are passed through verbatim.
        subprocess.check_call(['sox', '-v', '0.95', input_fn, f'-r{sr}', output_fn])
        # NOTE(review): the audio is loaded from input_fn (not from the sox
        # output) and then saved to output_fn, so the file written by sox above
        # is overwritten. Kept as in the original — confirm which result is
        # intended.
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
@register_wav_processors(name='trim_sil')
class TrimSILProcessor(BaseWavProcessor):
    """Trim the audio with ``librosa.effects.trim`` and save the result."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio and return ``(output_fn, sr)``.

        The original returned the bare ``output_fn`` string, unlike every other
        processor in this module, which returns a ``(path, sr)`` pair.
        """
        output_fn = self.output_fn(input_fn)
        y, _ = librosa.core.load(input_fn, sr=sr)
        y, _ = librosa.effects.trim(y)
        save_wav(y, output_fn, sr)
        return output_fn, sr
@register_wav_processors(name='trim_all_sil')
class TrimAllSILProcessor(BaseWavProcessor):
    """Remove long silences via ``trim_long_silences``; optionally save the mask."""

    @property
    def name(self):
        return 'TrimSIL'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the trimmed audio and return ``(output_fn, sr)``.

        When ``preprocess_args['save_sil_mask']`` is truthy, the audio mask is
        also saved to ``<processed_dir>/sil_mask/<item_name>.npy``.
        """
        trimmed_fn = self.output_fn(input_fn)
        max_silence = preprocess_args.get('vad_max_silence_length', 12)
        trimmed, audio_mask, _ = trim_long_silences(
            input_fn, vad_max_silence_length=max_silence)
        save_wav(trimmed, trimmed_fn, sr)
        if preprocess_args['save_sil_mask']:
            mask_dir = f'{processed_dir}/sil_mask'
            os.makedirs(mask_dir, exist_ok=True)
            np.save(f'{mask_dir}/{item_name}.npy', audio_mask)
        return trimmed_fn, sr
@register_wav_processors(name='denoise')
class DenoiseProcessor(BaseWavProcessor):
    """Run ``rnnoise`` on the input file."""

    @property
    def name(self):
        return 'Denoise'

    def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, preprocess_args):
        """Write the ``rnnoise`` output and return ``(denoised path, sr)``."""
        denoised_fn = self.output_fn(input_fn)
        rnnoise(input_fn, denoised_fn, out_sample_rate=sr)
        result = (denoised_fn, sr)
        return result
================================================
FILE: NeuralSeq/egs/datasets/audio/emotion/base_text2mel.yaml
================================================
raw_data_dir: 'data/raw/ESD'
processed_data_dir: 'data/processed/emotion'
binary_data_dir: 'data/binary/emotion'
pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
audio_sample_rate: 16000
binarization_args:
shuffle: true
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
use_spk_id: true
test_num: 200
num_spk: 10
pitch_type: frame
min_frames: 128
num_test_samples: 30
mel_loss: "ssim:0.5|l1:0.5"
vocoder_ckpt: ''
use_emotion: true
================================================
FILE: NeuralSeq/egs/datasets/audio/emotion/pre_align.py
================================================
import os
from data_gen.tts.base_preprocess import BasePreprocessor
import glob
import re
class EmoPreAlign(BasePreprocessor):
    """Metadata reader for the emotion dataset under ``self.raw_data_dir``."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, txt, spk, emotion)`` for every transcript line.

        For each speaker, ``<raw_data_dir>/<spk>/<spk>.txt`` is read line by
        line. After whitespace runs are collapsed to single spaces, the first
        field is the item name, the second-to-last field is the emotion and the
        fields in between form the text.
        """
        spks = ['0012', '0011', '0013', '0014', '0015', '0016', '0017', '0018', '0019', '0020']
        pattern = re.compile('[\t\n ]+')
        for spk in spks:
            # `with` guarantees the transcript file is closed, including when
            # the generator is abandoned before it is exhausted.
            with open(f"{self.raw_data_dir}/{spk}/{spk}.txt", 'r') as f:
                for line in f:
                    line = re.sub(pattern, ' ', line)
                    if line == ' ':
                        continue  # blank line
                    split_ = line.split(' ')
                    # The trailing newline became ' ', so the last field is
                    # empty and the emotion sits at index -2 (assumes every line
                    # ends with a newline — TODO confirm for the final line).
                    txt = ' '.join(split_[1: -2])
                    item_name = split_[0]
                    emotion = split_[-2]
                    wav_fn = f'{self.raw_data_dir}/{spk}/{emotion}/{item_name}.wav'
                    yield item_name, wav_fn, txt, spk, emotion
if __name__ == "__main__":
    # Script entry point: run the preprocessing pipeline for this dataset.
    preprocessor = EmoPreAlign()
    preprocessor.process()
================================================
FILE: NeuralSeq/egs/datasets/audio/libritts/base_text2mel.yaml
================================================
raw_data_dir: 'data/raw/LibriTTS'
processed_data_dir: 'data/processed/libritts'
binary_data_dir: 'data/binary/libritts'
pre_align_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
binarization_args:
shuffle: true
use_spk_id: true
test_num: 200
num_spk: 2320
pitch_type: frame
min_frames: 128
num_test_samples: 30
mel_loss: "ssim:0.5|l1:0.5"
vocoder_ckpt: ''
================================================
FILE: NeuralSeq/egs/datasets/audio/libritts/fs2.yaml
================================================
base_config:
- egs/egs_bases/tts/fs2.yaml
- ./base_text2mel.yaml
================================================
FILE: NeuralSeq/egs/datasets/audio/libritts/pre_align.py
================================================
import os
from data_gen.tts.base_preprocess import BasePreprocessor
import glob
class LibrittsPreAlign(BasePreprocessor):
    """Metadata reader for LibriTTS-style data under ``self.raw_data_dir``."""

    def meta_data(self):
        """Yield one metadata dict per ``*.wav`` found three levels below the root.

        The text is the first line of the sibling ``<name>.normalized.txt``
        file; the speaker is the part of the item name before the first ``_``.
        """
        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*.wav'))
        for wav_fn in wav_fns:
            item_name = os.path.basename(wav_fn)[:-4]
            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
            # The `with` block closes the file; the original's extra f.close()
            # inside it was redundant.
            with open(txt_fn, 'r') as f:
                txt = f.readlines()
            spk = item_name.split("_")[0]
            # Example:
            #
            # 'item_name': '103_1241_000000_000001'
            # 'wav_fn': 'LibriTTS/train-clean-100/103/1241/103_1241_000000_000001.wav'
            # 'txt': 'matthew Cuthbert is surprised'
            # 'spk_name': '103'
            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt[0], 'spk_name': spk}
if __name__ == "__main__":
    # Script entry point: run the preprocessing pipeline for this dataset.
    preprocessor = LibrittsPreAlign()
    preprocessor.process()
================================================
FILE: NeuralSeq/egs/datasets/audio/libritts/pwg.yaml
================================================
base_config: egs/egs_bases/tts/vocoder/pwg.yaml
raw_data_dir: 'data/raw/LibriTTS'
processed_data_dir: 'data/processed/libritts'
binary_data_dir: 'data/binary/libritts_wav'
generator_params:
kernel_size: 5
num_spk: 400
max_samples: 20480
================================================
FILE: NeuralSeq/egs/datasets/audio/lj/base_mel2wav.yaml
================================================
raw_data_dir: 'data/raw/LJSpeech-1.1'
processed_data_dir: 'data/processed/ljspeech'
binary_data_dir: 'data/binary/ljspeech_wav'
binarization_args:
with_spk_embed: false
================================================
FILE: NeuralSeq/egs/datasets/audio/lj/preprocess.py
================================================
from data_gen.tts.base_preprocess import BasePreprocessor
class LJPreprocess(BasePreprocessor):
    """Metadata reader for ``<raw_data_dir>/metadata.csv``."""

    def meta_data(self):
        """Yield ``{'item_name', 'wav_fn', 'txt'}`` for each metadata line.

        Each line must hold exactly three ``|``-separated fields; the first is
        the item name, the third the text, and the second is ignored.

        Raises:
            ValueError: if a line does not split into exactly three fields.
        """
        # Read everything up front, as the original did, but close the file
        # deterministically instead of leaving the handle to the garbage collector.
        with open(f'{self.raw_data_dir}/metadata.csv') as f:
            lines = f.readlines()
        for line in lines:
            item_name, _, txt = line.strip().split("|")
            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
================================================
FILE: NeuralSeq/egs/datasets/audio/lj/pwg.yaml
================================================
base_config:
- egs/egs_bases/tts/vocoder/pwg.yaml
- ./base_mel2wav.yaml
================================================
FILE: NeuralSeq/egs/datasets/audio/vctk/base_mel2wav.yaml
================================================
raw_data_dir: 'data/raw/VCTK-Corpus'
processed_data_dir: 'data/processed/vctk'
binary_data_dir: 'data/binary/vctk_wav'
================================================
FILE: NeuralSeq/egs/datasets/audio/vctk/fs2.yaml
================================================
base_config:
- egs/egs_bases/tts/fs2.yaml
raw_data_dir: 'data/raw/VCTK-Corpus'
processed_data_dir: 'data/processed/vctk'
binary_data_dir: 'data/binary/vctk'
pre_align_cls: egs.datasets.audio.vctk.pre_align.VCTKPreAlign
use_spk_id: true
test_num: 200
num_spk: 400
binarization_args:
shuffle: true
trim_eos_bos: true
================================================
FILE: NeuralSeq/egs/datasets/audio/vctk/pre_align.py
================================================
import os
from data_gen.tts.base_pre_align import BasePreAlign
import glob
class VCTKPreAlign(BasePreAlign):
    """Metadata reader for VCTK-style data (``wav48/<dir>/<item>.wav``)."""

    def meta_data(self):
        """Yield ``(item_name, wav_fn, (self.load_txt, txt_fn), spk)`` per wav file.

        The transcript path is the wav path with its third-from-last component
        replaced by ``txt`` and its file name by ``<item_name>.txt``. Files
        without an existing transcript are skipped.
        """
        for wav_fn in glob.glob(f'{self.raw_data_dir}/wav48/*/*.wav'):
            item_name = os.path.basename(wav_fn)[:-4]
            spk = item_name.split("_")[0]
            parts = wav_fn.split("/")
            parts[-1] = f'{item_name}.txt'
            parts[-3] = 'txt'
            txt_fn = "/".join(parts)
            if not (os.path.exists(txt_fn) and os.path.exists(wav_fn)):
                continue
            yield item_name, wav_fn, (self.load_txt, txt_fn), spk
if __name__ == "__main__":
    # Script entry point: run the pre-alignment pipeline for this dataset.
    pre_aligner = VCTKPreAlign()
    pre_aligner.process()
================================================
FILE: NeuralSeq/egs/datasets/audio/vctk/pwg.yaml
================================================
base_config:
- egs/egs_bases/tts/vocoder/pwg.yaml
- ./base_mel2wav.yaml
num_spk: 400
max_samples: 20480
================================================
FILE: NeuralSeq/egs/egs_bases/config_base.yaml
================================================
# task
binary_data_dir: ''
work_dir: '' # experiment directory.
infer: false # inference
amp: false
seed: 1234
debug: false
save_codes: []
# - configs
# - modules
# - tasks
# - utils
# - usr
#############
# dataset
#############
ds_workers: 1
test_num: 100
endless_ds: false
sort_by_len: true
#########
# train and eval
#########
print_nan_grads: false
load_ckpt: ''
save_best: true
num_ckpt_keep: 3
clip_grad_norm: 0
accumulate_grad_batches: 1
tb_log_interval: 100
num_sanity_val_steps: 5 # steps of validation at the beginning
check_val_every_n_epoch: 10
val_check_interval: 2000
valid_monitor_key: 'val_loss'
valid_monitor_mode: 'min'
max_epochs: 1000
max_updates: 1000000
max_tokens: 31250
max_sentences: 100000
max_valid_tokens: -1
max_valid_sentences: -1
test_input_dir: ''
resume_from_checkpoint: 0
rename_tmux: true
================================================
FILE: NeuralSeq/egs/egs_bases/svs/base.yaml
================================================
task_cls: tasks.svs.task.DiffFsTask
pitch_type: frame
timesteps: 100
dilation_cycle_length: 1
residual_layers: 20
residual_channels: 256
lr: 0.001
decay_steps: 50000
keep_bins: 80
spec_min: [ ]
spec_max: [ ]
content_cond_steps: [ ] # [ 0, 10000 ]
spk_cond_steps: [ ] # [ 0, 10000 ]
# train and eval
fs2_ckpt: ''
max_updates: 400000
# max_updates: 200000
use_gt_dur: true
use_gt_f0: true
gen_tgt_spk_id: -1
max_sentences: 48
num_sanity_val_steps: 1
num_valid_plots: 1
================================================
FILE: NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml
================================================
base_config:
- configs/tts/lj/fs2.yaml
- ./base.yaml
# spec_min and spec_max are calculated on the training set.
spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672,
-4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759,
-4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733,
-5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510,
-5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916,
-4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875,
-5.0483, -5.0848, -5.1809, -5.0677, -5.0015, -5.0792, -5.0636, -5.2413,
-5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173,
-5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757,
-5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ]
spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.7093,
0.6461, 0.6420, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591,
0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492,
0.6909, 0.6106, 0.5761, 0.5936, 0.5638, 0.4054, 0.4545, 0.3589,
0.3037, 0.3380, 0.1599, 0.2433, 0.2741, 0.2130, 0.1569, 0.1911,
0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933,
-0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405,
-0.1244, -0.2116, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000,
0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566,
0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ]
task_cls: tasks.svs.diffspeech_task.DiffSpeechTask
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0414_hifi_lj_1
num_valid_plots: 10
use_gt_dur: false
use_gt_f0: false
pitch_type: cwt
pitch_extractor: 'parselmouth'
max_updates: 160000
lr: 0.001
timesteps: 100
K_step: 71
diff_loss_type: l1
diff_decoder_type: 'wavenet'
schedule_type: 'linear'
max_beta: 0.06
fs2_ckpt: checkpoints/fs2_lj_1/model_ckpt_steps_150000.ckpt
save_gt: true
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml
================================================
base_config:
- configs/singing/fs2.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
audio_sample_rate: 24000
hop_size: 128 # Hop size.
fft_size: 512 # FFT size.
win_size: 512 # Window size.
fmin: 30
fmax: 12000
min_level_db: -120
binarization_args:
with_wav: true
with_spk_embed: false
with_align: true
raw_data_dir: 'data/raw/opencpop/segments'
processed_data_dir: 'xxx'
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
use_midi: true # for midi exp
use_gt_f0: false # for midi exp
use_gt_dur: false # for further midi exp
lambda_f0: 1.0
lambda_uv: 1.0
#lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
pe_enable: false
pe_ckpt: ''
num_spk: 1
test_prefixes: [
'2044',
'2086',
'2092',
'2093',
'2100',
]
task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
use_nsf: true
# config for experiments
max_frames: 5000
max_tokens: 40000
predictor_layers: 5
rel_pos: true
dur_predictor_layers: 5 # *
use_spk_embed: false
num_valid_plots: 10
max_updates: 160000
save_gt: true
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml
================================================
base_config:
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_f0: false # for midi exp
use_gt_dur: false # for further midi exp
lambda_f0: 1.0
lambda_uv: 1.0
#lambda_energy: 0.1
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
pe_enable: false
pe_ckpt: ''
fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' #
#num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 60
max_tokens: 36000
predictor_layers: 5
dilation_cycle_length: 4 # *
rel_pos: true
dur_predictor_layers: 5 # *
max_updates: 160000
gaussian_start: false
mask_uv_prob: 0.15
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
================================================
spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,
-6., -6., -6., -6., -6., -6., -6., -6.]
spec_max: [-7.9453e-01, -8.1116e-01, -6.1631e-01, -3.0679e-01, -1.3863e-01,
-5.0652e-02, -1.1563e-01, -1.0679e-01, -9.1068e-02, -6.2174e-02,
-7.5302e-02, -7.2217e-02, -6.3815e-02, -7.3299e-02, 7.3610e-03,
-7.2508e-02, -5.0234e-02, -1.6534e-01, -2.6928e-01, -2.0782e-01,
-2.0823e-01, -1.1702e-01, -7.0128e-02, -6.5868e-02, -1.2675e-02,
1.5121e-03, -8.9902e-02, -2.1392e-01, -2.3789e-01, -2.8922e-01,
-3.0405e-01, -2.3029e-01, -2.2088e-01, -2.1542e-01, -2.9367e-01,
-3.0137e-01, -3.8281e-01, -4.3590e-01, -2.8681e-01, -4.6855e-01,
-5.7485e-01, -4.7022e-01, -5.4266e-01, -4.4848e-01, -6.4120e-01,
-6.8700e-01, -6.4860e-01, -7.6436e-01, -4.9971e-01, -7.1068e-01,
-6.9724e-01, -6.1487e-01, -5.5843e-01, -6.9773e-01, -5.7502e-01,
-7.0919e-01, -8.2431e-01, -8.4213e-01, -9.0431e-01, -8.2840e-01,
-7.7945e-01, -8.2758e-01, -8.7699e-01, -1.0532e+00, -1.0766e+00,
-1.1198e+00, -1.0185e+00, -9.8983e-01, -1.0001e+00, -1.0756e+00,
-1.0024e+00, -1.0304e+00, -1.0579e+00, -1.0188e+00, -1.0500e+00,
-1.0842e+00, -1.0923e+00, -1.1223e+00, -1.2381e+00, -1.6467e+00]
mel_vmin: -6. #-6.
mel_vmax: 1.5
wav2spec_eps: 1e-6
raw_data_dir: 'data/raw/opencpop/segments'
processed_data_dir: 'xxx'
binary_data_dir: 'data/binary/opencpop-midi-dp'
datasets: [
'opencpop',
]
test_prefixes: [
'2044',
'2086',
'2092',
'2093',
'2100',
]
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml
================================================
base_config:
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
timesteps: 1000
K_step: 1000
max_beta: 0.02
max_tokens: 36000
max_updates: 320000
gaussian_start: True
use_pitch_embed: false
use_gt_f0: false # for midi exp
lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 10 # *
rel_pos: true
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml
================================================
base_config:
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
# for diffusion schedule
timesteps: 1000
K_step: 1000
max_beta: 0.02
max_tokens: 36000
max_updates: 320000
gaussian_start: True
pndm_speedup: 10
use_pitch_embed: false
use_gt_f0: false # for midi exp
lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 4 # *
rel_pos: true
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml
================================================
base_config:
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 36000
max_updates: 160000
gaussian_start: True
use_pitch_embed: false
use_gt_f0: false # for midi exp
lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 4 # *
rel_pos: true
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml
================================================
base_config:
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml
binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer
binary_data_dir: 'data/binary/popcs-midi-dp'
#switch_midi2f0_step: 174000
use_midi: true # for midi exp
use_gt_dur: false # for further midi exp
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 40000
max_updates: 160000
gaussian_start: True
use_pitch_embed: false
use_gt_f0: false # for midi exp
lambda_f0: 0.
lambda_uv: 0.
dilation_cycle_length: 4 # *
rel_pos: true
predictor_layers: 5
pe_enable: true
pe_ckpt: 'checkpoints/0102_xiaoma_pe'
================================================
FILE: NeuralSeq/egs/egs_bases/svs/midi/pe.yaml
================================================
base_config:
- configs/tts/lj/fs2.yaml
max_frames: 8000
audio_sample_rate: 24000
hop_size: 128 # Hop size.
fft_size: 512 # FFT size.
win_size: 512 # Window size.
fmin: 30
fmax: 12000
min_level_db: -120
binary_data_dir: 'xxx'
pitch_type: frame
task_cls: tasks.tts.pe.PitchExtractionTask
pitch_extractor_conv_layers: 2
# config for experiments
max_tokens: 20000
use_spk_embed: false
num_valid_plots: 10
max_updates: 60000
================================================
FILE: NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml
================================================
base_config:
- configs/tts/fs2.yaml
- configs/singing/base.yaml
- ./base.yaml
audio_sample_rate: 24000
hop_size: 128 # Hop size.
fft_size: 512 # FFT size.
win_size: 512 # Window size.
fmin: 30
fmax: 12000
min_level_db: -120
binarization_args:
with_wav: true
with_spk_embed: false
with_align: true
raw_data_dir: 'data/raw/popcs'
processed_data_dir: 'data/processed/popcs'
binary_data_dir: 'data/binary/popcs-pmf0'
num_spk: 1
datasets: [
'popcs',
]
test_prefixes: [
'popcs-说散就散',
'popcs-隐形的翅膀',
]
spec_min: [-6.8276, -7.0270, -6.8142, -7.1429, -7.6669, -7.6000, -7.1148, -6.9640,
-6.8414, -6.6596, -6.6880, -6.7439, -6.7986, -7.4940, -7.7845, -7.6586,
-6.9288, -6.7639, -6.9118, -6.8246, -6.7183, -7.1769, -6.9794, -7.4513,
-7.3422, -7.5623, -6.9610, -6.8158, -6.9595, -6.8403, -6.5688, -6.6356,
-7.0209, -6.5002, -6.7819, -6.5232, -6.6927, -6.5701, -6.5531, -6.7069,
-6.6462, -6.4523, -6.5954, -6.4264, -6.4487, -6.7070, -6.4025, -6.3042,
-6.4008, -6.3857, -6.3903, -6.3094, -6.2491, -6.3518, -6.3566, -6.4168,
-6.2481, -6.3624, -6.2858, -6.2575, -6.3638, -6.4520, -6.1835, -6.2754,
-6.1253, -6.1645, -6.0638, -6.1262, -6.0710, -6.1039, -6.4428, -6.1363,
-6.1054, -6.1252, -6.1797, -6.0235, -6.0758, -5.9453, -6.0213, -6.0446]
spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.1212,
0.2421, 0.1809, 0.2134, 0.3161, 0.3301, 0.3289, 0.2667, 0.2421,
0.2581, 0.2600, 0.1394, 0.1907, 0.1082, 0.1474, 0.1680, 0.2550,
0.1057, 0.0826, 0.0423, 0.1203, -0.0701, -0.0056, 0.0477, -0.0639,
-0.0272, -0.0728, -0.1648, -0.0855, -0.2652, -0.1998, -0.1547, -0.2167,
-0.4181, -0.5463, -0.4161, -0.4733, -0.6518, -0.5387, -0.4290, -0.4191,
-0.4151, -0.3042, -0.3810, -0.4160, -0.4496, -0.2847, -0.4676, -0.4658,
-0.4931, -0.4885, -0.5547, -0.5481, -0.6948, -0.7968, -0.8455, -0.8392,
-0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035,
-0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766]
task_cls: tasks.svs.diffsinger_task.DiffSingerTask
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
pitch_extractor: 'parselmouth'
# config for experiments
use_spk_embed: false
num_valid_plots: 10
max_updates: 160000
lr: 0.001
timesteps: 100
K_step: 51
diff_loss_type: l1
diff_decoder_type: 'wavenet'
schedule_type: 'linear'
max_beta: 0.06
fs2_ckpt: ''
use_nsf: true
================================================
FILE: NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml
================================================
base_config:
- ./popcs_ds_beta6.yaml
fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer
num_valid_plots: 0
task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask
# tmp:
#pe_enable: true
#pe_ckpt: ''
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
================================================
FILE: NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml
================================================
base_config:
- configs/singing/fs2.yaml
audio_sample_rate: 24000
hop_size: 128 # Hop size.
fft_size: 512 # FFT size.
win_size: 512 # Window size.
fmin: 30
fmax: 12000
min_level_db: -120
binarization_args:
with_wav: true
with_spk_embed: false
with_align: true
raw_data_dir: 'data/raw/popcs'
processed_data_dir: 'data/processed/popcs'
binary_data_dir: 'data/binary/popcs-pmf0'
num_spk: 1
datasets: [
'popcs',
]
test_prefixes: [
'popcs-说散就散',
'popcs-隐形的翅膀',
]
task_cls: tasks.tts.fs2.FastSpeech2Task
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
use_nsf: true
# config for experiments
max_tokens: 18000
use_spk_embed: false
num_valid_plots: 10
max_updates: 160000
save_gt: true
# tmp:
#pe_enable: true
#pe_ckpt: ''
================================================
FILE: NeuralSeq/egs/egs_bases/tts/base.yaml
================================================
# task
base_config: ../config_base.yaml
task_cls: ''
#############
# dataset
#############
raw_data_dir: ''
processed_data_dir: ''
binary_data_dir: ''
dict_dir: ''
pre_align_cls: ''
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
pre_align_args:
txt_processor: en
use_tone: true # for ZH
sox_resample: false
sox_to_wav: false
allow_no_txt: false
trim_sil: false
denoise: false
binarization_args:
shuffle: false
with_txt: true
with_wav: false
with_align: true
with_spk_embed: false
with_spk_id: true
with_f0: true
with_f0cwt: false
with_linear: false
with_word: true
trim_sil: false
trim_eos_bos: false
reset_phone_dict: true
reset_word_dict: true
word_size: 30000
pitch_extractor: parselmouth
loud_norm: false
endless_ds: true
test_num: 100
min_frames: 0
max_frames: 1548
frames_multiple: 1
max_input_tokens: 1550
audio_num_mel_bins: 80
audio_sample_rate: 22050
hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
fmax: 7600 # To be increased/reduced depending on data.
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
min_level_db: -100
ref_level_db: 20
griffin_lim_iters: 60
num_spk: 1
mel_vmin: -6
mel_vmax: 1.5
ds_workers: 1
#########
# model
#########
dropout: 0.1
enc_layers: 4
dec_layers: 4
hidden_size: 256
num_heads: 2
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
ffn_act: gelu
ffn_padding: 'SAME'
use_spk_id: false
use_split_spk_id: false
use_spk_embed: false
###########
# optimization
###########
lr: 2.0
scheduler: rsqrt # rsqrt|none
warmup_updates: 8000
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
weight_decay: 0
clip_grad_norm: 1
clip_grad_value: 0
###########
# train and eval
###########
max_tokens: 30000
max_sentences: 100000
max_valid_sentences: 1
max_valid_tokens: 60000
valid_infer_interval: 10000
train_set_name: 'train'
train_sets: ''
valid_set_name: 'valid'
test_set_name: 'test'
num_test_samples: 0
num_valid_plots: 10
test_ids: [ ]
vocoder_denoise_c: 0.0
profile_infer: false
out_wav_norm: false
save_gt: true
save_f0: false
gen_dir_name: ''
================================================
FILE: NeuralSeq/egs/egs_bases/tts/base_zh.yaml
================================================
base_config: ./base.yaml
preprocess_args:
txt_processor: zh
use_tone: true
word_size: 3000
================================================
FILE: NeuralSeq/egs/egs_bases/tts/fs2.yaml
================================================
base_config: ./base.yaml
task_cls: tasks.tts.fs2.FastSpeech2Task
# model
hidden_size: 256
dropout: 0.1
encoder_type: fft # rel_fft|fft|tacotron|tacotron2|conformer
decoder_type: fft # fft|rnn|conv|conformer|wn
# rnn enc/dec
encoder_K: 8
decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
# fft enc/dec
use_pos_embed: true
dec_num_heads: 2
dec_layers: 4
ffn_hidden_size: 1024
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
# conv enc/dec
enc_dec_norm: ln
conv_use_pos: false
layers_in_block: 2
enc_dilations: [ 1, 1, 1, 1 ]
enc_kernel_size: 5
dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
dec_kernel_size: 5
dur_loss: mse # huber|mol
# duration
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2
dur_predictor_kernel: 3
dur_predictor_layers: 2
predictor_dropout: 0.5
# pitch and energy
pitch_norm: standard # standard|log
use_pitch_embed: true
pitch_type: frame # frame|ph|cwt
use_uv: true
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_add_f0_loss: false
cwt_std_scale: 0.8
pitch_ar: false
pitch_embed_type: 0
pitch_loss: 'l1' # l1|l2|ssim
pitch_ssim_win: 11
use_energy_embed: false
# reference encoder and speaker embedding
use_ref_enc: false
use_var_enc: false
lambda_commit: 0.25
var_enc_vq_codes: 64
ref_norm_layer: bn
dec_inp_add_noise: false
sil_add_noise: false
ref_hidden_stride_kernel:
- 0,3,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
- 0,3,5
- 0,2,5
- 0,2,5
- 0,2,5
pitch_enc_hidden_stride_kernel:
- 0,2,5 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
- 0,2,5
- 0,2,5
dur_enc_hidden_stride_kernel:
- 0,2,3 # conv_hidden_size, conv_stride, conv_kernel_size. conv_hidden_size=0: use hidden_size
- 0,2,3
- 0,1,3
# mel
mel_loss: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
# loss lambda
lambda_f0: 1.0
lambda_uv: 1.0
lambda_energy: 0.1
lambda_ph_dur: 0.1
lambda_sent_dur: 1.0
lambda_word_dur: 1.0
predictor_grad: 0.1
# train and eval
pretrain_fs_ckpt: ''
warmup_updates: 2000
max_tokens: 32000
max_sentences: 100000
max_valid_sentences: 1
max_updates: 120000
use_gt_dur: false
use_gt_f0: false
ds_workers: 2
lr: 1.0
================================================
FILE: NeuralSeq/egs/egs_bases/tts/fs2_adv.yaml
================================================
base_config: ./fs2.yaml
task_cls: tasks.tts.fs2_adv.FastSpeech2AdvTask
disc_win_num: 3
disc_interval: 1
disc_reduction: stack # stack|sum|none
disc_start_steps: 0
rerun_gen: false
disc_norm: in
mel_disc_hidden_size: 128
# mel decoder
mel_gan: true
lambda_mel_adv: 0.1
mel_hidden_size: 256
# others
dropout: 0.05
pitch_embed_type: 0
enc_ffn_kernel_size: 9
dec_ffn_kernel_size: 9
use_cond_disc: false
optimizer_adam_beta1: 0.5
optimizer_adam_beta2: 0.999
generator_grad_norm: 5.0 # Generator's gradient norm.
disc_hidden_size: 128
disc_lr: 0.0001 # Discriminator's learning rate.
discriminator_optimizer_params:
eps: 1.0e-6 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
step_size: 60000 # Discriminator's scheduler step size.
gamma: 0.5 # Discriminator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
max_updates: 400000
max_tokens: 30000
max_sentences: 80
val_check_interval: 2000
gen_dir_name: ''
num_ckpt_keep: 2
save_best: false
================================================
FILE: NeuralSeq/egs/egs_bases/tts/ps.yaml
================================================
base_config: ./fs2.yaml
###########################
# models
###########################
# encoders
hidden_size: 192
ffn_hidden_size: 768
enc_ffn_kernel_size: 5
enc_layers: 4
dur_level: word
encoder_type: rel_fft
use_word_encoder: true
# mix ling encoder
word_enc_layers: 4
word_encoder_type: rel_fft
use_pitch_embed: false
enc_prenet: true
enc_pre_ln: true
text_encoder_postnet: true
dropout: 0.0
add_word_pos: true
# dur predictor
dur_predictor_layers: 3
dur_predictor_kernel: 5
predictor_dropout: 0.2
## fvae
use_fvae: true
latent_size: 16
fvae_encoder_type: conv
fvae_decoder_type: conv
fvae_enc_dec_hidden: 192
fvae_kernel_size: 5
fvae_enc_n_layers: 8
fvae_dec_n_layers: 4
fvae_strides: 4
fvae_noise_scale: 1.0
# prior flow
use_prior_flow: true
prior_flow_hidden: 64
prior_flow_kernel_size: 3
prior_flow_n_blocks: 4
###########################
# training and inference
###########################
lambda_kl: 1.0
kl_min: 0.0
lambda_sent_dur: 0.0
kl_start_steps: 10000
posterior_start_steps: 0
frames_multiple: 4
num_valid_plots: 10
lr: 0.0002
warmup_updates: 8000
max_tokens: 40000
valid_infer_interval: 10000
max_sentences: 80
max_updates: 480000
================================================
FILE: NeuralSeq/egs/egs_bases/tts/ps_flow.yaml
================================================
# Post-flow variant of the PortaSpeech config; inherits the base settings
# from ps.yaml in this directory.
base_config: ./ps.yaml
task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
# post-net flow
use_post_flow: true
detach_postflow_input: true
post_flow_lr: 0.001
post_glow_hidden: 192
post_glow_kernel_size: 3
post_glow_n_blocks: 12
post_glow_n_block_layers: 3
post_share_cond_layers: false
share_wn_layers: 4
use_cond_proj: false
use_latent_cond: false
use_txt_cond: true
sigmoid_scale: false
post_glow_training_start: 160000
noise_scale: 0.8
infer_post_glow: true
two_stage: true
================================================
FILE: NeuralSeq/egs/egs_bases/tts/ps_flow_small.yaml
================================================
base_config: ./ps_flow.yaml
###########################
# models
###########################
# encoders
hidden_size: 128
ffn_hidden_size: 512
enc_ffn_kernel_size: 3
enc_layers: 3
word_enc_layers: 3
# dur predictor
dur_predictor_layers: 3
dur_predictor_kernel: 5
predictor_dropout: 0.2
## fvae
use_fvae: true
latent_size: 16
fvae_encoder_type: wn
fvae_decoder_type: wn
fvae_enc_dec_hidden: 128
fvae_kernel_size: 3
fvae_enc_n_layers: 8
fvae_dec_n_layers: 3
fvae_strides: 4
fvae_noise_scale: 1.0
# prior flow
use_prior_flow: true
prior_flow_hidden: 32
prior_flow_kernel_size: 3
prior_flow_n_blocks: 3
# post flow
post_glow_hidden: 128
post_glow_kernel_size: 3
post_glow_n_blocks: 8
post_glow_n_block_layers: 3
share_wn_layers: 4
noise_scale: 0.6
================================================
FILE: NeuralSeq/egs/egs_bases/tts/vocoder/base.yaml
================================================
base_config: ../base.yaml
binarization_args:
with_wav: true
with_spk_embed: false
with_align: false
with_word: false
with_txt: false
###########
# train and eval
###########
max_samples: 25600
max_sentences: 5
max_valid_sentences: 1
max_updates: 1000000
val_check_interval: 2000
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fft_size: 1024 # FFT size.
hop_size: 256 # Hop size.
win_length: null # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
num_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation.
fmax: 7600 # Maximum frequency in mel basis calculation.
aux_context_window: 0 # Context window size for auxiliary feature.
use_pitch_embed: false
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
disc_start_steps: 40000 # Number of steps to start to train discriminator.
================================================
FILE: NeuralSeq/egs/egs_bases/tts/vocoder/hifigan.yaml
================================================
base_config: ./base.yaml
task_cls: tasks.vocoder.hifigan.HifiGanTask
resblock: "1"
adam_b1: 0.8
adam_b2: 0.99
upsample_rates: [ 8,8,2,2 ]
upsample_kernel_sizes: [ 16,16,4,4 ]
upsample_initial_channel: 512
resblock_kernel_sizes: [ 3,7,11 ]
resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
use_pitch_embed: false
use_fm_loss: false
use_ms_stft: false
lambda_mel: 5.0
lambda_mel_adv: 1.0
lambda_cdisc: 4.0
lambda_adv: 1.0
lr: 0.0002 # Generator's learning rate.
generator_scheduler_params:
step_size: 600
gamma: 0.999
discriminator_scheduler_params:
step_size: 600
gamma: 0.999
max_updates: 3000000
================================================
FILE: NeuralSeq/egs/egs_bases/tts/vocoder/pwg.yaml
================================================
base_config: ./base.yaml
task_cls: tasks.vocoder.pwg.PwgTask
aux_context_window: 2 # Context window size for auxiliary feature.
use_pitch_embed: false
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of dilated convolution.
layers: 30 # Number of residual block layers.
stacks: 3 # Number of stacks i.e., dilation cycles.
residual_channels: 64 # Number of channels in residual conv.
gate_channels: 128 # Number of channels in gated conv.
skip_channels: 64 # Number of channels in skip conv.
aux_channels: 80 # Number of channels for auxiliary feature conv.
# Must be the same as num_mels.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_net: "ConvInUpsampleNetwork" # Upsampling network architecture.
upsample_params: # Upsampling network parameters.
upsample_scales: [4, 4, 4, 4] # Upsampling scales. Product of these must be the same as hop size.
use_pitch_embed: false
use_nsf: false
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of channels in conv layers.
bias: true # Whether to use bias parameter in conv.
use_weight_norm: true # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "LeakyReLU" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
negative_slope: 0.2 # Alpha in LeakyReLU.
rerun_gen: true
###########################################################
# STFT LOSS SETTING #
###########################################################
stft_loss_params:
fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss
win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
window: "hann_window" # Window function for STFT-based loss
use_mel_loss: false
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 4.0 # Loss balancing coefficient.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
lr: 0.0001 # Generator's learning rate.
eps: 1.0e-6 # Generator's epsilon.
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
step_size: 200000 # Generator's scheduler step size.
gamma: 0.5 # Generator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
generator_grad_norm: 10 # Generator's gradient norm.
discriminator_optimizer_params:
lr: 0.00005 # Discriminator's learning rate.
eps: 1.0e-6 # Discriminator's epsilon.
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
step_size: 200000 # Discriminator's scheduler step size.
gamma: 0.5 # Discriminator's scheduler gamma.
# At each step size, lr will be multiplied by this parameter.
discriminator_grad_norm: 1 # Discriminator's gradient norm.
disc_start_steps: 40000 # Number of steps to start to train discriminator.
================================================
FILE: NeuralSeq/gitattributes
================================================
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
model_ckpt_steps* filter=lfs diff=lfs merge=lfs -text
checkpoints/0831_opencpop_ds1000 filter=lfs diff=lfs merge=lfs -text
================================================
FILE: NeuralSeq/inference/svs/base_svs_infer.py
================================================
import os
import torch
import numpy as np
from modules.hifigan.hifigan import HifiGanGenerator
from vocoders.hifigan import HifiGAN
from inference.svs.opencpop.map import cpop_pinyin2ph_func
from utils import load_ckpt
from utils.hparams import set_hparams, hparams
from utils.text_encoder import TokenTextEncoder
from pypinyin import pinyin, lazy_pinyin, Style
import librosa
import glob
import re
class BaseSVSInfer:
    """Base class for singing-voice-synthesis (SVS) inference pipelines.

    Subclasses must implement :meth:`build_model` and :meth:`forward_model`.
    This class supplies the shared pieces: input preprocessing (word-level
    lyrics or phoneme-level sequences plus notes), batching, and loading /
    running the HifiGAN vocoder.
    """

    def __init__(self, hparams, device=None):
        """Build the phoneme encoder, acoustic model and vocoder.

        :param hparams: hyper-parameter mapping, stored on ``self.hparams``.
            NOTE: the other methods read the module-level ``hparams`` object.
        :param device: torch device string; defaults to ``'cuda'`` when
            available, otherwise ``'cpu'``.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.hparams = hparams
        self.device = device

        phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
                      "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
                      "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
                      "van", "ve", "vn", "w", "x", "y", "z", "zh"]
        self.ph_encoder = TokenTextEncoder(None, vocab_list=phone_list, replace_oov=',')
        self.pinyin2phs = cpop_pinyin2ph_func()
        self.spk_map = {'opencpop': 0}

        self.model = self.build_model()
        self.model.eval()
        self.model.to(self.device)
        self.vocoder = self.build_vocoder()
        self.vocoder.eval()
        self.vocoder.to(self.device)

    def build_model(self):
        """Return the acoustic model; must be overridden by subclasses."""
        raise NotImplementedError

    def forward_model(self, inp):
        """Run the acoustic model on a preprocessed item; must be overridden."""
        raise NotImplementedError

    def build_vocoder(self):
        """Load the newest HifiGAN generator checkpoint from ``hparams['vocoder_ckpt']``.

        The directory is expected to contain ``config.yaml`` and one or more
        ``model_ckpt_steps_<N>.ckpt`` files; the file with the largest ``N``
        is loaded.

        :return: the generator, weight norm removed, in eval mode on ``self.device``.
        :raises FileNotFoundError: if no checkpoint file is found.
        """
        base_dir = hparams['vocoder_ckpt']
        config_path = f'{base_dir}/config.yaml'
        ckpt_paths = glob.glob(f'{base_dir}/model_ckpt_steps_*.ckpt')
        if not ckpt_paths:
            raise FileNotFoundError(f'No model_ckpt_steps_*.ckpt found in {base_dir}')

        def ckpt_steps(path):
            # Match against the file name only: interpolating base_dir into the
            # pattern breaks when the path contains regex metacharacters or
            # OS-specific separators.
            return int(re.findall(r'model_ckpt_steps_(\d+)\.ckpt', os.path.basename(path))[0])

        ckpt = sorted(ckpt_paths, key=ckpt_steps)[-1]
        print('| load HifiGAN: ', ckpt)
        ckpt_dict = torch.load(ckpt, map_location="cpu")
        config = set_hparams(config_path, global_hparams=False)
        state = ckpt_dict["state_dict"]["model_gen"]
        vocoder = HifiGanGenerator(config)
        vocoder.load_state_dict(state, strict=True)
        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(self.device)
        return vocoder

    def run_vocoder(self, c, **kwargs):
        """Convert a mel-spectrogram batch to a waveform.

        :param c: mel tensor; dims 1 and 2 are swapped before the vocoder call.
        :param kwargs: optional ``f0``, passed to the vocoder only when
            ``hparams['use_nsf']`` is truthy.
        :return: the flattened vocoder output with a leading dim of 1.
        """
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]
        if f0 is not None and hparams.get('use_nsf'):
            # f0 = torch.FloatTensor(f0).to(self.device)
            y = self.vocoder(c, f0).view(-1)
        else:
            y = self.vocoder(c).view(-1)
            # [T]
        return y[None]

    def preprocess_word_level_input(self, inp):
        """Expand word-level lyrics and '|'-separated notes to phoneme level.

        :param inp: dict with ``'text'`` (lyrics), ``'notes'`` and
            ``'notes_duration'``; the latter two hold one '|'-separated window
            per word, each window containing one or more space-separated values.
        :return: ``(ph_seq, note_lst, midi_dur_lst, is_slur)`` or ``None`` when
            the word / note / duration counts do not line up.
        """
        # Pypinyin can't solve polyphonic words
        text_raw = inp['text'].replace('最长', '最常').replace('长睫毛', '常睫毛') \
            .replace('那么长', '那么常').replace('多长', '多常') \
            .replace('很长', '很常')  # We hope someone could provide a better g2p module for us by opening pull requests.

        # lyric
        pinyins = lazy_pinyin(text_raw, strict=False)
        ph_per_word_lst = [self.pinyin2phs[pinyin.strip()] for pinyin in pinyins if pinyin.strip() in self.pinyin2phs]

        # Note
        note_per_word_lst = [x.strip() for x in inp['notes'].split('|') if x.strip() != '']
        mididur_per_word_lst = [x.strip() for x in inp['notes_duration'].split('|') if x.strip() != '']

        if len(note_per_word_lst) == len(ph_per_word_lst) == len(mididur_per_word_lst):
            print('Pass word-notes check.')
        else:
            print('The number of words does\'t match the number of notes\' windows. ',
                  'You should split the note(s) for each word by | mark.')
            print(ph_per_word_lst, note_per_word_lst, mididur_per_word_lst)
            print(len(ph_per_word_lst), len(note_per_word_lst), len(mididur_per_word_lst))
            return None

        note_lst = []
        ph_lst = []
        midi_dur_lst = []
        is_slur = []
        for idx, ph_per_word in enumerate(ph_per_word_lst):
            # for phs in one word:
            # single ph like ['ai']  or multiple phs like ['n', 'i']
            ph_in_this_word = ph_per_word.split()

            # for notes in one word:
            # single note like ['D4'] or multiple notes like ['D4', 'E4'] which means a 'slur' here.
            note_in_this_word = note_per_word_lst[idx].split()
            midi_dur_in_this_word = mididur_per_word_lst[idx].split()
            if len(midi_dur_in_this_word) < len(note_in_this_word):
                # Every note needs its own duration; without this guard the
                # indexing below would fail with an uninformative IndexError.
                print('The number of notes doesn\'t match the number of durations in one word: ',
                      note_in_this_word, midi_dur_in_this_word)
                return None
            # process for the model input
            # Step 1.
            #  Deal with note of 'not slur' case or the first note of 'slur' case
            #  j        ie
            #  F#4/Gb4  F#4/Gb4
            #  0        0
            for ph in ph_in_this_word:
                ph_lst.append(ph)
                note_lst.append(note_in_this_word[0])
                midi_dur_lst.append(midi_dur_in_this_word[0])
                is_slur.append(0)
            # step 2.
            #  Deal with the 2nd, 3rd... notes of 'slur' case
            #  j        ie         ie
            #  F#4/Gb4  F#4/Gb4    C#4/Db4
            #  0        0          1
            if len(note_in_this_word) > 1:  # is_slur = True, we should repeat the YUNMU to match the 2nd, 3rd... notes.
                # A separate index name keeps the outer word index untouched.
                for note_idx in range(1, len(note_in_this_word)):
                    ph_lst.append(ph_in_this_word[-1])
                    note_lst.append(note_in_this_word[note_idx])
                    midi_dur_lst.append(midi_dur_in_this_word[note_idx])
                    is_slur.append(1)
        ph_seq = ' '.join(ph_lst)

        if len(ph_lst) == len(note_lst) == len(midi_dur_lst):
            print(len(ph_lst), len(note_lst), len(midi_dur_lst))
            print('Pass word-notes check.')
        else:
            print('The number of words does\'t match the number of notes\' windows. ',
                  'You should split the note(s) for each word by | mark.')
            return None
        return ph_seq, note_lst, midi_dur_lst, is_slur

    def preprocess_phoneme_level_input(self, inp):
        """Parse input that is already aligned at phoneme level.

        :param inp: dict with space-separated ``'ph_seq'``, ``'note_seq'``,
            ``'note_dur_seq'`` and ``'is_slur_seq'`` strings.
        :return: ``(ph_seq, note_lst, midi_dur_lst, is_slur)`` or ``None`` when
            the phoneme, note and duration counts differ.
        """
        ph_seq = inp['ph_seq']
        note_lst = inp['note_seq'].split()
        midi_dur_lst = inp['note_dur_seq'].split()
        is_slur = [float(x) for x in inp['is_slur_seq'].split()]
        print(len(note_lst), len(ph_seq.split()), len(midi_dur_lst))
        if len(note_lst) == len(ph_seq.split()) == len(midi_dur_lst):
            print('Pass word-notes check.')
        else:
            print('The number of words does\'t match the number of notes\' windows. ',
                  'You should split the note(s) for each word by | mark.')
            return None
        return ph_seq, note_lst, midi_dur_lst, is_slur

    def preprocess_input(self, inp, input_type='word'):
        """Turn raw user input into a model-ready item dict.

        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
            plus the note fields required by the chosen ``input_type``
            (``notes`` / ``notes_duration`` for ``'word'``; ``ph_seq`` /
            ``note_seq`` / ``note_dur_seq`` / ``is_slur_seq`` for ``'phoneme'``).
        :param input_type: ``'word'`` or ``'phoneme'``.
        :return: item dict, or ``None`` if the input is invalid.
        """
        item_name = inp.get('item_name', '<ITEM_NAME>')
        spk_name = inp.get('spk_name', 'opencpop')

        # single spk
        spk_id = self.spk_map[spk_name]

        # get ph seq, note lst, midi dur lst, is slur lst.
        if input_type == 'word':
            ret = self.preprocess_word_level_input(inp)
        elif input_type == 'phoneme':  # like transcriptions.txt in Opencpop dataset.
            ret = self.preprocess_phoneme_level_input(inp)
        else:
            print('Invalid input type.')
            return None

        if ret:
            ph_seq, note_lst, midi_dur_lst, is_slur = ret
        else:
            print('==========> Preprocess_word_level or phone_level input wrong.')
            return None

        # convert note lst to midi id; convert note dur lst to midi duration
        try:
            # Only the part before '/' of names like 'C#4/Db4' is converted;
            # 'rest' maps to 0.
            midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                     for x in note_lst]
            midi_dur_lst = [float(x) for x in midi_dur_lst]
        except Exception as e:
            print(e)
            print('Invalid Input Type.')
            return None

        ph_token = self.ph_encoder.encode(ph_seq)
        item = {'item_name': item_name, 'text': inp['text'], 'ph': ph_seq, 'spk_id': spk_id,
                'ph_token': ph_token, 'pitch_midi': np.asarray(midis), 'midi_dur': np.asarray(midi_dur_lst),
                'is_slur': np.asarray(is_slur), }
        item['ph_len'] = len(item['ph_token'])
        return item

    def input_to_batch(self, item):
        """Wrap one preprocessed item into a batch of size 1 on ``self.device``.

        Note-level tensors are truncated to ``hparams['max_frames']`` entries.
        """
        item_names = [item['item_name']]
        text = [item['text']]
        ph = [item['ph']]
        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
        # spk_id is a scalar int. torch.LongTensor(int) would allocate a tensor
        # of that *size* (empty for id 0) rather than hold the id, so wrap it in
        # a list. Resulting shape is [1], like txt_lengths.
        # NOTE(review): assumes the model expects spk ids shaped [B] - confirm.
        spk_ids = torch.LongTensor([item['spk_id']]).to(self.device)

        pitch_midi = torch.LongTensor(item['pitch_midi'])[None, :hparams['max_frames']].to(self.device)
        midi_dur = torch.FloatTensor(item['midi_dur'])[None, :hparams['max_frames']].to(self.device)
        is_slur = torch.LongTensor(item['is_slur'])[None, :hparams['max_frames']].to(self.device)

        batch = {
            'item_name': item_names,
            'text': text,
            'ph': ph,
            'txt_tokens': txt_tokens,
            'txt_lengths': txt_lengths,
            'spk_ids': spk_ids,
            'pitch_midi': pitch_midi,
            'midi_dur': midi_dur,
            'is_slur': is_slur
        }
        return batch

    def postprocess_output(self, output):
        """Hook for subclasses; returns ``output`` unchanged."""
        return output

    def infer_once(self, inp):
        """Preprocess ``inp``, run the model and post-process the result.

        :param inp: raw input dict; ``inp['input_type']`` selects ``'word'``
            (default when missing or empty) or ``'phoneme'`` preprocessing.
        :raises ValueError: if preprocessing rejects the input.
        """
        item = self.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        if item is None:
            # Fail here with a clear message instead of an opaque TypeError
            # from indexing None inside forward_model.
            raise ValueError('Invalid input: preprocessing failed, see the messages printed above.')
        output = self.forward_model(item)
        output = self.postprocess_output(output)
        return output

    @classmethod
    def example_run(cls, inp):
        """Run one inference with global hparams and save ``infer_out/example_out.wav``."""
        from utils.audio import save_wav
        set_hparams(print_hparams=False)
        infer_ins = cls(hparams)
        out = infer_ins.infer_once(inp)
        os.makedirs('infer_out', exist_ok=True)
        save_wav(out, f'infer_out/example_out.wav', hparams['audio_sample_rate'])
# if __name__ == '__main__':
# debug
# a = BaseSVSInfer(hparams)
# a.preprocess_input({'text': '你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP',
# 'notes': 'D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest',
# 'notes_duration': '0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590'
# })
# b = {
# 'text': '小酒窝长睫毛AP是你最美的记号',
# 'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
# 'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340'
# }
# c = {
# 'text': '小酒窝长睫毛AP是你最美的记号',
# 'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
# 'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
# 'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
# 'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
# } # input like Opencpop dataset.
# a.preprocess_input(b)
# a.preprocess_input(c, input_type='phoneme')
================================================
FILE: NeuralSeq/inference/svs/ds_cascade.py
================================================
import torch
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
from tasks.svs.diffsinger_task import DIFF_DECODERS
class DiffSingerCascadeInfer(BaseSVSInfer):
    """SVS inference that feeds a GaussianDiffusion model's mel output to the vocoder."""

    def build_model(self):
        """Create the diffusion model from ``hparams`` and load weights from ``hparams['work_dir']``."""
        diffusion_kwargs = dict(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )
        diffusion = GaussianDiffusion(**diffusion_kwargs)
        diffusion.eval()
        load_ckpt(diffusion, hparams['work_dir'], 'model')
        return diffusion

    def forward_model(self, inp):
        """Synthesize one preprocessed item and return its waveform as a numpy array."""
        batch = self.input_to_batch(inp)
        tokens = batch['txt_tokens']  # [B, T_t]
        speaker = batch.get('spk_ids')
        with torch.no_grad():
            result = self.model(
                tokens,
                spk_id=speaker,
                ref_mels=None,
                infer=True,
                pitch_midi=batch['pitch_midi'],
                midi_dur=batch['midi_dur'],
                is_slur=batch['is_slur'],
            )
            mel = result['mel_out']  # [B, T,80]
            f0 = result['f0_denorm']
            wav = self.run_vocoder(mel, f0=f0)
        # Move to host memory and drop the leading dim added by run_vocoder.
        return wav.cpu().numpy()[0]
if __name__ == '__main__':
    # Word-level example: Chinese characters plus one '|'-separated note
    # window per word.
    word_inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word',
    }

    # Phoneme-level example (input like the Opencpop dataset); kept for
    # reference only, it is not passed to the run below.
    phoneme_inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme',
    }

    DiffSingerCascadeInfer.example_run(word_inp)
# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
================================================
FILE: NeuralSeq/inference/svs/ds_e2e.py
================================================
import torch
# from inference.tts.fs import FastSpeechInfer
# from modules.tts.fs2_orig import FastSpeech2Orig
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
from tasks.svs.diffsinger_task import DIFF_DECODERS
from modules.fastspeech.pe import PitchExtractor
import utils
class DiffSingerE2EInfer(BaseSVSInfer):
    """SVS inference with a GaussianDiffusion model and an optional pitch extractor."""

    def build_model(self):
        """Create the diffusion model, load its weights, and set up ``self.pe`` if enabled.

        The pitch extractor is only built when ``hparams['pe_enable']`` is
        truthy; its weights come from ``hparams['pe_ckpt']``.
        """
        diffusion_kwargs = dict(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )
        diffusion = GaussianDiffusion(**diffusion_kwargs)
        diffusion.eval()
        load_ckpt(diffusion, hparams['work_dir'], 'model')

        if hparams.get('pe_enable'):
            self.pe = PitchExtractor().to(self.device)
            utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()
        return diffusion

    def forward_model(self, inp):
        """Synthesize one preprocessed item and return its waveform as a numpy array."""
        batch = self.input_to_batch(inp)
        tokens = batch['txt_tokens']  # [B, T_t]
        speaker = batch.get('spk_ids')
        with torch.no_grad():
            result = self.model(
                tokens,
                spk_id=speaker,
                ref_mels=None,
                infer=True,
                pitch_midi=batch['pitch_midi'],
                midi_dur=batch['midi_dur'],
                is_slur=batch['is_slur'],
            )
            mel = result['mel_out']  # [B, T,80]
            if hparams.get('pe_enable'):
                # Predict f0 from the generated mel with the pitch extractor.
                f0 = self.pe(mel)['f0_denorm_pred']
            else:
                f0 = result['f0_denorm']
            wav = self.run_vocoder(mel, f0=f0)
        # Move to host memory and drop the leading dim added by run_vocoder.
        return wav.cpu().numpy()[0]
if __name__ == '__main__':
    # Example 1 (kept for reference, not passed to the model):
    # user input given as Chinese characters with per-word notes.
    word_level_inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }

    # Example 2 (the one actually run): phoneme-level input like the Opencpop dataset.
    phoneme_level_inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
        'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
        'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
        'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
        'input_type': 'phoneme'
    }

    DiffSingerE2EInfer.example_run(phoneme_level_inp)
# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
================================================
FILE: NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt
================================================
| a | a |
| ai | ai |
| an | an |
| ang | ang |
| ao | ao |
| ba | b a |
| bai | b ai |
| ban | b an |
| bang | b ang |
| bao | b ao |
| bei | b ei |
| ben | b en |
| beng | b eng |
| bi | b i |
| bian | b ian |
| biao | b iao |
| bie | b ie |
| bin | b in |
| bing | b ing |
| bo | b o |
| bu | b u |
| ca | c a |
| cai | c ai |
| can | c an |
| cang | c ang |
| cao | c ao |
| ce | c e |
| cei | c ei |
| cen | c en |
| ceng | c eng |
| cha | ch a |
| chai | ch ai |
| chan | ch an |
| chang | ch ang |
| chao | ch ao |
| che | ch e |
| chen | ch en |
| cheng | ch eng |
| chi | ch i |
| chong | ch ong |
| chou | ch ou |
| chu | ch u |
| chua | ch ua |
| chuai | ch uai |
| chuan | ch uan |
| chuang | ch uang |
| chui | ch ui |
| chun | ch un |
| chuo | ch uo |
| ci | c i |
| cong | c ong |
| cou | c ou |
| cu | c u |
| cuan | c uan |
| cui | c ui |
| cun | c un |
| cuo | c uo |
| da | d a |
| dai | d ai |
| dan | d an |
| dang | d ang |
| dao | d ao |
| de | d e |
| dei | d ei |
| den | d en |
| deng | d eng |
| di | d i |
| dia | d ia |
| dian | d ian |
| diao | d iao |
| die | d ie |
| ding | d ing |
| diu | d iu |
| dong | d ong |
| dou | d ou |
| du | d u |
| duan | d uan |
| dui | d ui |
| dun | d un |
| duo | d uo |
| e | e |
| ei | ei |
| en | en |
| eng | eng |
| er | er |
| fa | f a |
| fan | f an |
| fang | f ang |
| fei | f ei |
| fen | f en |
| feng | f eng |
| fo | f o |
| fou | f ou |
| fu | f u |
| ga | g a |
| gai | g ai |
| gan | g an |
| gang | g ang |
| gao | g ao |
| ge | g e |
| gei | g ei |
| gen | g en |
| geng | g eng |
| gong | g ong |
| gou | g ou |
| gu | g u |
| gua | g ua |
| guai | g uai |
| guan | g uan |
| guang | g uang |
| gui | g ui |
| gun | g un |
| guo | g uo |
| ha | h a |
| hai | h ai |
| han | h an |
| hang | h ang |
| hao | h ao |
| he | h e |
| hei | h ei |
| hen | h en |
| heng | h eng |
| hm | h m |
| hng | h ng |
| hong | h ong |
| hou | h ou |
| hu | h u |
| hua | h ua |
| huai | h uai |
| huan | h uan |
| huang | h uang |
| hui | h ui |
| hun | h un |
| huo | h uo |
| ji | j i |
| jia | j ia |
| jian | j ian |
| jiang | j iang |
| jiao | j iao |
| jie | j ie |
| jin | j in |
| jing | j ing |
| jiong | j iong |
| jiu | j iu |
| ju | j v |
| juan | j van |
| jue | j ve |
| jun | j vn |
| ka | k a |
| kai | k ai |
| kan | k an |
| kang | k ang |
| kao | k ao |
| ke | k e |
| kei | k ei |
| ken | k en |
| keng | k eng |
| kong | k ong |
| kou | k ou |
| ku | k u |
| kua | k ua |
| kuai | k uai |
| kuan | k uan |
| kuang | k uang |
| kui | k ui |
| kun | k un |
| kuo | k uo |
| la | l a |
| lai | l ai |
| lan | l an |
| lang | l ang |
| lao | l ao |
| le | l e |
| lei | l ei |
| leng | l eng |
| li | l i |
| lia | l ia |
| lian | l ian |
| liang | l iang |
| liao | l iao |
| lie | l ie |
| lin | l in |
| ling | l ing |
| liu | l iu |
| lo | l o |
| long | l ong |
| lou | l ou |
| lu | l u |
| luan | l uan |
| lun | l un |
| luo | l uo |
| lv | l v |
| lve | l ve |
| m | m |
| ma | m a |
| mai | m ai |
| man | m an |
| mang | m ang |
| mao | m ao |
| me | m e |
| mei | m ei |
| men | m en |
| meng | m eng |
| mi | m i |
| mian | m ian |
| miao | m iao |
| mie | m ie |
| min | m in |
| ming | m ing |
| miu | m iu |
| mo | m o |
| mou | m ou |
| mu | m u |
| n | n |
| na | n a |
| nai | n ai |
| nan | n an |
| nang | n ang |
| nao | n ao |
| ne | n e |
| nei | n ei |
| nen | n en |
| neng | n eng |
| ng | n g |
| ni | n i |
| nian | n ian |
| niang | n iang |
| niao | n iao |
| nie | n ie |
| nin | n in |
| ning | n ing |
| niu | n iu |
| nong | n ong |
| nou | n ou |
| nu | n u |
| nuan | n uan |
| nun | n un |
| nuo | n uo |
| nv | n v |
| nve | n ve |
| o | o |
| ou | ou |
| pa | p a |
| pai | p ai |
| pan | p an |
| pang | p ang |
| pao | p ao |
| pei | p ei |
| pen | p en |
| peng | p eng |
| pi | p i |
| pian | p ian |
| piao | p iao |
| pie | p ie |
| pin | p in |
| ping | p ing |
| po | p o |
| pou | p ou |
| pu | p u |
| qi | q i |
| qia | q ia |
| qian | q ian |
| qiang | q iang |
| qiao | q iao |
| qie | q ie |
| qin | q in |
| qing | q ing |
| qiong | q iong |
| qiu | q iu |
| qu | q v |
| quan | q van |
| que | q ve |
| qun | q vn |
| ran | r an |
| rang | r ang |
| rao | r ao |
| re | r e |
| ren | r en |
| reng | r eng |
| ri | r i |
| rong | r ong |
| rou | r ou |
| ru | r u |
| rua | r ua |
| ruan | r uan |
| rui | r ui |
| run | r un |
| ruo | r uo |
| sa | s a |
| sai | s ai |
| san | s an |
| sang | s ang |
| sao | s ao |
| se | s e |
| sen | s en |
| seng | s eng |
| sha | sh a |
| shai | sh ai |
| shan | sh an |
| shang | sh ang |
| shao | sh ao |
| she | sh e |
| shei | sh ei |
| shen | sh en |
| sheng | sh eng |
| shi | sh i |
| shou | sh ou |
| shu | sh u |
| shua | sh ua |
| shuai | sh uai |
| shuan | sh uan |
| shuang | sh uang |
| shui | sh ui |
| shun | sh un |
| shuo | sh uo |
| si | s i |
| song | s ong |
| sou | s ou |
| su | s u |
| suan | s uan |
| sui | s ui |
| sun | s un |
| suo | s uo |
| ta | t a |
| tai | t ai |
| tan | t an |
| tang | t ang |
| tao | t ao |
| te | t e |
| tei | t ei |
| teng | t eng |
| ti | t i |
| tian | t ian |
| tiao | t iao |
| tie | t ie |
| ting | t ing |
| tong | t ong |
| tou | t ou |
| tu | t u |
| tuan | t uan |
| tui | t ui |
| tun | t un |
| tuo | t uo |
| wa | w a |
| wai | w ai |
| wan | w an |
| wang | w ang |
| wei | w ei |
| wen | w en |
| weng | w eng |
| wo | w o |
| wu | w u |
| xi | x i |
| xia | x ia |
| xian | x ian |
| xiang | x iang |
| xiao | x iao |
| xie | x ie |
| xin | x in |
| xing | x ing |
| xiong | x iong |
| xiu | x iu |
| xu | x v |
| xuan | x van |
| xue | x ve |
| xun | x vn |
| ya | y a |
| yan | y an |
| yang | y ang |
| yao | y ao |
| ye | y e |
| yi | y i |
| yin | y in |
| ying | y ing |
| yo | y o |
| yong | y ong |
| you | y ou |
| yu | y v |
| yuan | y van |
| yue | y ve |
| yun | y vn |
| za | z a |
| zai | z ai |
| zan | z an |
| zang | z ang |
| zao | z ao |
| ze | z e |
| zei | z ei |
| zen | z en |
| zeng | z eng |
| zha | zh a |
| zhai | zh ai |
| zhan | zh an |
| zhang | zh ang |
| zhao | zh ao |
| zhe | zh e |
| zhei | zh ei |
| zhen | zh en |
| zheng | zh eng |
| zhi | zh i |
| zhong | zh ong |
| zhou | zh ou |
| zhu | zh u |
| zhua | zh ua |
| zhuai | zh uai |
| zhuan | zh uan |
| zhuang | zh uang |
| zhui | zh ui |
| zhun | zh un |
| zhuo | zh uo |
| zi | z i |
| zong | z ong |
| zou | z ou |
| zu | z u |
| zuan | z uan |
| zui | z ui |
| zun | z un |
| zuo | z uo |
================================================
FILE: NeuralSeq/inference/svs/opencpop/map.py
================================================
def cpop_pinyin2ph_func(path=None):
    """Load the Opencpop "pinyin to phoneme mapping table" into a dict.

    The table (defined in the README of the opencpop dataset) has one entry per
    line in the form ``| pinyin | ph1 ph2 |``.

    :param path: optional path to the mapping table. When omitted, the
        ``cpop_pinyin2ph.txt`` file next to this module is used; if that file
        does not exist the legacy path relative to the working directory
        (``NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt``) is used.
    :return: dict mapping a pinyin syllable to its space-separated phonemes.
        ``'AP'`` and ``'SP'`` are always present and map to themselves.
    :raises OSError: if the table file cannot be opened.
    """
    import os

    if path is None:
        beside_module = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'cpop_pinyin2ph.txt')
        legacy_path = 'NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt'
        # Prefer the copy shipped beside this module so the function no longer
        # depends on the process being started from the repository root.
        path = beside_module if os.path.exists(beside_module) else legacy_path

    pinyin2phs = {'AP': 'AP', 'SP': 'SP'}
    with open(path, encoding='utf-8') as rf:
        for line in rf:
            elements = [x.strip() for x in line.split('|') if x.strip() != '']
            if len(elements) < 2:
                # Blank or incomplete rows carry no mapping; skip them instead
                # of failing with IndexError.
                continue
            pinyin2phs[elements[0]] = elements[1]
    return pinyin2phs
================================================
FILE: NeuralSeq/inference/tts/GenerSpeech.py
================================================
import torch
import os
import importlib
from inference.tts.base_tts_infer import BaseTTSInfer
from utils.ckpt_utils import load_ckpt, get_last_checkpoint
from modules.GenerSpeech.model.generspeech import GenerSpeech
from data_gen.tts.emotion import inference as EmotionEncoder
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
from data_gen.tts.emotion.inference import preprocess_wav
from data_gen.tts.data_gen_utils import is_sil_phoneme
from resemblyzer import VoiceEncoder
from utils import audio
class GenerSpeechInfer(BaseTTSInfer):
    """GenerSpeech inference: synthesize ``text`` using features taken from a reference audio."""

    def build_model(self):
        """Create the GenerSpeech model in eval mode and load weights from ``hparams['work_dir']``."""
        model = GenerSpeech(self.ph_encoder)
        model.eval()
        load_ckpt(model, self.hparams['work_dir'], 'model')
        return model

    def preprocess_input(self, inp):
        """
        Turn the target text and the reference recording into a model-ready item.

        The reference audio is transcribed with ``self.asr``, written to
        ``example/`` together with a ``.lab`` file, aligned with the ``mfa``
        command line tool and finally binarized with the configured binarizer.

        :param inp: {'text': str, 'ref_audio': str (path), 'item_name': (str, optional)}
        :return: the item produced by the binarizer, extended with 'emo_embed',
                 'spk_embed', 'ref_ph', 'ph', 'ph_token' and 'text'
        """
        # processed text
        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
        text_raw = inp['text']
        item_name = inp.get('item_name', '<ITEM_NAME>')
        ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph(preprocessor.txt_processor, text_raw, preprocess_args)
        ph_token = self.ph_encoder.encode(ph)

        # processed ref audio
        ref_audio = inp['ref_audio']
        processed_ref_audio = 'example/temp.wav'
        # Put the speaker encoder on the same device as the rest of the
        # pipeline instead of forcing CUDA, so CPU-only hosts work as well.
        voice_encoder = VoiceEncoder(device=self.device)
        encoder = [self.ph_encoder, self.word_encoder]
        EmotionEncoder.load_model(self.hparams['emotion_encoder_path'])

        # Resolve the binarizer class from its dotted path. The fallback points
        # at data_gen/tts/base_binarizer.py (the previous default was misspelled
        # and could not be imported).
        binarizer_cls = self.hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
        pkg = ".".join(binarizer_cls.split(".")[:-1])
        cls_name = binarizer_cls.split(".")[-1]
        binarizer_cls = getattr(importlib.import_module(pkg), cls_name)

        ref_audio_raw, ref_text_raw = self.asr(ref_audio)  # prepare text
        ph_ref, txt_ref, word_ref, ph2word_ref, ph_gb_word_ref = preprocessor.txt_to_ph(preprocessor.txt_processor, ref_text_raw, preprocess_args)
        # Drop silence phonemes/words before alignment, then wrap in explicit SIL markers.
        ph_gb_word_nosil = ["_".join([p for p in w.split("_") if not is_sil_phoneme(p)]) for w in ph_gb_word_ref.split(" ") if not is_sil_phoneme(w)]
        phs_for_align = ['SIL'] + ph_gb_word_nosil + ['SIL']
        phs_for_align = " ".join(phs_for_align)

        # prepare files for alignment
        os.system('rm -r example/; mkdir example/')
        audio.save_wav(ref_audio_raw, processed_ref_audio, self.hparams['audio_sample_rate'])
        with open('example/temp.lab', 'w') as f_txt:
            f_txt.write(phs_for_align)
        os.system(f'mfa align example/ {self.hparams["binary_data_dir"]}/mfa_dict.txt {self.hparams["binary_data_dir"]}/mfa_model.zip example/textgrid/ --clean')
        item2tgfn = 'example/textgrid/temp.TextGrid'  # prepare textgrid alignment

        item = binarizer_cls.process_item(item_name, ph_ref, txt_ref, item2tgfn, processed_ref_audio, 0, 0, encoder, self.hparams['binarization_args'])
        item['emo_embed'] = Embed_utterance(preprocess_wav(item['wav_fn']))
        item['spk_embed'] = voice_encoder.embed_utterance(item['wav'])

        # Keep the reference phonemes under 'ref_ph' and overwrite the text
        # fields with those of the sentence to synthesize.
        item.update({
            'ref_ph': item['ph'],
            'ph': ph,
            'ph_token': ph_token,
            'text': txt
        })
        return item

    def input_to_batch(self, item):
        """Wrap a single preprocessed item into a batch of size one on ``self.device``.

        :param item: dict as returned by ``preprocess_input``
        :return: dict of lists / tensors; every tensor gets a leading batch axis via ``[None, :]``
        """
        item_names = [item['item_name']]
        text = [item['text']]
        ph = [item['ph']]
        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
        mels = torch.FloatTensor(item['mel'])[None, :].to(self.device)
        f0 = torch.FloatTensor(item['f0'])[None, :].to(self.device)
        # uv = torch.FloatTensor(item['uv']).to(self.device)
        mel2ph = torch.LongTensor(item['mel2ph'])[None, :].to(self.device)
        spk_embed = torch.FloatTensor(item['spk_embed'])[None, :].to(self.device)
        emo_embed = torch.FloatTensor(item['emo_embed'])[None, :].to(self.device)
        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
        mel2word = torch.LongTensor(item['mel2word'])[None, :].to(self.device)
        word_tokens = torch.LongTensor(item['word_tokens'])[None, :].to(self.device)

        batch = {
            'item_name': item_names,
            'text': text,
            'ph': ph,
            'mels': mels,
            'f0': f0,
            'txt_tokens': txt_tokens,
            'txt_lengths': txt_lengths,
            'spk_embed': spk_embed,
            'emo_embed': emo_embed,
            'mel2ph': mel2ph,
            'ph2word': ph2word,
            'mel2word': mel2word,
            'word_tokens': word_tokens,
        }
        return batch

    def forward_model(self, inp):
        """Run the model and the vocoder on one item and return the waveform as a numpy array.

        :param inp: dict as returned by ``preprocess_input``
        :return: squeezed vocoder output moved to CPU
        """
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        with torch.no_grad():
            output = self.model(txt_tokens, ref_mel2ph=sample['mel2ph'], ref_mel2word=sample['mel2word'], ref_mels=sample['mels'],
                                spk_embed=sample['spk_embed'], emo_embed=sample['emo_embed'], global_steps=300000, infer=True)
            mel_out = output['mel_out']
            wav_out = self.run_vocoder(mel_out)
        wav_out = wav_out.squeeze().cpu().numpy()
        return wav_out
if __name__ == '__main__':
    # Demo request: the sentence to speak and the reference recording to use.
    demo_request = dict(
        text='here we go',
        ref_audio='assets/0011_001570.wav',
    )
    GenerSpeechInfer.example_run(demo_request)
================================================
FILE: NeuralSeq/inference/tts/PortaSpeech.py
================================================
import torch
from inference.tts.base_tts_infer import BaseTTSInfer
from utils.ckpt_utils import load_ckpt
from modules.portaspeech.portaspeech import PortaSpeech
class TTSInference(BaseTTSInfer):
    """PortaSpeech text-to-speech inference."""

    def __init__(self, hparams, device=None):
        """Build the pipeline via ``BaseTTSInfer`` and load the speaker map from the data dir.

        :param hparams: hyper-parameter dict forwarded to ``BaseTTSInfer``
        :param device: target device; ``None`` lets the base class choose
        """
        super().__init__(hparams, device)
        # Report the device that was actually selected (``device`` may be None).
        print("Initializing TTS model to %s" % self.device)
        self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
        print("TTS loaded!")

    def build_model(self):
        """Create PortaSpeech, load weights from ``hparams['work_dir']`` and call ``store_inverse_all``."""
        model = PortaSpeech(self.ph_encoder, self.word_encoder)
        load_ckpt(model, self.hparams['work_dir'], 'model')
        with torch.no_grad():
            model.store_inverse_all()
        return model

    def forward_model(self, inp):
        """Run the model and the vocoder on one item.

        :param inp: dict as returned by ``preprocess_input``
        :return: first waveform of the batch as a numpy array
        """
        sample = self.input_to_batch(inp)
        with torch.no_grad():
            output = self.model(
                sample['txt_tokens'],
                sample['word_tokens'],
                ph2word=sample['ph2word'],
                word_len=sample['word_lengths'].max(),
                infer=True,
                forward_post_glow=True,
                spk_id=sample.get('spk_ids')
            )
            mel_out = output['mel_out']
            wav_out = self.run_vocoder(mel_out)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]

    def preprocess_input(self, inp):
        """
        Convert raw text into phoneme / word tokens.

        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
        :return: dict with 'item_name', 'text', 'ph', 'spk_id', 'ph_token',
                 'word_token', 'ph2word', 'ph_words', 'words' and 'ph_len'
        :raises KeyError: if ``spk_name`` is not in the loaded speaker map
        """
        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
        text_raw = inp['text']
        item_name = inp.get('item_name', '<ITEM_NAME>')
        spk_name = inp.get('spk_name', '<SINGLE_SPK>')
        ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph(
            preprocessor.txt_processor, text_raw, preprocess_args)
        word_token = self.word_encoder.encode(word)
        ph_token = self.ph_encoder.encode(ph)
        spk_id = self.spk_map[spk_name]
        item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id,
                'ph_token': ph_token, 'word_token': word_token, 'ph2word': ph2word,
                'ph_words': ph_gb_word, 'words': word}
        item['ph_len'] = len(item['ph_token'])
        return item

    def input_to_batch(self, item):
        """Wrap a single preprocessed item into a batch of size one on ``self.device``.

        :param item: dict as returned by ``preprocess_input``
        :return: dict of lists / tensors for ``forward_model``
        """
        item_names = [item['item_name']]
        text = [item['text']]
        ph = [item['ph']]
        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
        word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
        # The word length must come from the word tokens, not the phoneme tokens.
        word_lengths = torch.LongTensor([word_tokens.shape[1]]).to(self.device)
        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
        # Wrap the id in a list: torch.LongTensor(<int>) would allocate an
        # uninitialised tensor of that *size* instead of holding the id.
        spk_ids = torch.LongTensor([item['spk_id']]).to(self.device)

        batch = {
            'item_name': item_names,
            'text': text,
            'ph': ph,
            'txt_tokens': txt_tokens,
            'txt_lengths': txt_lengths,
            'word_tokens': word_tokens,
            'word_lengths': word_lengths,
            'ph2word': ph2word,
            'spk_ids': spk_ids,
        }
        return batch

    def postprocess_output(self, output):
        """Return the model output unchanged."""
        return output
================================================
FILE: NeuralSeq/inference/tts/base_tts_infer.py
================================================
from tasks.tts.dataset_utils import FastSpeechWordDataset
from tasks.tts.tts_utils import load_data_preprocessor
from vocoders.hifigan import HifiGanGenerator
import os
import librosa
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from string import punctuation
import torch
from utils.ckpt_utils import load_ckpt
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
class BaseTTSInfer:
    """Base class for TTS inference: owns the acoustic model, the HiFi-GAN vocoder and a wav2vec2 ASR model.

    Subclasses implement ``build_model``, ``forward_model``,
    ``preprocess_input`` and ``input_to_batch``.
    """

    def __init__(self, hparams, device=None):
        """Load preprocessor, dictionaries, model, vocoder and ASR model.

        :param hparams: hyper-parameter dict; ``binary_data_dir`` and
            ``vocoder_ckpt`` are read here
        :param device: target device; defaults to 'cuda' when available, else 'cpu'
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.hparams = hparams
        self.device = device
        self.data_dir = hparams['binary_data_dir']
        self.preprocessor, self.preprocess_args = load_data_preprocessor()
        self.ph_encoder, self.word_encoder = self.preprocessor.load_dict(self.data_dir)
        self.ds_cls = FastSpeechWordDataset
        self.model = self.build_model()
        self.model.eval()
        self.model.to(self.device)
        self.vocoder = self.build_vocoder()
        self.vocoder.eval()
        self.vocoder.to(self.device)
        self.asr_processor, self.asr_model = self.build_asr()

    def build_model(self):
        """Return the acoustic model; must be implemented by subclasses."""
        raise NotImplementedError

    def forward_model(self, inp):
        """Run the model on a preprocessed item; must be implemented by subclasses."""
        raise NotImplementedError

    def build_asr(self):
        """Load the pretrained wav2vec2 processor and CTC model (model placed on ``self.device``).

        :return: ``(processor, model)``
        """
        # load pretrained model
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")  # facebook/wav2vec2-base-960h wav2vec2-large-960h-lv60-self
        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(self.device)
        return processor, model

    def build_vocoder(self):
        """Build a ``HifiGanGenerator`` from ``<vocoder_ckpt>/config.yaml`` and load its 'model_gen' weights."""
        base_dir = self.hparams['vocoder_ckpt']
        config_path = f'{base_dir}/config.yaml'
        config = set_hparams(config_path, global_hparams=False)
        vocoder = HifiGanGenerator(config)
        load_ckpt(vocoder, base_dir, 'model_gen')
        return vocoder

    def run_vocoder(self, c):
        """Swap axes 1 and 2 of ``c``, run the vocoder and return index 0 along axis 1 of its output."""
        c = c.transpose(2, 1)
        y = self.vocoder(c)[:, 0]
        return y

    def preprocess_input(self, inp):
        """Convert a raw request into a model item; must be implemented by subclasses."""
        raise NotImplementedError

    def input_to_batch(self, item):
        """Convert an item into a batch; must be implemented by subclasses."""
        raise NotImplementedError

    def postprocess_output(self, output):
        """Hook applied to the model output; the default returns it unchanged."""
        return output

    def infer_once(self, inp):
        """Run preprocess -> forward -> postprocess for a single request and return the result."""
        inp = self.preprocess_input(inp)
        output = self.forward_model(inp)
        output = self.postprocess_output(output)
        return output

    @classmethod
    def example_run(cls, inp):
        """Instantiate ``cls`` with the global hparams, synthesize ``inp`` and save it under ``infer_out/``.

        The output file name is taken from ``hp["text"]`` and the sample rate
        from ``hp['audio_sample_rate']``.
        """
        from utils.audio import save_wav

        # NOTE: the global `hp` is used as-is; the call below is disabled in the original.
        #set_hparams(print_hparams=False)
        infer_ins = cls(hp)
        out = infer_ins.infer_once(inp)
        os.makedirs('infer_out', exist_ok=True)
        save_wav(out, f'infer_out/{hp["text"]}.wav', hp['audio_sample_rate'])
        print(f'Save at infer_out/{hp["text"]}.wav.')

    def asr(self, file):
        """Transcribe an audio file with the wav2vec2 model.

        :param file: path (or file object) readable by ``soundfile.read``
        :return: ``(audio, transcription)`` where ``audio`` is the waveform
            (resampled to ``hparams['audio_sample_rate']`` when the rates
            differ) and ``transcription`` has trailing punctuation stripped
        """
        sample_rate = self.hparams['audio_sample_rate']
        audio_input, source_sample_rate = sf.read(file)

        # Resample the wav if needed
        if sample_rate is not None and source_sample_rate != sample_rate:
            # Keyword arguments: newer librosa releases no longer accept the
            # sample rates positionally.
            audio_input = librosa.resample(audio_input, orig_sr=source_sample_rate, target_sr=sample_rate)

        # pad input values and return pt tensor
        input_values = self.asr_processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

        # retrieve logits & take argmax; inputs go to the device the ASR model
        # was placed on, and no gradients are needed for transcription
        with torch.no_grad():
            logits = self.asr_model(input_values.to(self.device)).logits
        predicted_ids = torch.argmax(logits, dim=-1)

        # transcribe
        transcription = self.asr_processor.decode(predicted_ids[0])
        transcription = transcription.rstrip(punctuation)
        return audio_input, transcription
================================================
FILE: NeuralSeq/modules/GenerSpeech/config/generspeech.yaml
================================================
base_config:
- egs/egs_bases/tts/fs2.yaml
- egs/datasets/audio/emotion/base_text2mel.yaml
task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
# emotion encoder
emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path
# vocoder
vocoder: hifigan
vocoder_ckpt: checkpoints/trainset_hifigan
# dataset
raw_data_dir: 'data/raw/training_set'
processed_data_dir: 'data/processed/training_set'
binary_data_dir: 'data/binary/training_set'
test_input_dir: ''
# process
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
audio_sample_rate: 16000
hop_size: 256 # 16 ms at the 16000 Hz sample rate set above. (For 22050Hz, 275 ~= 12.5 ms, i.e. 0.0125 * sample_rate)
win_size: 1024 # 64 ms at 16000 Hz. (For 22050Hz, 1100 ~= 50 ms, i.e. 0.05 * sample_rate; if None, win_size: fft_size)
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
fmax: 7600 # To be increased/reduced depending on data.
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
min_level_db: -100
ref_level_db: 20
binarization_args:
reset_phone_dict: true
reset_word_dict: true
shuffle: true
trim_eos_bos: false
trim_sil: false
with_align: true
with_f0: true
with_f0cwt: false
with_linear: false
with_spk_embed: true
with_spk_id: true
with_txt: true
with_wav: true
with_word: true
preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
preprocess_args:
nsample_per_mfa_group: 1000
# text process
txt_processor: en
use_mfa: true
with_phsep: true
reset_phone_dict: true
reset_word_dict: true
add_eos_bos: true
# mfa
mfa_group_shuffle: false
mfa_offset: 0.02
# wav processors
wav_processors: []
save_sil_mask: true
vad_max_silence_length: 12
# data
word_dict_size: 10000
num_spk: 500
use_spk_embed: true
use_spk_id: false
use_word: true
use_emotion: true
use_gt_dur: false
ref_audio: ''
text: ''
# training
num_sanity_val_steps: -1
max_updates: 300000
max_sentences: 100000
num_test_samples: 72
## glow
post_glow_hidden: 128
post_glow_kernel_size: 3
post_glow_n_blocks: 8
post_glow_n_block_layers: 3
share_wn_layers: 4
sigmoid_scale: false
post_share_cond_layers: false
use_txt_cond: true
use_latent_cond: true
noise_scale: 0.8
# prosody extractor
lambda_commit: 0.25
vq_start: 20500
vae_dropout: 0.0
nVQ: 128
forcing: 20000
crop: false
predictor_grad: 1.0
================================================
FILE: NeuralSeq/modules/GenerSpeech/model/generspeech.py
================================================
import torch
from modules.GenerSpeech.model.glow_modules import Glow
from modules.fastspeech.tts_modules import PitchPredictor
import random
from modules.GenerSpeech.model.prosody_util import ProsodyAligner, LocalStyleAdaptor
from utils.pitch_utils import f0_to_coarse, denorm_f0
from modules.commons.common_layers import *
import torch.distributions as dist
from utils.hparams import hparams
from modules.GenerSpeech.model.mixstyle import MixStyle
from modules.fastspeech.fs2 import FastSpeech2
import json
from modules.fastspeech.tts_modules import DEFAULT_MAX_SOURCE_POSITIONS, DEFAULT_MAX_TARGET_POSITIONS
class GenerSpeech(FastSpeech2):
'''
GenerSpeech: Towards Style Transfer for Generalizable Out-Of-Domain Text-to-Speech
https://arxiv.org/abs/2205.07211
'''
def __init__(self, dictionary, out_dims=None):
super().__init__(dictionary, out_dims)
# Mixstyle
self.norm = MixStyle(p=0.5, alpha=0.1, eps=1e-6, hidden_size=self.hidden_size)
# emotion embedding
self.emo_embed_proj = Linear(256, self.hidden_size, bias=True)
# build prosody extractor
## frame level
self.prosody_extractor_utter = LocalStyleAdaptor(self.hidden_size, hparams['nVQ'], self.padding_idx)
self.l1_utter = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.align_utter = ProsodyAligner(num_layers=2)
## phoneme level
self.prosody_extractor_ph = LocalStyleAdaptor(self.hidden_size, hparams['nVQ'], self.padding_idx)
self.l1_ph = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.align_ph = ProsodyAligner(num_layers=2)
## word level
self.prosody_extractor_word = LocalStyleAdaptor(self.hidden_size, hparams['nVQ'], self.padding_idx)
self.l1_word = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.align_word = ProsodyAligner(num_layers=2)
self.pitch_inpainter_predictor = PitchPredictor(
self.hidden_size, n_chans=self.hidden_size,
n_layers=3, dropout_rate=0.1, odim=2,
padding=hparams['ffn_padding'], kernel_size=hparams['predictor_kernel'])
# build attention layer
self.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
self.embed_positions = SinusoidalPositionalEmbedding(
self.hidden_size, self.padding_idx,
init_size=self.max_source_positions + self.padding_idx + 1,
)
# build post flow
cond_hs = 80
if hparams.get('use_txt_cond', True):
cond_hs = cond_hs + hparams['hidden_size']
cond_hs = cond_hs + hparams['hidden_size'] * 3 # for emo, spk embedding and prosody embedding
self.post_flow = Glow(
80, hparams['post_glow_hidden'], hparams['post_glow_kernel_size'], 1,
hparams['post_glow_n_blocks'], hparams['post_glow_n_block_layers'],
n_split=4, n_sqz=2,
gin_channels=cond_hs,
share_cond_layers=hparams['post_share_cond_layers'],
share_wn_layers=hparams['share_wn_layers'],
sigmoid_scale=hparams['sigmoid_scale']
)
self.prior_dist = dist.Normal(0, 1)
def forward(self, txt_tokens, mel2ph=None, ref_mel2ph=None, ref_mel2word=None, spk_embed=None, emo_embed=None, ref_mels=None,
f0=None, uv=None, skip_decoder=False, global_steps=0, infer=False, **kwargs):
ret = {}
encoder_out = self.encoder(txt_tokens) # [B, T, C]
src_nonpadding = (txt_tokens > 0).float()[:, :, None]
# add spk/emo embed
spk_embed = self.spk_embed_proj(spk_embed)[:, None, :]
emo_embed = self.emo_embed_proj(emo_embed)[:, None, :]
# add dur
dur_inp = (encoder_out + spk_embed + emo_embed) * src_nonpadding
mel2ph = self.add_dur(dur_inp, mel2ph, txt_tokens, ret)
tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
gitextract_mez1vc95/
├── .gitignore
├── LICENSE
├── NeuralSeq/
│ ├── LICENSE
│ ├── README.md
│ ├── configs/
│ │ ├── config_base.yaml
│ │ ├── singing/
│ │ │ ├── base.yaml
│ │ │ └── fs2.yaml
│ │ └── tts/
│ │ ├── base.yaml
│ │ ├── base_zh.yaml
│ │ ├── emotion/
│ │ │ ├── base_text2mel.yaml
│ │ │ └── pre_align.py
│ │ ├── fs2.yaml
│ │ ├── hifigan.yaml
│ │ ├── libritts/
│ │ │ ├── base_text2mel.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── pre_align.py
│ │ │ └── pwg.yaml
│ │ ├── lj/
│ │ │ ├── base_mel2wav.yaml
│ │ │ ├── base_text2mel.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── hifigan.yaml
│ │ │ └── pwg.yaml
│ │ └── pwg.yaml
│ ├── data_gen/
│ │ └── tts/
│ │ ├── base_binarizer.py
│ │ ├── base_binarizer_emotion.py
│ │ ├── base_preprocess.py
│ │ ├── binarizer_zh.py
│ │ ├── data_gen_utils.py
│ │ ├── emotion/
│ │ │ ├── audio.py
│ │ │ ├── inference.py
│ │ │ ├── model.py
│ │ │ ├── params_data.py
│ │ │ ├── params_model.py
│ │ │ └── test_emotion.py
│ │ ├── txt_processors/
│ │ │ ├── __init__.py
│ │ │ ├── base_text_processor.py
│ │ │ ├── en.py
│ │ │ ├── zh.py
│ │ │ └── zh_g2pM.py
│ │ └── wav_processors/
│ │ ├── __init__.py
│ │ ├── base_processor.py
│ │ └── common_processors.py
│ ├── egs/
│ │ ├── datasets/
│ │ │ └── audio/
│ │ │ ├── emotion/
│ │ │ │ ├── base_text2mel.yaml
│ │ │ │ └── pre_align.py
│ │ │ ├── libritts/
│ │ │ │ ├── base_text2mel.yaml
│ │ │ │ ├── fs2.yaml
│ │ │ │ ├── pre_align.py
│ │ │ │ └── pwg.yaml
│ │ │ ├── lj/
│ │ │ │ ├── base_mel2wav.yaml
│ │ │ │ ├── preprocess.py
│ │ │ │ └── pwg.yaml
│ │ │ └── vctk/
│ │ │ ├── base_mel2wav.yaml
│ │ │ ├── fs2.yaml
│ │ │ ├── pre_align.py
│ │ │ └── pwg.yaml
│ │ └── egs_bases/
│ │ ├── config_base.yaml
│ │ ├── svs/
│ │ │ ├── base.yaml
│ │ │ ├── lj_ds_beta6.yaml
│ │ │ ├── midi/
│ │ │ │ ├── cascade/
│ │ │ │ │ └── opencs/
│ │ │ │ │ ├── aux_rel.yaml
│ │ │ │ │ ├── ds60_rel.yaml
│ │ │ │ │ └── opencpop_statis.yaml
│ │ │ │ ├── e2e/
│ │ │ │ │ ├── opencpop/
│ │ │ │ │ │ ├── ds1000-10dil.yaml
│ │ │ │ │ │ ├── ds1000.yaml
│ │ │ │ │ │ └── ds100_adj_rel.yaml
│ │ │ │ │ └── popcs/
│ │ │ │ │ └── ds100_adj_rel.yaml
│ │ │ │ └── pe.yaml
│ │ │ ├── popcs_ds_beta6.yaml
│ │ │ ├── popcs_ds_beta6_offline.yaml
│ │ │ └── popcs_fs2.yaml
│ │ └── tts/
│ │ ├── base.yaml
│ │ ├── base_zh.yaml
│ │ ├── fs2.yaml
│ │ ├── fs2_adv.yaml
│ │ ├── ps.yaml
│ │ ├── ps_flow.yaml
│ │ ├── ps_flow_small.yaml
│ │ └── vocoder/
│ │ ├── base.yaml
│ │ ├── hifigan.yaml
│ │ └── pwg.yaml
│ ├── gitattributes
│ ├── inference/
│ │ ├── svs/
│ │ │ ├── base_svs_infer.py
│ │ │ ├── ds_cascade.py
│ │ │ ├── ds_e2e.py
│ │ │ └── opencpop/
│ │ │ ├── cpop_pinyin2ph.txt
│ │ │ └── map.py
│ │ └── tts/
│ │ ├── GenerSpeech.py
│ │ ├── PortaSpeech.py
│ │ └── base_tts_infer.py
│ ├── modules/
│ │ ├── GenerSpeech/
│ │ │ ├── config/
│ │ │ │ └── generspeech.yaml
│ │ │ ├── model/
│ │ │ │ ├── generspeech.py
│ │ │ │ ├── glow_modules.py
│ │ │ │ ├── mixstyle.py
│ │ │ │ ├── prosody_util.py
│ │ │ │ └── wavenet.py
│ │ │ └── task/
│ │ │ ├── dataset.py
│ │ │ └── generspeech.py
│ │ ├── __init__.py
│ │ ├── commons/
│ │ │ ├── align_ops.py
│ │ │ ├── common_layers.py
│ │ │ ├── conv.py
│ │ │ ├── espnet_positional_embedding.py
│ │ │ ├── normalizing_flow/
│ │ │ │ ├── glow_modules.py
│ │ │ │ ├── res_flow.py
│ │ │ │ └── utils.py
│ │ │ ├── rel_transformer.py
│ │ │ ├── ssim.py
│ │ │ ├── transformer.py
│ │ │ └── wavenet.py
│ │ ├── diff/
│ │ │ ├── candidate_decoder.py
│ │ │ ├── diffusion.py
│ │ │ ├── net.py
│ │ │ └── shallow_diffusion_tts.py
│ │ ├── diffsinger_midi/
│ │ │ └── fs2.py
│ │ ├── fastspeech/
│ │ │ ├── fs2.py
│ │ │ ├── pe.py
│ │ │ └── tts_modules.py
│ │ ├── hifigan/
│ │ │ ├── hifigan.py
│ │ │ └── mel_utils.py
│ │ ├── parallel_wavegan/
│ │ │ ├── __init__.py
│ │ │ ├── layers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── causal_conv.py
│ │ │ │ ├── pqmf.py
│ │ │ │ ├── residual_block.py
│ │ │ │ ├── residual_stack.py
│ │ │ │ ├── tf_layers.py
│ │ │ │ └── upsample.py
│ │ │ ├── losses/
│ │ │ │ ├── __init__.py
│ │ │ │ └── stft_loss.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── melgan.py
│ │ │ │ ├── parallel_wavegan.py
│ │ │ │ └── source.py
│ │ │ ├── optimizers/
│ │ │ │ ├── __init__.py
│ │ │ │ └── radam.py
│ │ │ ├── stft_loss.py
│ │ │ └── utils/
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ └── syntaspeech/
│ │ ├── multi_window_disc.py
│ │ ├── syntactic_graph_buider.py
│ │ ├── syntactic_graph_encoder.py
│ │ └── syntaspeech.py
│ ├── tasks/
│ │ ├── base_task.py
│ │ ├── run.py
│ │ ├── svs/
│ │ │ ├── __init__.py
│ │ │ ├── diffsinger_task.py
│ │ │ ├── diffspeech_task.py
│ │ │ └── task.py
│ │ ├── tts/
│ │ │ ├── dataset_utils.py
│ │ │ ├── fs2.py
│ │ │ ├── fs2_adv.py
│ │ │ ├── fs2_utils.py
│ │ │ ├── pe.py
│ │ │ ├── ps.py
│ │ │ ├── ps_adv.py
│ │ │ ├── ps_flow.py
│ │ │ ├── synta.py
│ │ │ ├── tts.py
│ │ │ ├── tts_base.py
│ │ │ └── tts_utils.py
│ │ └── vocoder/
│ │ ├── dataset_utils.py
│ │ └── vocoder_base.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── ckpt_utils.py
│ │ ├── cwt.py
│ │ ├── dtw.py
│ │ ├── hparams.py
│ │ ├── indexed_datasets.py
│ │ ├── multiprocess_utils.py
│ │ ├── os_utils.py
│ │ ├── pitch_utils.py
│ │ ├── pl_utils.py
│ │ ├── plot.py
│ │ ├── text_encoder.py
│ │ ├── text_norm.py
│ │ ├── training_utils.py
│ │ └── tts_utils.py
│ └── vocoders/
│ ├── __init__.py
│ ├── base_vocoder.py
│ ├── hifigan.py
│ ├── pwg.py
│ └── vocoder_utils.py
├── README.md
├── assets/
│ └── README.md
├── audio-chatgpt.py
├── audio_detection/
│ ├── __init__.py
│ ├── audio_infer/
│ │ ├── __init__.py
│ │ ├── metadata/
│ │ │ ├── black_list/
│ │ │ │ ├── groundtruth_weak_label_evaluation_set.csv
│ │ │ │ └── groundtruth_weak_label_testing_set.csv
│ │ │ └── class_labels_indices.csv
│ │ ├── pytorch/
│ │ │ ├── evaluate.py
│ │ │ ├── finetune_template.py
│ │ │ ├── inference.py
│ │ │ ├── losses.py
│ │ │ ├── main.py
│ │ │ ├── models.py
│ │ │ └── pytorch_utils.py
│ │ └── utils/
│ │ ├── config.py
│ │ ├── crash.py
│ │ ├── create_black_list.py
│ │ ├── create_indexes.py
│ │ ├── data_generator.py
│ │ ├── dataset.py
│ │ ├── plot_for_paper.py
│ │ ├── plot_statistics.py
│ │ └── utilities.py
│ └── target_sound_detection/
│ └── src/
│ ├── models.py
│ └── utils.py
├── audio_to_text/
│ ├── __init__.py
│ ├── captioning/
│ │ ├── __init__.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── base_model.py
│ │ │ ├── decoder.py
│ │ │ ├── encoder.py
│ │ │ ├── transformer_model.py
│ │ │ └── utils.py
│ │ └── utils/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── bert/
│ │ │ ├── create_sent_embedding.py
│ │ │ └── create_word_embedding.py
│ │ ├── build_vocab.py
│ │ ├── build_vocab_ltp.py
│ │ ├── build_vocab_spacy.py
│ │ ├── eval_round_robin.py
│ │ ├── fasttext/
│ │ │ └── create_word_embedding.py
│ │ ├── lr_scheduler.py
│ │ ├── model_eval_diff.py
│ │ ├── predict_nn.py
│ │ ├── remove_optimizer.py
│ │ ├── report_results.py
│ │ ├── tokenize_caption.py
│ │ ├── train_util.py
│ │ └── word2vec/
│ │ └── create_word_embedding.py
│ └── inference_waveform.py
├── download.sh
├── mono2binaural/
│ └── src/
│ ├── models.py
│ ├── utils.py
│ └── warping.py
├── requirements.txt
├── run.md
├── sound_extraction/
│ ├── model/
│ │ ├── LASSNet.py
│ │ ├── film.py
│ │ ├── modules.py
│ │ ├── resunet_film.py
│ │ └── text_encoder.py
│ └── utils/
│ ├── create_mixtures.py
│ ├── stft.py
│ └── wav_io.py
└── text_to_audio/
└── Make_An_Audio/
├── configs/
│ ├── img_to_audio/
│ │ └── img2audio_args.yaml
│ ├── inpaint/
│ │ └── txt2audio_args.yaml
│ └── text_to_audio/
│ ├── clap_args.yaml
│ ├── hifigan_args.yaml
│ └── txt2audio_args.yaml
├── ldm/
│ ├── data/
│ │ └── extract_mel_spectrogram.py
│ ├── lr_scheduler.py
│ ├── models/
│ │ ├── autoencoder.py
│ │ ├── autoencoder_multi.py
│ │ └── diffusion/
│ │ ├── __init__.py
│ │ ├── classifier.py
│ │ ├── ddim.py
│ │ ├── ddpm.py
│ │ ├── ddpm_audio.py
│ │ ├── ddpm_audio_inpaint.py
│ │ └── plms.py
│ ├── modules/
│ │ ├── attention.py
│ │ ├── diffusionmodules/
│ │ │ ├── __init__.py
│ │ │ ├── custom_openaimodel.py
│ │ │ ├── model.py
│ │ │ ├── openaimodel.py
│ │ │ └── util.py
│ │ ├── discriminator/
│ │ │ ├── model.py
│ │ │ └── multi_window_disc.py
│ │ ├── distributions/
│ │ │ ├── __init__.py
│ │ │ └── distributions.py
│ │ ├── ema.py
│ │ ├── encoders/
│ │ │ ├── CLAP/
│ │ │ │ ├── CLAPWrapper.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio.py
│ │ │ │ ├── clap.py
│ │ │ │ ├── config.yml
│ │ │ │ └── utils.py
│ │ │ ├── __init__.py
│ │ │ ├── modules.py
│ │ │ └── open_clap/
│ │ │ ├── __init__.py
│ │ │ ├── bert.py
│ │ │ ├── factory.py
│ │ │ ├── feature_fusion.py
│ │ │ ├── htsat.py
│ │ │ ├── linear_probe.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── model_configs/
│ │ │ │ ├── HTSAT-base.json
│ │ │ │ ├── HTSAT-large.json
│ │ │ │ ├── HTSAT-tiny-win-1536.json
│ │ │ │ ├── HTSAT-tiny.json
│ │ │ │ ├── PANN-10.json
│ │ │ │ ├── PANN-14-fmax-18k.json
│ │ │ │ ├── PANN-14-fmax-8k-20s.json
│ │ │ │ ├── PANN-14-tiny-transformer.json
│ │ │ │ ├── PANN-14-win-1536.json
│ │ │ │ ├── PANN-14.json
│ │ │ │ ├── PANN-6.json
│ │ │ │ ├── RN101-quickgelu.json
│ │ │ │ ├── RN101.json
│ │ │ │ ├── RN50-quickgelu.json
│ │ │ │ ├── RN50.json
│ │ │ │ ├── RN50x16.json
│ │ │ │ ├── RN50x4.json
│ │ │ │ ├── ViT-B-16.json
│ │ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ │ ├── ViT-B-32.json
│ │ │ │ └── ViT-L-14.json
│ │ │ ├── openai.py
│ │ │ ├── pann_model.py
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ ├── utils.py
│ │ │ └── version.py
│ │ ├── image_degradation/
│ │ │ ├── __init__.py
│ │ │ ├── bsrgan.py
│ │ │ ├── bsrgan_light.py
│ │ │ └── utils_image.py
│ │ ├── losses_audio/
│ │ │ ├── __init__.py
│ │ │ ├── contperceptual.py
│ │ │ ├── contperceptual_dis.py
│ │ │ ├── lpaps.py
│ │ │ ├── vggishish/
│ │ │ │ ├── config/
│ │ │ │ │ ├── melception.yaml
│ │ │ │ │ └── vggish.yaml
│ │ │ │ ├── data/
│ │ │ │ │ ├── train_means_stds_melspec_10s_22050hz.txt
│ │ │ │ │ ├── vggsound.csv
│ │ │ │ │ ├── vggsound_test.txt
│ │ │ │ │ ├── vggsound_train.txt
│ │ │ │ │ └── vggsound_valid.txt
│ │ │ │ ├── dataset.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── metrics.py
│ │ │ │ ├── model.py
│ │ │ │ ├── predict.py
│ │ │ │ ├── train_melception.py
│ │ │ │ ├── train_vggishish.py
│ │ │ │ └── transforms.py
│ │ │ └── vqperceptual.py
│ │ └── x_transformer.py
│ └── util.py
├── useful_ckpts/
│ └── CLAP/
│ └── config.yml
├── vocoder/
│ ├── bigvgan/
│ │ ├── __init__.py
│ │ ├── activations.py
│ │ ├── alias_free_torch/
│ │ │ ├── __init__.py
│ │ │ ├── act.py
│ │ │ ├── filter.py
│ │ │ └── resample.py
│ │ └── models.py
│ ├── hifigan/
│ │ └── modules.py
│ └── logs/
│ └── hifi_0127/
│ └── args.yml
└── wav_evaluation/
└── models/
├── CLAPWrapper.py
├── __init__.py
├── audio.py
├── clap.py
└── utils.py
Showing preview only (261K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (3295 symbols across 224 files)
FILE: NeuralSeq/configs/tts/emotion/pre_align.py
class EmoPreAlign (line 7) | class EmoPreAlign(BasePreprocessor):
method meta_data (line 9) | def meta_data(self):
FILE: NeuralSeq/configs/tts/libritts/pre_align.py
class LibrittsPreAlign (line 7) | class LibrittsPreAlign(BasePreprocessor):
method meta_data (line 8) | def meta_data(self):
FILE: NeuralSeq/data_gen/tts/base_binarizer.py
class BinarizationError (line 18) | class BinarizationError(Exception):
class BaseBinarizer (line 22) | class BaseBinarizer:
method __init__ (line 23) | def __init__(self, processed_data_dir=None):
method train_item_names (line 60) | def train_item_names(self):
method valid_item_names (line 64) | def valid_item_names(self):
method test_item_names (line 68) | def test_item_names(self):
method build_spk_map (line 71) | def build_spk_map(self):
method item_name2spk_id (line 80) | def item_name2spk_id(self, item_name):
method _phone_encoder (line 83) | def _phone_encoder(self):
method meta_data (line 96) | def meta_data(self, prefix):
method process (line 111) | def process(self):
method process_data (line 123) | def process_data(self, prefix):
method process_item (line 160) | def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encod...
method get_align (line 188) | def get_align(tg_fn, ph, mel, phone_encoded, res):
method get_pitch (line 200) | def get_pitch(wav, mel, res):
method get_f0cwt (line 208) | def get_f0cwt(f0, res):
FILE: NeuralSeq/data_gen/tts/base_binarizer_emotion.py
class BinarizationError (line 24) | class BinarizationError(Exception):
class EmotionBinarizer (line 28) | class EmotionBinarizer:
method __init__ (line 29) | def __init__(self, processed_data_dir=None):
method load_meta_data (line 42) | def load_meta_data(self):
method train_item_names (line 64) | def train_item_names(self):
method valid_item_names (line 68) | def valid_item_names(self):
method test_item_names (line 72) | def test_item_names(self):
method build_spk_map (line 75) | def build_spk_map(self):
method build_emo_map (line 85) | def build_emo_map(self):
method item_name2spk_id (line 94) | def item_name2spk_id(self, item_name):
method item_name2emo_id (line 97) | def item_name2emo_id(self, item_name):
method _phone_encoder (line 100) | def _phone_encoder(self):
method _word_encoder (line 114) | def _word_encoder(self):
method meta_data (line 133) | def meta_data(self, prefix):
method process (line 149) | def process(self):
method process_data (line 172) | def process_data(self, prefix):
method process_item (line 215) | def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, emoti...
method get_align (line 262) | def get_align(tg_fn, res):
method get_pitch (line 277) | def get_pitch(res):
method get_f0cwt (line 286) | def get_f0cwt(res):
method get_word (line 301) | def get_word(res, word_encoder):
method num_workers (line 346) | def num_workers(self):
FILE: NeuralSeq/data_gen/tts/base_preprocess.py
class BasePreprocessor (line 19) | class BasePreprocessor:
method __init__ (line 20) | def __init__(self):
method meta_data (line 28) | def meta_data(self):
method process (line 34) | def process(self):
method preprocess_first_pass (line 117) | def preprocess_first_pass(cls, item_name, txt_raw, txt_processor,
method txt_to_ph (line 147) | def txt_to_ph(txt_processor, txt_raw, preprocess_args):
method process_wav (line 157) | def process_wav(item_name, wav_fn, processed_dir, wav_processed_tmp, p...
method _phone_encoder (line 179) | def _phone_encoder(self, ph_set):
method _word_encoder (line 190) | def _word_encoder(self, word_set):
method preprocess_second_pass (line 208) | def preprocess_second_pass(cls, word, ph, spk_name, word_encoder, ph_e...
method build_spk_map (line 214) | def build_spk_map(self, spk_names):
method build_mfa_inputs (line 222) | def build_mfa_inputs(cls, item, mfa_input_dir, mfa_group, wav_processe...
method load_spk_map (line 238) | def load_spk_map(self, base_dir):
method load_dict (line 243) | def load_dict(self, base_dir):
method meta_csv_filename (line 249) | def meta_csv_filename(self):
method wav_processed_dirname (line 253) | def wav_processed_dirname(self):
FILE: NeuralSeq/data_gen/tts/binarizer_zh.py
class ZhBinarizer (line 12) | class ZhBinarizer(BaseBinarizer):
method get_align (line 14) | def get_align(tg_fn, ph, mel, phone_encoded, res):
FILE: NeuralSeq/data_gen/tts/data_gen_utils.py
function trim_long_silences (line 27) | def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, v...
function process_utterance (line 93) | def process_utterance(wav_path,
function get_pitch (line 150) | def get_pitch(wav_data, mel, hparams):
function remove_empty_lines (line 187) | def remove_empty_lines(text):
class TextGrid (line 197) | class TextGrid(object):
method __init__ (line 198) | def __init__(self, text):
method _extract_pattern (line 208) | def _extract_pattern(self, pattern, inc):
method _get_type (line 225) | def _get_type(self):
method _get_time_intval (line 228) | def _get_time_intval(self):
method _get_size (line 232) | def _get_size(self):
method _get_item_list (line 235) | def _get_item_list(self):
method toJson (line 264) | def toJson(self):
function get_mel2ph (line 274) | def get_mel2ph(tg_fn, ph, mel, hparams):
function build_phone_encoder (line 340) | def build_phone_encoder(data_dir):
function build_word_encoder (line 346) | def build_word_encoder(data_dir):
function is_sil_phoneme (line 351) | def is_sil_phoneme(p):
function build_token_encoder (line 355) | def build_token_encoder(token_list_file):
FILE: NeuralSeq/data_gen/tts/emotion/audio.py
function preprocess_wav (line 13) | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
function wav_to_mel_spectrogram (line 43) | def wav_to_mel_spectrogram(wav):
function trim_long_silences (line 58) | def trim_long_silences(wav):
function normalize_volume (line 101) | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_onl...
FILE: NeuralSeq/data_gen/tts/emotion/inference.py
function load_model (line 15) | def load_model(weights_fpath: Path, device=None):
function is_loaded (line 39) | def is_loaded():
function embed_frames_batch (line 43) | def embed_frames_batch(frames_batch):
function compute_partial_slices (line 59) | def compute_partial_slices(n_samples, partial_utterance_n_frames=partial...
function embed_utterance (line 111) | def embed_utterance(wav, using_partials=True, return_partials=False, **k...
function embed_speaker (line 158) | def embed_speaker(wavs, **kwargs):
function plot_embedding_as_heatmap (line 162) | def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, colo...
FILE: NeuralSeq/data_gen/tts/emotion/model.py
class EmotionEncoder (line 11) | class EmotionEncoder(nn.Module):
method __init__ (line 12) | def __init__(self, device, loss_device):
method do_gradient_ops (line 33) | def do_gradient_ops(self):
method forward (line 41) | def forward(self, utterances, hidden_init=None):
method inference (line 63) | def inference(self, utterances, hidden_init=None):
FILE: NeuralSeq/data_gen/tts/emotion/test_emotion.py
function tuneThresholdfromScore (line 32) | def tuneThresholdfromScore(scores, labels, target_fa, target_fr=None):
function loadWAV (line 55) | def loadWAV(filename, max_frames, evalmode=True, num_eval=10):
function evaluateFromList (line 84) | def evaluateFromList(listfilename, print_interval=100, test_path='', mul...
FILE: NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py
function register_txt_processors (line 5) | def register_txt_processors(name):
function get_txt_processor_cls (line 13) | def get_txt_processor_cls(name):
class BaseTxtProcessor (line 17) | class BaseTxtProcessor:
method sp_phonemes (line 19) | def sp_phonemes():
method process (line 23) | def process(cls, txt, preprocess_args):
method postprocess (line 27) | def postprocess(cls, txt_struct, preprocess_args):
method add_bdr (line 40) | def add_bdr(cls, txt_struct):
FILE: NeuralSeq/data_gen/tts/txt_processors/en.py
class EnG2p (line 12) | class EnG2p(G2p):
method __call__ (line 15) | def __call__(self, text):
class TxtProcessor (line 44) | class TxtProcessor(BaseTxtProcessor):
method preprocess_text (line 48) | def preprocess_text(text):
method process (line 66) | def process(cls, txt, preprocess_args):
FILE: NeuralSeq/data_gen/tts/txt_processors/zh.py
class TxtProcessor (line 9) | class TxtProcessor(BaseTxtProcessor):
method preprocess_text (line 15) | def preprocess_text(text):
method process (line 28) | def process(cls, txt, pre_align_args):
FILE: NeuralSeq/data_gen/tts/txt_processors/zh_g2pM.py
class TxtProcessor (line 15) | class TxtProcessor(zh.TxtProcessor):
method sp_phonemes (line 19) | def sp_phonemes():
method process (line 23) | def process(cls, txt, pre_align_args):
FILE: NeuralSeq/data_gen/tts/wav_processors/base_processor.py
function register_wav_processors (line 4) | def register_wav_processors(name):
function get_wav_processor_cls (line 12) | def get_wav_processor_cls(name):
class BaseWavProcessor (line 16) | class BaseWavProcessor:
method name (line 18) | def name(self):
method output_fn (line 21) | def output_fn(self, input_fn):
method process (line 24) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
FILE: NeuralSeq/data_gen/tts/wav_processors/common_processors.py
class ConvertToWavProcessor (line 12) | class ConvertToWavProcessor(BaseWavProcessor):
method name (line 14) | def name(self):
method process (line 17) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
class ResampleProcessor (line 27) | class ResampleProcessor(BaseWavProcessor):
method name (line 29) | def name(self):
method process (line 32) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
class TrimSILProcessor (line 46) | class TrimSILProcessor(BaseWavProcessor):
method name (line 48) | def name(self):
method process (line 51) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
class TrimAllSILProcessor (line 60) | class TrimAllSILProcessor(BaseWavProcessor):
method name (line 62) | def name(self):
method process (line 65) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
class DenoiseProcessor (line 77) | class DenoiseProcessor(BaseWavProcessor):
method name (line 79) | def name(self):
method process (line 82) | def process(self, input_fn, sr, tmp_dir, processed_dir, item_name, pre...
FILE: NeuralSeq/egs/datasets/audio/emotion/pre_align.py
class EmoPreAlign (line 7) | class EmoPreAlign(BasePreprocessor):
method meta_data (line 9) | def meta_data(self):
FILE: NeuralSeq/egs/datasets/audio/libritts/pre_align.py
class LibrittsPreAlign (line 7) | class LibrittsPreAlign(BasePreprocessor):
method meta_data (line 8) | def meta_data(self):
FILE: NeuralSeq/egs/datasets/audio/lj/preprocess.py
class LJPreprocess (line 4) | class LJPreprocess(BasePreprocessor):
method meta_data (line 5) | def meta_data(self):
FILE: NeuralSeq/egs/datasets/audio/vctk/pre_align.py
class VCTKPreAlign (line 7) | class VCTKPreAlign(BasePreAlign):
method meta_data (line 8) | def meta_data(self):
FILE: NeuralSeq/inference/svs/base_svs_infer.py
class BaseSVSInfer (line 18) | class BaseSVSInfer:
method __init__ (line 19) | def __init__(self, hparams, device=None):
method build_model (line 40) | def build_model(self):
method forward_model (line 43) | def forward_model(self, inp):
method build_vocoder (line 46) | def build_vocoder(self):
method run_vocoder (line 61) | def run_vocoder(self, c, **kwargs):
method preprocess_word_level_input (line 72) | def preprocess_word_level_input(self, inp):
method preprocess_phoneme_level_input (line 141) | def preprocess_phoneme_level_input(self, inp):
method preprocess_input (line 155) | def preprocess_input(self, inp, input_type='word'):
method input_to_batch (line 200) | def input_to_batch(self, item):
method postprocess_output (line 225) | def postprocess_output(self, output):
method infer_once (line 228) | def infer_once(self, inp):
method example_run (line 235) | def example_run(cls, inp):
FILE: NeuralSeq/inference/svs/ds_cascade.py
class DiffSingerCascadeInfer (line 8) | class DiffSingerCascadeInfer(BaseSVSInfer):
method build_model (line 9) | def build_model(self):
method forward_model (line 22) | def forward_model(self, inp):
FILE: NeuralSeq/inference/svs/ds_e2e.py
class DiffSingerE2EInfer (line 13) | class DiffSingerE2EInfer(BaseSVSInfer):
method build_model (line 14) | def build_model(self):
method forward_model (line 32) | def forward_model(self, inp):
FILE: NeuralSeq/inference/svs/opencpop/map.py
function cpop_pinyin2ph_func (line 1) | def cpop_pinyin2ph_func():
FILE: NeuralSeq/inference/tts/GenerSpeech.py
class GenerSpeechInfer (line 13) | class GenerSpeechInfer(BaseTTSInfer):
method build_model (line 14) | def build_model(self):
method preprocess_input (line 20) | def preprocess_input(self, inp):
method input_to_batch (line 69) | def input_to_batch(self, item):
method forward_model (line 104) | def forward_model(self, inp):
FILE: NeuralSeq/inference/tts/PortaSpeech.py
class TTSInference (line 6) | class TTSInference(BaseTTSInfer):
method __init__ (line 7) | def __init__(self, hparams, device=None):
method build_model (line 13) | def build_model(self):
method forward_model (line 20) | def forward_model(self, inp):
method preprocess_input (line 37) | def preprocess_input(self, inp):
method input_to_batch (line 58) | def input_to_batch(self, item):
method postprocess_output (line 81) | def postprocess_output(self, output):
FILE: NeuralSeq/inference/tts/base_tts_infer.py
class BaseTTSInfer (line 14) | class BaseTTSInfer:
method __init__ (line 15) | def __init__(self, hparams, device=None):
method build_model (line 32) | def build_model(self):
method forward_model (line 35) | def forward_model(self, inp):
method build_asr (line 38) | def build_asr(self):
method build_vocoder (line 44) | def build_vocoder(self):
method run_vocoder (line 52) | def run_vocoder(self, c):
method preprocess_input (line 57) | def preprocess_input(self, inp):
method input_to_batch (line 60) | def input_to_batch(self, item):
method postprocess_output (line 63) | def postprocess_output(self, output):
method infer_once (line 66) | def infer_once(self, inp):
method example_run (line 73) | def example_run(cls, inp):
method asr (line 83) | def asr(self, file):
FILE: NeuralSeq/modules/GenerSpeech/model/generspeech.py
class GenerSpeech (line 15) | class GenerSpeech(FastSpeech2):
method __init__ (line 20) | def __init__(self, dictionary, out_dims=None):
method forward (line 75) | def forward(self, txt_tokens, mel2ph=None, ref_mel2ph=None, ref_mel2wo...
method get_prosody_ph (line 121) | def get_prosody_ph(self, encoder_out, ref_mels, ret, infer=False, glob...
method get_prosody_word (line 149) | def get_prosody_word(self, encoder_out, ref_mels, ret, infer=False, gl...
method get_prosody_utter (line 176) | def get_prosody_utter(self, encoder_out, ref_mels, ret, infer=False, g...
method inpaint_pitch (line 205) | def inpaint_pitch(self, pitch_inp_domain_agnostic, pitch_inp_domain_sp...
method run_post_glow (line 233) | def run_post_glow(self, tgt_mels, infer, is_training, ret):
FILE: NeuralSeq/modules/GenerSpeech/model/glow_modules.py
class LayerNorm (line 11) | class LayerNorm(nn.Module):
method __init__ (line 12) | def __init__(self, channels, eps=1e-4):
method forward (line 20) | def forward(self, x):
class ConvReluNorm (line 32) | class ConvReluNorm(nn.Module):
method __init__ (line 33) | def __init__(self, in_channels, hidden_channels, out_channels, kernel_...
method forward (line 57) | def forward(self, x, x_mask):
class ActNorm (line 68) | class ActNorm(nn.Module): # glow中的线性变换层
method __init__ (line 69) | def __init__(self, channels, ddi=False, **kwargs):
method forward (line 77) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method store_inverse (line 93) | def store_inverse(self):
method set_ddi (line 96) | def set_ddi(self, ddi):
method initialize (line 99) | def initialize(self, x, x_mask):
class InvConvNear (line 114) | class InvConvNear(nn.Module): # 可逆卷积
method __init__ (line 115) | def __init__(self, channels, n_split=4, no_jacobian=False, lu=True, n_...
method forward (line 147) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method _get_weight (line 184) | def _get_weight(self):
method store_inverse (line 191) | def store_inverse(self):
class InvConv (line 196) | class InvConv(nn.Module):
method __init__ (line 197) | def __init__(self, channels, no_jacobian=False, lu=True, **kwargs):
method get_weight (line 225) | def get_weight(self, device, reverse):
method forward (line 242) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method store_inverse (line 268) | def store_inverse(self):
class Flip (line 272) | class Flip(nn.Module):
method forward (line 273) | def forward(self, x, *args, reverse=False, **kwargs):
method store_inverse (line 278) | def store_inverse(self):
class CouplingBlock (line 282) | class CouplingBlock(nn.Module): # 仿射耦合层
method __init__ (line 283) | def __init__(self, in_channels, hidden_channels, kernel_size, dilation...
method forward (line 311) | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
method store_inverse (line 334) | def store_inverse(self):
class GlowFFTBlocks (line 338) | class GlowFFTBlocks(FFTBlocks):
method __init__ (line 339) | def __init__(self, hidden_size=128, gin_channels=256, num_layers=2, ff...
method forward (line 346) | def forward(self, x, x_mask=None, g=None):
class TransformerCouplingBlock (line 361) | class TransformerCouplingBlock(nn.Module):
method __init__ (line 362) | def __init__(self, in_channels, hidden_channels, n_layers,
method forward (line 386) | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
method store_inverse (line 409) | def store_inverse(self):
class FreqFFTCouplingBlock (line 413) | class FreqFFTCouplingBlock(nn.Module):
method __init__ (line 414) | def __init__(self, in_channels, hidden_channels, n_layers,
method forward (line 452) | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
method store_inverse (line 492) | def store_inverse(self):
class Glow (line 496) | class Glow(nn.Module):
method __init__ (line 497) | def __init__(self,
method forward (line 556) | def forward(self, x, x_mask=None, g=None, reverse=False, return_hidden...
method store_inverse (line 582) | def store_inverse(self):
class GlowV2 (line 594) | class GlowV2(nn.Module):
method __init__ (line 595) | def __init__(self,
method forward (line 648) | def forward(self, x=None, x_mask=None, g=None, reverse=False, concat_z...
method store_inverse (line 717) | def store_inverse(self):
method get_prior (line 730) | def get_prior(self, B, T, device, noise_scale=0.66):
function squeeze (line 742) | def squeeze(x, x_mask=None, n_sqz=2):
function unsqueeze (line 757) | def unsqueeze(x, x_mask=None, n_sqz=2):
FILE: NeuralSeq/modules/GenerSpeech/model/mixstyle.py
class MixStyle (line 5) | class MixStyle(nn.Module):
method __init__ (line 11) | def __init__(self, p=0.5, alpha=0.1, eps=1e-6, hidden_size=256):
method __repr__ (line 31) | def __repr__(self):
method set_activation_status (line 34) | def set_activation_status(self, status=True):
method forward (line 37) | def forward(self, x, spk_embed):
FILE: NeuralSeq/modules/GenerSpeech/model/prosody_util.py
class VQEmbeddingEMA (line 16) | class VQEmbeddingEMA(nn.Module):
method __init__ (line 17) | def __init__(self, n_embeddings, embedding_dim, commitment_cost=0.25, ...
method encode (line 33) | def encode(self, x):
method forward (line 47) | def forward(self, x):
class CrossAttenLayer (line 95) | class CrossAttenLayer(nn.Module):
method __init__ (line 96) | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
method forward (line 107) | def forward(self, src, local_emotion, emotion_key_padding_mask=None, f...
class ProsodyAligner (line 129) | class ProsodyAligner(nn.Module):
method __init__ (line 130) | def __init__(self, num_layers, guided_sigma=0.3, guided_layers=None, n...
method forward (line 138) | def forward(self, src, local_emotion, src_key_padding_mask=None, emoti...
function _make_guided_attention_mask (line 162) | def _make_guided_attention_mask(ilen, rilen, olen, rolen, sigma):
class LocalStyleAdaptor (line 172) | class LocalStyleAdaptor(nn.Module):
method __init__ (line 173) | def __init__(self, hidden_size, num_vq_codes=64, padding_idx=0):
method forward (line 182) | def forward(self, ref_mels, mel2ph=None, no_vq=False):
class LambdaLayer (line 204) | class LambdaLayer(nn.Module):
method __init__ (line 205) | def __init__(self, lambd):
method forward (line 209) | def forward(self, x):
class Conv1d (line 213) | class Conv1d(nn.Conv1d):
method __init__ (line 216) | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,...
method forward (line 221) | def forward(self, x):
function init_weights_func (line 225) | def init_weights_func(m):
class ResidualBlock (line 231) | class ResidualBlock(nn.Module):
method __init__ (line 234) | def __init__(self, channels, kernel_size, dilation, n=2, norm_type='bn...
method forward (line 264) | def forward(self, x):
class Pad (line 275) | class Pad(nn.ZeroPad2d):
method __init__ (line 276) | def __init__(self, kernel_size, dilation):
class ZeroTemporalPad (line 284) | class ZeroTemporalPad(nn.ZeroPad2d):
method __init__ (line 287) | def __init__(self, kernel_size, dilation, causal=False):
class ConvBlocks (line 298) | class ConvBlocks(nn.Module):
method __init__ (line 301) | def __init__(self, channels, out_dims, dilations, kernel_size,
method forward (line 324) | def forward(self, x):
class TextConvEncoder (line 338) | class TextConvEncoder(ConvBlocks):
method __init__ (line 339) | def __init__(self, embed_tokens, channels, out_dims, dilations, kernel...
method forward (line 348) | def forward(self, txt_tokens):
class ConditionalConvBlocks (line 360) | class ConditionalConvBlocks(ConvBlocks):
method __init__ (line 361) | def __init__(self, channels, g_channels, out_dims, dilations, kernel_s...
method forward (line 372) | def forward(self, x, g, x_mask):
FILE: NeuralSeq/modules/GenerSpeech/model/wavenet.py
function fused_add_tanh_sigmoid_multiply (line 5) | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
class WN (line 14) | class WN(torch.nn.Module):
method __init__ (line 15) | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_laye...
method forward (line 54) | def forward(self, x, x_mask=None, g=None, **kwargs):
method remove_weight_norm (line 80) | def remove_weight_norm(self):
FILE: NeuralSeq/modules/GenerSpeech/task/dataset.py
class GenerSpeech_dataset (line 30) | class GenerSpeech_dataset(BaseTTSDataset):
method __init__ (line 31) | def __init__(self, prefix, shuffle=False, test_items=None, test_sizes=...
method load_test_inputs (line 55) | def load_test_inputs(self, test_input_dir):
method _get_item (line 89) | def _get_item(self, index):
method __getitem__ (line 96) | def __getitem__(self, index):
method collater (line 146) | def collater(self, samples):
FILE: NeuralSeq/modules/GenerSpeech/task/generspeech.py
class GenerSpeechTask (line 25) | class GenerSpeechTask(FastSpeech2Task):
method __init__ (line 26) | def __init__(self):
method build_tts_model (line 30) | def build_tts_model(self):
method build_model (line 33) | def build_model(self):
method run_model (line 40) | def run_model(self, model, sample, return_output=False):
method validation_step (line 72) | def validation_step(self, sample, batch_idx):
method test_step (line 125) | def test_step(self, sample, batch_idx):
method after_infer (line 155) | def after_infer(self, predictions, sil_start_frame=0):
method save_result (line 232) | def save_result(wav_out, mel, base_fn, gen_dir, str_phs=None, mel2ph=N...
FILE: NeuralSeq/modules/commons/align_ops.py
function build_word_mask (line 5) | def build_word_mask(x2word, y2word):
function mel2ph_to_mel2word (line 9) | def mel2ph_to_mel2word(mel2ph, ph2word):
function clip_mel2token_to_multiple (line 15) | def clip_mel2token_to_multiple(mel2token, frames_multiple):
function expand_states (line 21) | def expand_states(h, mel2token):
FILE: NeuralSeq/modules/commons/common_layers.py
class Reshape (line 10) | class Reshape(nn.Module):
method __init__ (line 11) | def __init__(self, *args):
method forward (line 15) | def forward(self, x):
class Permute (line 19) | class Permute(nn.Module):
method __init__ (line 20) | def __init__(self, *args):
method forward (line 24) | def forward(self, x):
class LinearNorm (line 28) | class LinearNorm(torch.nn.Module):
method __init__ (line 29) | def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
method forward (line 37) | def forward(self, x):
class ConvNorm (line 41) | class ConvNorm(torch.nn.Module):
method __init__ (line 42) | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
method forward (line 57) | def forward(self, signal):
function Embedding (line 62) | def Embedding(num_embeddings, embedding_dim, padding_idx=None):
function LayerNorm (line 70) | def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, expor...
function Linear (line 79) | def Linear(in_features, out_features, bias=True):
class SinusoidalPositionalEmbedding (line 87) | class SinusoidalPositionalEmbedding(nn.Module):
method __init__ (line 93) | def __init__(self, embedding_dim, padding_idx, init_size=1024):
method get_embedding (line 105) | def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
method forward (line 123) | def forward(self, input, incremental_state=None, timestep=None, positi...
method max_positions (line 144) | def max_positions(self):
class ConvTBC (line 149) | class ConvTBC(nn.Module):
method __init__ (line 150) | def __init__(self, in_channels, out_channels, kernel_size, padding=0):
method forward (line 161) | def forward(self, input):
class MultiheadAttention (line 165) | class MultiheadAttention(nn.Module):
method __init__ (line 166) | def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout...
method reset_parameters (line 218) | def reset_parameters(self):
method forward (line 235) | def forward(
method in_proj_qkv (line 421) | def in_proj_qkv(self, query):
method in_proj_q (line 424) | def in_proj_q(self, query):
method in_proj_k (line 433) | def in_proj_k(self, key):
method in_proj_v (line 443) | def in_proj_v(self, value):
method _in_proj (line 453) | def _in_proj(self, input, start=0, end=None):
method apply_sparse_mask (line 462) | def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
class Swish (line 466) | class Swish(torch.autograd.Function):
method forward (line 468) | def forward(ctx, i):
method backward (line 474) | def backward(ctx, grad_output):
class CustomSwish (line 480) | class CustomSwish(nn.Module):
method forward (line 481) | def forward(self, input_tensor):
class TransformerFFNLayer (line 485) | class TransformerFFNLayer(nn.Module):
method __init__ (line 486) | def __init__(self, hidden_size, filter_size, padding="SAME", kernel_si...
method forward (line 502) | def forward(self, x, incremental_state=None):
class BatchNorm1dTBC (line 524) | class BatchNorm1dTBC(nn.Module):
method __init__ (line 525) | def __init__(self, c):
method forward (line 529) | def forward(self, x):
class EncSALayer (line 541) | class EncSALayer(nn.Module):
method __init__ (line 542) | def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
method forward (line 563) | def forward(self, x, encoder_padding_mask=None, **kwargs):
class DecSALayer (line 590) | class DecSALayer(nn.Module):
method __init__ (line 591) | def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_...
method forward (line 607) | def forward(
FILE: NeuralSeq/modules/commons/conv.py
class LambdaLayer (line 10) | class LambdaLayer(nn.Module):
method __init__ (line 11) | def __init__(self, lambd):
method forward (line 15) | def forward(self, x):
function init_weights_func (line 19) | def init_weights_func(m):
class ResidualBlock (line 25) | class ResidualBlock(nn.Module):
method __init__ (line 28) | def __init__(self, channels, kernel_size, dilation, n=2, norm_type='bn...
method forward (line 58) | def forward(self, x):
class ConvBlocks (line 69) | class ConvBlocks(nn.Module):
method __init__ (line 72) | def __init__(self, hidden_size, out_dims, dilations, kernel_size,
method forward (line 100) | def forward(self, x, nonpadding=None):
class TextConvEncoder (line 120) | class TextConvEncoder(ConvBlocks):
method __init__ (line 121) | def __init__(self, dict_size, hidden_size, out_dims, dilations, kernel...
method forward (line 131) | def forward(self, txt_tokens):
class ConditionalConvBlocks (line 143) | class ConditionalConvBlocks(ConvBlocks):
method __init__ (line 144) | def __init__(self, hidden_size, c_cond, c_out, dilations, kernel_size,
method forward (line 155) | def forward(self, x, cond, nonpadding=None):
FILE: NeuralSeq/modules/commons/espnet_positional_embedding.py
class PositionalEncoding (line 5) | class PositionalEncoding(torch.nn.Module):
method __init__ (line 14) | def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
method extend_pe (line 24) | def extend_pe(self, x):
method forward (line 47) | def forward(self, x: torch.Tensor):
class ScaledPositionalEncoding (line 59) | class ScaledPositionalEncoding(PositionalEncoding):
method __init__ (line 68) | def __init__(self, d_model, dropout_rate, max_len=5000):
method reset_parameters (line 73) | def reset_parameters(self):
method forward (line 77) | def forward(self, x):
class RelPositionalEncoding (line 89) | class RelPositionalEncoding(PositionalEncoding):
method __init__ (line 98) | def __init__(self, d_model, dropout_rate, max_len=5000):
method forward (line 102) | def forward(self, x):
FILE: NeuralSeq/modules/commons/normalizing_flow/glow_modules.py
class ActNorm (line 10) | class ActNorm(nn.Module):
method __init__ (line 11) | def __init__(self, channels, ddi=False, **kwargs):
method forward (line 19) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method store_inverse (line 35) | def store_inverse(self):
method set_ddi (line 38) | def set_ddi(self, ddi):
method initialize (line 41) | def initialize(self, x, x_mask):
class InvConvNear (line 56) | class InvConvNear(nn.Module):
method __init__ (line 57) | def __init__(self, channels, n_split=4, no_jacobian=False, lu=True, n_...
method forward (line 89) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method _get_weight (line 126) | def _get_weight(self):
method store_inverse (line 133) | def store_inverse(self):
class InvConv (line 138) | class InvConv(nn.Module):
method __init__ (line 139) | def __init__(self, channels, no_jacobian=False, lu=True, **kwargs):
method get_weight (line 167) | def get_weight(self, device, reverse):
method forward (line 184) | def forward(self, x, x_mask=None, reverse=False, **kwargs):
method store_inverse (line 210) | def store_inverse(self):
class CouplingBlock (line 214) | class CouplingBlock(nn.Module):
method __init__ (line 215) | def __init__(self, in_channels, hidden_channels, kernel_size, dilation...
method forward (line 241) | def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs):
method store_inverse (line 264) | def store_inverse(self):
class Glow (line 268) | class Glow(nn.Module):
method __init__ (line 269) | def __init__(self,
method forward (line 327) | def forward(self, x, x_mask=None, g=None, reverse=False, return_hidden...
method store_inverse (line 353) | def store_inverse(self):
FILE: NeuralSeq/modules/commons/normalizing_flow/res_flow.py
class FlipLayer (line 7) | class FlipLayer(nn.Module):
method forward (line 8) | def forward(self, x, nonpadding, cond=None, reverse=False):
class CouplingLayer (line 13) | class CouplingLayer(nn.Module):
method __init__ (line 14) | def __init__(self, c_in, hidden_size, kernel_size, n_layers, p_dropout...
method forward (line 32) | def forward(self, x, nonpadding, cond=None, reverse=False):
class ResFlow (line 42) | class ResFlow(nn.Module):
method __init__ (line 43) | def __init__(self,
method forward (line 58) | def forward(self, x, nonpadding, cond=None, reverse=False):
FILE: NeuralSeq/modules/commons/normalizing_flow/utils.py
function squeeze (line 4) | def squeeze(x, x_mask=None, n_sqz=2):
function unsqueeze (line 19) | def unsqueeze(x, x_mask=None, n_sqz=2):
FILE: NeuralSeq/modules/commons/rel_transformer.py
function convert_pad_shape (line 11) | def convert_pad_shape(pad_shape):
function shift_1d (line 17) | def shift_1d(x):
function sequence_mask (line 22) | def sequence_mask(length, max_length=None):
class Encoder (line 29) | class Encoder(nn.Module):
method __init__ (line 30) | def __init__(self, hidden_channels, filter_channels, n_heads, n_layers...
method forward (line 59) | def forward(self, x, x_mask):
class MultiHeadAttention (line 86) | class MultiHeadAttention(nn.Module):
method __init__ (line 87) | def __init__(self, channels, out_channels, n_heads, window_size=None, ...
method forward (line 121) | def forward(self, x, c, attn_mask=None):
method attention (line 131) | def attention(self, query, key, value, mask=None):
method _matmul_with_relative_values (line 164) | def _matmul_with_relative_values(self, x, y):
method _matmul_with_relative_keys (line 173) | def _matmul_with_relative_keys(self, x, y):
method _get_relative_embeddings (line 182) | def _get_relative_embeddings(self, relative_embeddings, length):
method _relative_position_to_absolute_position (line 197) | def _relative_position_to_absolute_position(self, x):
method _absolute_position_to_relative_position (line 214) | def _absolute_position_to_relative_position(self, x):
method _attention_bias_proximal (line 228) | def _attention_bias_proximal(self, length):
class FFN (line 240) | class FFN(nn.Module):
method __init__ (line 241) | def __init__(self, in_channels, out_channels, filter_channels, kernel_...
method forward (line 254) | def forward(self, x, x_mask):
class LayerNorm (line 265) | class LayerNorm(nn.Module):
method __init__ (line 266) | def __init__(self, channels, eps=1e-4):
method forward (line 274) | def forward(self, x):
class ConvReluNorm (line 286) | class ConvReluNorm(nn.Module):
method __init__ (line 287) | def __init__(self, in_channels, hidden_channels, out_channels, kernel_...
method forward (line 311) | def forward(self, x, x_mask):
class RelTransformerEncoder (line 321) | class RelTransformerEncoder(nn.Module):
method __init__ (line 322) | def __init__(self,
method forward (line 368) | def forward(self, x, x_mask=None):
class Pooler (line 383) | class Pooler(nn.Module):
method __init__ (line 392) | def __init__(self, pooler_type):
method forward (line 397) | def forward(self, attention_mask, outputs):
class Similarity (line 420) | class Similarity(nn.Module):
method __init__ (line 425) | def __init__(self, temp):
method forward (line 433) | def forward(self, x, y):
class BertPredictionHeadTransform (line 451) | class BertPredictionHeadTransform(nn.Module):
method __init__ (line 452) | def __init__(self, hidden_size):
method forward (line 458) | def forward(self, hidden_states):
class BertLMPredictionHead (line 465) | class BertLMPredictionHead(nn.Module):
method __init__ (line 466) | def __init__(self, hid_dim, out_dim):
method forward (line 473) | def forward(self, hidden_states):
class BERTRelTransformerEncoder (line 483) | class BERTRelTransformerEncoder(nn.Module):
method __init__ (line 484) | def __init__(self,
method forward (line 582) | def forward(self, x, x_mask=None, bert_feats=None, ph2word=None, **kwa...
FILE: NeuralSeq/modules/commons/ssim.py
function gaussian (line 319) | def gaussian(window_size, sigma):
function create_window (line 324) | def create_window(window_size, channel):
function _ssim (line 331) | def _ssim(img1, img2, window, window_size, channel, size_average=True):
class SSIM (line 354) | class SSIM(torch.nn.Module):
method __init__ (line 355) | def __init__(self, window_size=11, size_average=True):
method forward (line 362) | def forward(self, img1, img2):
function ssim (line 383) | def ssim(img1, img2, window_size=11, size_average=True):
FILE: NeuralSeq/modules/commons/transformer.py
class SinusoidalPositionalEmbedding (line 13) | class SinusoidalPositionalEmbedding(nn.Module):
method __init__ (line 19) | def __init__(self, embedding_dim, padding_idx, init_size=1024):
method get_embedding (line 31) | def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
method forward (line 49) | def forward(self, input, incremental_state=None, timestep=None, positi...
method max_positions (line 70) | def max_positions(self):
class TransformerFFNLayer (line 75) | class TransformerFFNLayer(nn.Module):
method __init__ (line 76) | def __init__(self, hidden_size, filter_size, padding="SAME", kernel_si...
method forward (line 90) | def forward(self, x, incremental_state=None):
method _get_input_buffer (line 114) | def _get_input_buffer(self, incremental_state):
method _set_input_buffer (line 121) | def _set_input_buffer(self, incremental_state, buffer):
method clear_buffer (line 129) | def clear_buffer(self, incremental_state):
class MultiheadAttention (line 137) | class MultiheadAttention(nn.Module):
method __init__ (line 138) | def __init__(self, embed_dim, num_heads, kdim=None, vdim=None, dropout...
method reset_parameters (line 190) | def reset_parameters(self):
method forward (line 207) | def forward(
method in_proj_qkv (line 420) | def in_proj_qkv(self, query):
method in_proj_q (line 423) | def in_proj_q(self, query):
method in_proj_k (line 432) | def in_proj_k(self, key):
method in_proj_v (line 442) | def in_proj_v(self, value):
method _in_proj (line 452) | def _in_proj(self, input, start=0, end=None):
method _get_input_buffer (line 460) | def _get_input_buffer(self, incremental_state):
method _set_input_buffer (line 467) | def _set_input_buffer(self, incremental_state, buffer):
method apply_sparse_mask (line 475) | def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz):
method clear_buffer (line 478) | def clear_buffer(self, incremental_state=None):
class EncSALayer (line 488) | class EncSALayer(nn.Module):
method __init__ (line 489) | def __init__(self, c, num_heads, dropout, attention_dropout=0.1,
method forward (line 503) | def forward(self, x, encoder_padding_mask=None, **kwargs):
class DecSALayer (line 530) | class DecSALayer(nn.Module):
method __init__ (line 531) | def __init__(self, c, num_heads, dropout, attention_dropout=0.1, relu_...
method forward (line 548) | def forward(
method clear_buffer (line 608) | def clear_buffer(self, input, encoder_out=None, encoder_padding_mask=N...
method set_buffer (line 612) | def set_buffer(self, name, tensor, incremental_state):
class TransformerEncoderLayer (line 616) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 617) | def __init__(self, hidden_size, dropout, kernel_size=9, num_heads=2):
method forward (line 627) | def forward(self, x, **kwargs):
class TransformerDecoderLayer (line 631) | class TransformerDecoderLayer(nn.Module):
method __init__ (line 632) | def __init__(self, hidden_size, dropout, kernel_size=9, num_heads=2):
method forward (line 642) | def forward(self, x, **kwargs):
method clear_buffer (line 645) | def clear_buffer(self, *args):
method set_buffer (line 648) | def set_buffer(self, *args):
class FFTBlocks (line 652) | class FFTBlocks(nn.Module):
method __init__ (line 653) | def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout...
method forward (line 681) | def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens...
class FastSpeechEncoder (line 709) | class FastSpeechEncoder(FFTBlocks):
method __init__ (line 710) | def __init__(self, dict_size, hidden_size=256, num_layers=4, kernel_si...
method forward (line 721) | def forward(self, txt_tokens, attn_mask=None):
method forward_embedding (line 735) | def forward_embedding(self, txt_tokens):
class FastSpeechDecoder (line 745) | class FastSpeechDecoder(FFTBlocks):
method __init__ (line 746) | def __init__(self, hidden_size=256, num_layers=4, kernel_size=9, num_h...
FILE: NeuralSeq/modules/commons/wavenet.py
function fused_add_tanh_sigmoid_multiply (line 5) | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
class WN (line 14) | class WN(torch.nn.Module):
method __init__ (line 15) | def __init__(self, hidden_size, kernel_size, dilation_rate, n_layers, ...
method forward (line 55) | def forward(self, x, nonpadding=None, cond=None):
method remove_weight_norm (line 90) | def remove_weight_norm(self):
FILE: NeuralSeq/modules/diff/candidate_decoder.py
class SinusoidalPosEmb (line 14) | class SinusoidalPosEmb(nn.Module):
method __init__ (line 15) | def __init__(self, dim):
method forward (line 19) | def forward(self, x):
function Conv1d (line 29) | def Conv1d(*args, **kwargs):
class FFT (line 35) | class FFT(FastspeechDecoder):
method __init__ (line 36) | def __init__(self, hidden_size=None, num_layers=None, kernel_size=None...
method forward (line 50) | def forward(self, spec, diffusion_step, cond, padding_mask=None, attn_...
FILE: NeuralSeq/modules/diff/diffusion.py
function exists (line 19) | def exists(x):
function default (line 23) | def default(val, d):
function cycle (line 29) | def cycle(dl):
function num_to_groups (line 35) | def num_to_groups(num, divisor):
class Residual (line 44) | class Residual(nn.Module):
method __init__ (line 45) | def __init__(self, fn):
method forward (line 49) | def forward(self, x, *args, **kwargs):
class SinusoidalPosEmb (line 53) | class SinusoidalPosEmb(nn.Module):
method __init__ (line 54) | def __init__(self, dim):
method forward (line 58) | def forward(self, x):
class Mish (line 68) | class Mish(nn.Module):
method forward (line 69) | def forward(self, x):
class Upsample (line 73) | class Upsample(nn.Module):
method __init__ (line 74) | def __init__(self, dim):
method forward (line 78) | def forward(self, x):
class Downsample (line 82) | class Downsample(nn.Module):
method __init__ (line 83) | def __init__(self, dim):
method forward (line 87) | def forward(self, x):
class Rezero (line 91) | class Rezero(nn.Module):
method __init__ (line 92) | def __init__(self, fn):
method forward (line 97) | def forward(self, x):
class Block (line 103) | class Block(nn.Module):
method __init__ (line 104) | def __init__(self, dim, dim_out, groups=8):
method forward (line 112) | def forward(self, x):
class ResnetBlock (line 116) | class ResnetBlock(nn.Module):
method __init__ (line 117) | def __init__(self, dim, dim_out, *, time_emb_dim, groups=8):
method forward (line 128) | def forward(self, x, time_emb):
class LinearAttention (line 135) | class LinearAttention(nn.Module):
method __init__ (line 136) | def __init__(self, dim, heads=4, dim_head=32):
method forward (line 143) | def forward(self, x):
function extract (line 156) | def extract(a, t, x_shape):
function noise_like (line 162) | def noise_like(shape, device, repeat=False):
function cosine_beta_schedule (line 168) | def cosine_beta_schedule(timesteps, s=0.008):
class GaussianDiffusion (line 181) | class GaussianDiffusion(nn.Module):
method __init__ (line 182) | def __init__(self, phone_encoder, out_dims, denoise_fn,
method q_mean_variance (line 233) | def q_mean_variance(self, x_start, t):
method predict_start_from_noise (line 239) | def predict_start_from_noise(self, x_t, t, noise):
method q_posterior (line 245) | def q_posterior(self, x_start, x_t, t):
method p_mean_variance (line 254) | def p_mean_variance(self, x, t, cond, clip_denoised: bool):
method p_sample (line 265) | def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
method q_sample (line 273) | def q_sample(self, x_start, t, noise=None):
method p_losses (line 280) | def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
method forward (line 300) | def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
method norm_spec (line 324) | def norm_spec(self, x):
method denorm_spec (line 327) | def denorm_spec(self, x):
method cwt2f0_norm (line 330) | def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
method out2mel (line 333) | def out2mel(self, x):
FILE: NeuralSeq/modules/diff/net.py
class AttrDict (line 16) | class AttrDict(dict):
method __init__ (line 17) | def __init__(self, *args, **kwargs):
method override (line 21) | def override(self, attrs):
class SinusoidalPosEmb (line 32) | class SinusoidalPosEmb(nn.Module):
method __init__ (line 33) | def __init__(self, dim):
method forward (line 37) | def forward(self, x):
function Conv1d (line 47) | def Conv1d(*args, **kwargs):
function silu (line 54) | def silu(x):
class ResidualBlock (line 58) | class ResidualBlock(nn.Module):
method __init__ (line 59) | def __init__(self, encoder_hidden, residual_channels, dilation):
method forward (line 66) | def forward(self, x, conditioner, diffusion_step):
class DiffNet (line 81) | class DiffNet(nn.Module):
method __init__ (line 82) | def __init__(self, in_dims=80):
method forward (line 107) | def forward(self, spec, diffusion_step, cond):
FILE: NeuralSeq/modules/diff/shallow_diffusion_tts.py
function exists (line 20) | def exists(x):
function default (line 24) | def default(val, d):
function extract (line 32) | def extract(a, t, x_shape):
function noise_like (line 38) | def noise_like(shape, device, repeat=False):
function linear_beta_schedule (line 44) | def linear_beta_schedule(timesteps, max_beta=hparams.get('max_beta', 0.0...
function cosine_beta_schedule (line 52) | def cosine_beta_schedule(timesteps, s=0.008):
class GaussianDiffusion (line 71) | class GaussianDiffusion(nn.Module):
method __init__ (line 72) | def __init__(self, phone_encoder, out_dims, denoise_fn,
method q_mean_variance (line 128) | def q_mean_variance(self, x_start, t):
method predict_start_from_noise (line 134) | def predict_start_from_noise(self, x_t, t, noise):
method q_posterior (line 140) | def q_posterior(self, x_start, x_t, t):
method p_mean_variance (line 149) | def p_mean_variance(self, x, t, cond, clip_denoised: bool):
method p_sample (line 160) | def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
method p_sample_plms (line 169) | def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repe...
method q_sample (line 206) | def q_sample(self, x_start, t, noise=None):
method p_losses (line 213) | def p_losses(self, x_start, t, cond, noise=None, nonpadding=None):
method forward (line 233) | def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
method norm_spec (line 279) | def norm_spec(self, x):
method denorm_spec (line 282) | def denorm_spec(self, x):
method cwt2f0_norm (line 285) | def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
method out2mel (line 288) | def out2mel(self, x):
class OfflineGaussianDiffusion (line 292) | class OfflineGaussianDiffusion(GaussianDiffusion):
method forward (line 293) | def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
FILE: NeuralSeq/modules/diffsinger_midi/fs2.py
class FastspeechMIDIEncoder (line 11) | class FastspeechMIDIEncoder(FastspeechEncoder):
method forward_embedding (line 12) | def forward_embedding(self, txt_tokens, midi_embedding, midi_dur_embed...
method forward (line 25) | def forward(self, txt_tokens, midi_embedding, midi_dur_embedding, slur...
class FastSpeech2MIDI (line 46) | class FastSpeech2MIDI(FastSpeech2):
method __init__ (line 47) | def __init__(self, dictionary, out_dims=None):
method forward (line 55) | def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
FILE: NeuralSeq/modules/fastspeech/fs2.py
class FastSpeech2 (line 22) | class FastSpeech2(nn.Module):
method __init__ (line 23) | def __init__(self, dictionary, out_dims=None):
method build_embedding (line 74) | def build_embedding(self, dictionary, embed_dim):
method forward (line 79) | def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
method add_dur (line 140) | def add_dur(self, dur_input, mel2ph, txt_tokens, ret):
method add_energy (line 165) | def add_energy(self, decoder_inp, energy, ret):
method add_pitch (line 174) | def add_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
method run_decoder (line 222) | def run_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
method cwt2f0_norm (line 228) | def cwt2f0_norm(self, cwt_spec, mean, std, mel2ph):
method out2mel (line 235) | def out2mel(self, out):
method mel_norm (line 239) | def mel_norm(x):
method mel_denorm (line 243) | def mel_denorm(x):
method expand_states (line 246) | def expand_states(self, h, mel2ph):
FILE: NeuralSeq/modules/fastspeech/pe.py
class Prenet (line 7) | class Prenet(nn.Module):
method __init__ (line 8) | def __init__(self, in_dim=80, out_dim=256, kernel=5, n_layers=3, strid...
method forward (line 23) | def forward(self, x):
class ConvBlock (line 44) | class ConvBlock(nn.Module):
method __init__ (line 45) | def __init__(self, idim=80, n_chans=256, kernel_size=3, stride=1, norm...
method forward (line 62) | def forward(self, x):
class ConvStacks (line 81) | class ConvStacks(nn.Module):
method __init__ (line 82) | def __init__(self, idim=80, n_layers=5, n_chans=256, odim=32, kernel_s...
method forward (line 98) | def forward(self, x, return_hiddens=False):
class PitchExtractor (line 119) | class PitchExtractor(nn.Module):
method __init__ (line 120) | def __init__(self, n_mel_bins=80, conv_layers=2):
method forward (line 135) | def forward(self, mel_input=None):
FILE: NeuralSeq/modules/fastspeech/tts_modules.py
class TransformerEncoderLayer (line 16) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 17) | def __init__(self, hidden_size, dropout, kernel_size=None, num_heads=2...
method forward (line 30) | def forward(self, x, **kwargs):
class LayerNorm (line 37) | class LayerNorm(torch.nn.LayerNorm):
method __init__ (line 43) | def __init__(self, nout, dim=-1, eps=1e-5):
method forward (line 48) | def forward(self, x):
class DurationPredictor (line 59) | class DurationPredictor(torch.nn.Module):
method __init__ (line 70) | def __init__(self, idim, odims = 1, n_layers=2, n_chans=384, kernel_si...
method _forward (line 98) | def _forward(self, xs, x_masks=None, is_inference=False):
method out2dur (line 114) | def out2dur(self, xs):
method forward (line 125) | def forward(self, xs, x_masks=None):
method inference (line 135) | def inference(self, xs, x_masks=None):
class SyntaDurationPredictor (line 145) | class SyntaDurationPredictor(torch.nn.Module):
method __init__ (line 146) | def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropo...
method forward (line 163) | def forward(self, x, x_padding=None, ph2word=None, graph_lst=None, ety...
class LengthRegulator (line 179) | class LengthRegulator(torch.nn.Module):
method __init__ (line 180) | def __init__(self, pad_value=0.0):
method forward (line 184) | def forward(self, dur, dur_padding=None, alpha=1.0):
class PitchPredictor (line 217) | class PitchPredictor(torch.nn.Module):
method __init__ (line 218) | def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
method forward (line 247) | def forward(self, xs):
class EnergyPredictor (line 263) | class EnergyPredictor(PitchPredictor):
function mel2ph_to_dur (line 267) | def mel2ph_to_dur(mel2ph, T_txt, max_dur=None):
class FFTBlocks (line 276) | class FFTBlocks(nn.Module):
method __init__ (line 277) | def __init__(self, hidden_size, num_layers, ffn_kernel_size=9, dropout...
method forward (line 307) | def forward(self, x, padding_mask=None, attn_mask=None, return_hiddens...
class FastspeechEncoder (line 335) | class FastspeechEncoder(FFTBlocks):
method __init__ (line 336) | def __init__(self, embed_tokens, hidden_size=None, num_layers=None, ke...
method forward (line 352) | def forward(self, txt_tokens):
method forward_embedding (line 365) | def forward_embedding(self, txt_tokens):
class FastspeechDecoder (line 378) | class FastspeechDecoder(FFTBlocks):
method __init__ (line 379) | def __init__(self, hidden_size=None, num_layers=None, kernel_size=None...
FILE: NeuralSeq/modules/hifigan/hifigan.py
function init_weights (line 14) | def init_weights(m, mean=0.0, std=0.01):
function apply_weight_norm (line 20) | def apply_weight_norm(m):
function get_padding (line 26) | def get_padding(kernel_size, dilation=1):
class ResBlock1 (line 30) | class ResBlock1(torch.nn.Module):
method __init__ (line 31) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
method forward (line 54) | def forward(self, x):
method remove_weight_norm (line 63) | def remove_weight_norm(self):
class ResBlock2 (line 70) | class ResBlock2(torch.nn.Module):
method __init__ (line 71) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
method forward (line 82) | def forward(self, x):
method remove_weight_norm (line 89) | def remove_weight_norm(self):
class Conv1d1x1 (line 94) | class Conv1d1x1(Conv1d):
method __init__ (line 97) | def __init__(self, in_channels, out_channels, bias):
class HifiGanGenerator (line 104) | class HifiGanGenerator(torch.nn.Module):
method __init__ (line 105) | def __init__(self, h, c_out=1):
method forward (line 144) | def forward(self, x, f0=None):
method remove_weight_norm (line 171) | def remove_weight_norm(self):
class DiscriminatorP (line 181) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 182) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
method forward (line 202) | def forward(self, x, mel):
class MultiPeriodDiscriminator (line 226) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 227) | def __init__(self, use_cond=False, c_in=1):
method forward (line 237) | def forward(self, y, y_hat, mel=None):
class DiscriminatorS (line 253) | class DiscriminatorS(torch.nn.Module):
method __init__ (line 254) | def __init__(self, use_spectral_norm=False, use_cond=False, upsample_r...
method forward (line 273) | def forward(self, x, mel):
class MultiScaleDiscriminator (line 289) | class MultiScaleDiscriminator(torch.nn.Module):
method __init__ (line 290) | def __init__(self, use_cond=False, c_in=1):
method forward (line 309) | def forward(self, y, y_hat, mel=None):
function feature_loss (line 328) | def feature_loss(fmap_r, fmap_g):
function discriminator_loss (line 337) | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
function cond_discriminator_loss (line 350) | def cond_discriminator_loss(outputs):
function generator_loss (line 359) | def generator_loss(disc_outputs):
FILE: NeuralSeq/modules/hifigan/mel_utils.py
function load_wav (line 10) | def load_wav(full_path):
function dynamic_range_compression (line 15) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
function dynamic_range_decompression (line 19) | def dynamic_range_decompression(x, C=1):
function dynamic_range_compression_torch (line 23) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
function dynamic_range_decompression_torch (line 27) | def dynamic_range_decompression_torch(x, C=1):
function spectral_normalize_torch (line 31) | def spectral_normalize_torch(magnitudes):
function spectral_de_normalize_torch (line 36) | def spectral_de_normalize_torch(magnitudes):
function mel_spectrogram (line 45) | def mel_spectrogram(y, hparams, center=False, complex=False):
FILE: NeuralSeq/modules/parallel_wavegan/layers/causal_conv.py
class CausalConv1d (line 12) | class CausalConv1d(torch.nn.Module):
method __init__ (line 15) | def __init__(self, in_channels, out_channels, kernel_size,
method forward (line 23) | def forward(self, x):
class CausalConvTranspose1d (line 36) | class CausalConvTranspose1d(torch.nn.Module):
method __init__ (line 39) | def __init__(self, in_channels, out_channels, kernel_size, stride, bia...
method forward (line 46) | def forward(self, x):
FILE: NeuralSeq/modules/parallel_wavegan/layers/pqmf.py
function design_prototype_filter (line 15) | def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
class PQMF (line 51) | class PQMF(torch.nn.Module):
method __init__ (line 61) | def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
method analysis (line 105) | def analysis(self, x):
method synthesis (line 118) | def synthesis(self, x):
FILE: NeuralSeq/modules/parallel_wavegan/layers/residual_block.py
class Conv1d (line 15) | class Conv1d(torch.nn.Conv1d):
method __init__ (line 18) | def __init__(self, *args, **kwargs):
method reset_parameters (line 22) | def reset_parameters(self):
class Conv1d1x1 (line 29) | class Conv1d1x1(Conv1d):
method __init__ (line 32) | def __init__(self, in_channels, out_channels, bias):
class ResidualBlock (line 39) | class ResidualBlock(torch.nn.Module):
method __init__ (line 42) | def __init__(self,
method forward (line 91) | def forward(self, x, c):
FILE: NeuralSeq/modules/parallel_wavegan/layers/residual_stack.py
class ResidualStack (line 13) | class ResidualStack(torch.nn.Module):
method __init__ (line 16) | def __init__(self,
method forward (line 65) | def forward(self, c):
FILE: NeuralSeq/modules/parallel_wavegan/layers/tf_layers.py
class TFReflectionPad1d (line 11) | class TFReflectionPad1d(tf.keras.layers.Layer):
method __init__ (line 14) | def __init__(self, padding_size):
method call (line 25) | def call(self, x):
class TFConvTranspose1d (line 38) | class TFConvTranspose1d(tf.keras.layers.Layer):
method __init__ (line 41) | def __init__(self, channels, kernel_size, stride, padding):
method call (line 60) | def call(self, x):
class TFResidualStack (line 74) | class TFResidualStack(tf.keras.layers.Layer):
method __init__ (line 77) | def __init__(self,
method call (line 115) | def call(self, x):
FILE: NeuralSeq/modules/parallel_wavegan/layers/upsample.py
class Stretch2d (line 16) | class Stretch2d(torch.nn.Module):
method __init__ (line 19) | def __init__(self, x_scale, y_scale, mode="nearest"):
method forward (line 33) | def forward(self, x):
class Conv2d (line 47) | class Conv2d(torch.nn.Conv2d):
method __init__ (line 50) | def __init__(self, *args, **kwargs):
method reset_parameters (line 54) | def reset_parameters(self):
class UpsampleNetwork (line 61) | class UpsampleNetwork(torch.nn.Module):
method __init__ (line 64) | def __init__(self,
method forward (line 106) | def forward(self, c):
class ConvInUpsampleNetwork (line 125) | class ConvInUpsampleNetwork(torch.nn.Module):
method __init__ (line 128) | def __init__(self,
method forward (line 167) | def forward(self, c):
FILE: NeuralSeq/modules/parallel_wavegan/losses/stft_loss.py
function stft (line 12) | def stft(x, fft_size, hop_size, win_length, window):
class SpectralConvergengeLoss (line 34) | class SpectralConvergengeLoss(torch.nn.Module):
method __init__ (line 37) | def __init__(self):
method forward (line 41) | def forward(self, x_mag, y_mag):
class LogSTFTMagnitudeLoss (line 55) | class LogSTFTMagnitudeLoss(torch.nn.Module):
method __init__ (line 58) | def __init__(self):
method forward (line 62) | def forward(self, x_mag, y_mag):
class STFTLoss (line 76) | class STFTLoss(torch.nn.Module):
method __init__ (line 79) | def __init__(self, fft_size=1024, shift_size=120, win_length=600, wind...
method forward (line 89) | def forward(self, x, y):
class MultiResolutionSTFTLoss (line 109) | class MultiResolutionSTFTLoss(torch.nn.Module):
method __init__ (line 112) | def __init__(self,
method forward (line 132) | def forward(self, x, y):
FILE: NeuralSeq/modules/parallel_wavegan/models/melgan.py
class MelGANGenerator (line 18) | class MelGANGenerator(torch.nn.Module):
method __init__ (line 21) | def __init__(self,
method forward (line 147) | def forward(self, c):
method remove_weight_norm (line 159) | def remove_weight_norm(self):
method apply_weight_norm (line 170) | def apply_weight_norm(self):
method reset_parameters (line 179) | def reset_parameters(self):
class MelGANDiscriminator (line 194) | class MelGANDiscriminator(torch.nn.Module):
method __init__ (line 197) | def __init__(self,
method forward (line 285) | def forward(self, x):
class MelGANMultiScaleDiscriminator (line 303) | class MelGANMultiScaleDiscriminator(torch.nn.Module):
method __init__ (line 306) | def __init__(self,
method forward (line 378) | def forward(self, x):
method remove_weight_norm (line 395) | def remove_weight_norm(self):
method apply_weight_norm (line 406) | def apply_weight_norm(self):
method reset_parameters (line 415) | def reset_parameters(self):
FILE: NeuralSeq/modules/parallel_wavegan/models/parallel_wavegan.py
class ParallelWaveGANGenerator (line 21) | class ParallelWaveGANGenerator(torch.nn.Module):
method __init__ (line 24) | def __init__(self,
method forward (line 138) | def forward(self, x, c=None, pitch=None, **kwargs):
method remove_weight_norm (line 173) | def remove_weight_norm(self):
method apply_weight_norm (line 184) | def apply_weight_norm(self):
method _get_receptive_field_size (line 194) | def _get_receptive_field_size(layers, stacks, kernel_size,
method receptive_field_size (line 202) | def receptive_field_size(self):
class ParallelWaveGANDiscriminator (line 207) | class ParallelWaveGANDiscriminator(torch.nn.Module):
method __init__ (line 210) | def __init__(self,
method forward (line 268) | def forward(self, x):
method apply_weight_norm (line 282) | def apply_weight_norm(self):
method remove_weight_norm (line 291) | def remove_weight_norm(self):
class ResidualParallelWaveGANDiscriminator (line 303) | class ResidualParallelWaveGANDiscriminator(torch.nn.Module):
method __init__ (line 306) | def __init__(self,
method forward (line 392) | def forward(self, x):
method apply_weight_norm (line 416) | def apply_weight_norm(self):
method remove_weight_norm (line 425) | def remove_weight_norm(self):
FILE: NeuralSeq/modules/parallel_wavegan/models/source.py
class SineGen (line 7) | class SineGen(torch.nn.Module):
method __init__ (line 25) | def __init__(self, samp_rate, harmonic_num=0,
method _f02uv (line 38) | def _f02uv(self, f0):
method _f02sine (line 44) | def _f02sine(self, f0_values):
method forward (line 104) | def forward(self, f0):
method __init__ (line 329) | def __init__(self, samp_rate, harmonic_num=0,
method _f02uv (line 342) | def _f02uv(self, f0):
method _f02sine (line 348) | def _f02sine(self, f0_values):
method forward (line 408) | def forward(self, f0):
class PulseGen (line 140) | class PulseGen(torch.nn.Module):
method __init__ (line 146) | def __init__(self, samp_rate, pulse_amp = 0.1,
method forward (line 158) | def forward(self, f0):
class SignalsConv1d (line 205) | class SignalsConv1d(torch.nn.Module):
method __init__ (line 213) | def __init__(self):
method forward (line 216) | def forward(self, signal, system_ir):
class CyclicNoiseGen_v1 (line 246) | class CyclicNoiseGen_v1(torch.nn.Module):
method __init__ (line 252) | def __init__(self, samp_rate,
method noise_decay (line 264) | def noise_decay(self, beta, f0mean):
method forward (line 288) | def forward(self, f0s, beta):
class SineGen (line 311) | class SineGen(torch.nn.Module):
method __init__ (line 25) | def __init__(self, samp_rate, harmonic_num=0,
method _f02uv (line 38) | def _f02uv(self, f0):
method _f02sine (line 44) | def _f02sine(self, f0_values):
method forward (line 104) | def forward(self, f0):
method __init__ (line 329) | def __init__(self, samp_rate, harmonic_num=0,
method _f02uv (line 342) | def _f02uv(self, f0):
method _f02sine (line 348) | def _f02sine(self, f0_values):
method forward (line 408) | def forward(self, f0):
class SourceModuleCycNoise_v1 (line 444) | class SourceModuleCycNoise_v1(torch.nn.Module):
method __init__ (line 460) | def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
method forward (line 467) | def forward(self, f0_upsamped, beta):
class SourceModuleHnNSF (line 484) | class SourceModuleHnNSF(torch.nn.Module):
method __init__ (line 503) | def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
method forward (line 518) | def forward(self, x):
FILE: NeuralSeq/modules/parallel_wavegan/optimizers/radam.py
class RAdam (line 14) | class RAdam(Optimizer):
method __init__ (line 17) | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weig...
method __setstate__ (line 23) | def __setstate__(self, state):
method step (line 27) | def step(self, closure=None):
FILE: NeuralSeq/modules/parallel_wavegan/stft_loss.py
class STFTLoss (line 13) | class STFTLoss(torch.nn.Module):
method __init__ (line 16) | def __init__(self, fft_size=1024, shift_size=120, win_length=600, wind...
method forward (line 29) | def forward(self, x, y):
class MultiResolutionSTFTLoss (line 55) | class MultiResolutionSTFTLoss(torch.nn.Module):
method __init__ (line 58) | def __init__(self,
method forward (line 79) | def forward(self, x, y):
FILE: NeuralSeq/modules/parallel_wavegan/utils/utils.py
function find_files (line 17) | def find_files(root_dir, query="*.wav", include_root_dir=True):
function read_hdf5 (line 39) | def read_hdf5(hdf5_name, hdf5_path):
function write_hdf5 (line 66) | def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
class HDF5ScpLoader (line 109) | class HDF5ScpLoader(object):
method __init__ (line 131) | def __init__(self, feats_scp, default_hdf5_path="feats"):
method get_path (line 147) | def get_path(self, key):
method __getitem__ (line 151) | def __getitem__(self, key):
method __len__ (line 159) | def __len__(self):
method __iter__ (line 163) | def __iter__(self):
method keys (line 167) | def keys(self):
FILE: NeuralSeq/modules/syntaspeech/multi_window_disc.py
class SingleWindowDisc (line 6) | class SingleWindowDisc(nn.Module):
method __init__ (line 7) | def __init__(self, time_length, freq_length=80, kernel=(3, 3), c_in=1,...
method forward (line 32) | def forward(self, x):
class MultiWindowDiscriminator (line 46) | class MultiWindowDiscriminator(nn.Module):
method __init__ (line 47) | def __init__(self, time_lengths, freq_length=80, kernel=(3, 3), c_in=1...
method forward (line 55) | def forward(self, x, x_len, start_frames_wins=None):
method clip (line 81) | def clip(self, x, x_len, win_length, start_frames=None):
class Discriminator (line 107) | class Discriminator(nn.Module):
method __init__ (line 108) | def __init__(self, time_lengths=[32, 64, 128], freq_length=80, kernel=...
method forward (line 120) | def forward(self, x, start_frames_wins=None):
FILE: NeuralSeq/modules/syntaspeech/syntactic_graph_buider.py
class Sentence2GraphParser (line 7) | class Sentence2GraphParser:
method __init__ (line 8) | def __init__(self, language='zh', use_gpu=False, download=False):
method parse (line 15) | def parse(self, clean_sentence=None, words=None, ph_words=None):
method _parse_zh (line 26) | def _parse_zh(self, words, ph_words, enable_backward_edge=True, enable...
method _parse_en (line 180) | def _parse_en(self, clean_sentence, enable_backward_edge=True, enable_...
function plot_dgl_sentence_graph (line 267) | def plot_dgl_sentence_graph(dgl_graph, labels):
FILE: NeuralSeq/modules/syntaspeech/syntactic_graph_encoder.py
function sequence_mask (line 8) | def sequence_mask(lengths, maxlen, dtype=torch.bool):
function group_hidden_by_segs (line 16) | def group_hidden_by_segs(h, seg_ids, max_len):
class GraphAuxEnc (line 32) | class GraphAuxEnc(nn.Module):
method __init__ (line 33) | def __init__(self, in_dim, hid_dim, out_dim, n_iterations=5, n_edge_ty...
method ph_encoding_to_word_encoding (line 48) | def ph_encoding_to_word_encoding(ph_encoding, ph2word, word_len):
method pad_word_encoding_to_phoneme (line 61) | def pad_word_encoding_to_phoneme(self, word_encoding, ph2word, t_p):
method _process_ph_to_word_encoding (line 65) | def _process_ph_to_word_encoding(ph_encoding, ph2word, word_len=None):
method _postprocess_word2ph (line 85) | def _postprocess_word2ph(word_encoding, ph2word, t_p):
method _repeat_one_sequence (line 92) | def _repeat_one_sequence(x, d, T):
method word_forward (line 103) | def word_forward(self, graph_lst, word_encoding, etypes_lst):
method forward (line 135) | def forward(self, graph_lst, ph_encoding, ph2word, etypes_lst, return_...
FILE: NeuralSeq/modules/syntaspeech/syntaspeech.py
class SinusoidalPosEmb (line 17) | class SinusoidalPosEmb(nn.Module):
method __init__ (line 18) | def __init__(self, dim):
method forward (line 21) | def forward(self, x):
class SyntaSpeech (line 36) | class SyntaSpeech(FastSpeech):
method __init__ (line 37) | def __init__(self, ph_dict_size, word_dict_size, out_dims=None):
method build_embedding (line 117) | def build_embedding(self, dictionary, embed_dim):
method forward (line 122) | def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word...
method run_text_encoder (line 149) | def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len,...
method attention (line 194) | def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos...
method run_decoder (line 211) | def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, gl...
method forward_dur (line 234) | def forward_dur(self, dur_input, mel2word, ret, **kwargs):
method get_pos_embed (line 259) | def get_pos_embed(self, word2word, x2word):
method store_inverse_all (line 265) | def store_inverse_all(self):
FILE: NeuralSeq/tasks/base_task.py
class BaseDataset (line 30) | class BaseDataset(torch.utils.data.Dataset):
method __init__ (line 31) | def __init__(self, shuffle):
method _sizes (line 39) | def _sizes(self):
method __getitem__ (line 42) | def __getitem__(self, index):
method collater (line 45) | def collater(self, samples):
method __len__ (line 48) | def __len__(self):
method num_tokens (line 51) | def num_tokens(self, index):
method size (line 54) | def size(self, index):
method ordered_indices (line 60) | def ordered_indices(self):
method num_workers (line 73) | def num_workers(self):
class BaseTask (line 77) | class BaseTask(nn.Module):
method __init__ (line 78) | def __init__(self, *args, **kwargs):
method build_model (line 106) | def build_model(self):
method load_ckpt (line 109) | def load_ckpt(self, ckpt_base_dir, current_model_name=None, model_name...
method on_epoch_start (line 115) | def on_epoch_start(self):
method _training_step (line 118) | def _training_step(self, sample, batch_idx, optimizer_idx):
method training_step (line 127) | def training_step(self, sample, batch_idx, optimizer_idx=-1):
method optimizer_step (line 157) | def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx):
method on_epoch_end (line 163) | def on_epoch_end(self):
method validation_step (line 169) | def validation_step(self, sample, batch_idx):
method _validation_end (line 178) | def _validation_end(self, outputs):
method validation_end (line 186) | def validation_end(self, outputs):
method build_scheduler (line 196) | def build_scheduler(self, optimizer):
method build_optimizer (line 199) | def build_optimizer(self, model):
method configure_optimizers (line 202) | def configure_optimizers(self):
method test_start (line 207) | def test_start(self):
method test_step (line 210) | def test_step(self, sample, batch_idx):
method test_end (line 213) | def test_end(self, outputs):
method start (line 221) | def start(cls):
method configure_ddp (line 261) | def configure_ddp(self, model, device_ids):
method training_end (line 274) | def training_end(self, *args, **kwargs):
method init_ddp_connection (line 277) | def init_ddp_connection(self, proc_rank, world_size):
method train_dataloader (line 294) | def train_dataloader(self):
method test_dataloader (line 298) | def test_dataloader(self):
method val_dataloader (line 302) | def val_dataloader(self):
method on_load_checkpoint (line 305) | def on_load_checkpoint(self, checkpoint):
method on_save_checkpoint (line 308) | def on_save_checkpoint(self, checkpoint):
method on_sanity_check_start (line 311) | def on_sanity_check_start(self):
method on_train_start (line 314) | def on_train_start(self):
method on_train_end (line 317) | def on_train_end(self):
method on_batch_start (line 320) | def on_batch_start(self, batch):
method on_batch_end (line 323) | def on_batch_end(self):
method on_pre_performance_check (line 326) | def on_pre_performance_check(self):
method on_post_performance_check (line 329) | def on_post_performance_check(self):
method on_before_zero_grad (line 332) | def on_before_zero_grad(self, optimizer):
method on_after_backward (line 335) | def on_after_backward(self):
method backward (line 338) | def backward(self, loss, optimizer):
method grad_norm (line 341) | def grad_norm(self, norm_type):
FILE: NeuralSeq/tasks/run.py
function run_task (line 5) | def run_task():
FILE: NeuralSeq/tasks/svs/diffsinger_task.py
class DiffSingerTask (line 30) | class DiffSingerTask(DiffSpeechTask):
method __init__ (line 31) | def __init__(self):
method build_tts_model (line 40) | def build_tts_model(self):
method validation_step (line 66) | def validation_step(self, sample, batch_idx):
class ShallowDiffusionOfflineDataset (line 102) | class ShallowDiffusionOfflineDataset(FastSpeechDataset):
method __getitem__ (line 103) | def __getitem__(self, index):
method collater (line 114) | def collater(self, samples):
class DiffSingerOfflineTask (line 121) | class DiffSingerOfflineTask(DiffSingerTask):
method __init__ (line 122) | def __init__(self):
method build_tts_model (line 126) | def build_tts_model(self):
method run_model (line 140) | def run_model(self, model, sample, return_output=False, infer=False):
method validation_step (line 172) | def validation_step(self, sample, batch_idx):
method test_step (line 208) | def test_step(self, sample, batch_idx):
class MIDIDataset (line 237) | class MIDIDataset(FastSpeechDataset):
method __getitem__ (line 238) | def __getitem__(self, index):
method collater (line 246) | def collater(self, samples):
class OpencpopDataset (line 254) | class OpencpopDataset(FastSpeechDataset):
method __getitem__ (line 255) | def __getitem__(self, index):
method collater (line 264) | def collater(self, samples):
class DiffSingerMIDITask (line 273) | class DiffSingerMIDITask(DiffSingerTask):
method __init__ (line 274) | def __init__(self):
method run_model (line 279) | def run_model(self, model, sample, return_output=False, infer=False):
method validation_step (line 316) | def validation_step(self, sample, batch_idx):
method add_dur_loss (line 351) | def add_dur_loss(self, dur_pred, mel2ph, txt_tokens, wdb, losses=None):
class AuxDecoderMIDITask (line 392) | class AuxDecoderMIDITask(FastSpeech2Task):
method __init__ (line 393) | def __init__(self):
method build_tts_model (line 398) | def build_tts_model(self):
method run_model (line 404) | def run_model(self, model, sample, return_output=False):
method add_dur_loss (line 435) | def add_dur_loss(self, dur_pred, mel2ph, txt_tokens, wdb, losses=None):
method validation_step (line 475) | def validation_step(self, sample, batch_idx):
FILE: NeuralSeq/tasks/svs/diffspeech_task.py
class DiffSpeechTask (line 17) | class DiffSpeechTask(DiffFsTask):
method __init__ (line 18) | def __init__(self):
method build_tts_model (line 23) | def build_tts_model(self):
method build_optimizer (line 40) | def build_optimizer(self, model):
method run_model (line 48) | def run_model(self, model, sample, return_output=False, infer=False):
method validation_step (line 80) | def validation_step(self, sample, batch_idx):
method plot_wav (line 112) | def plot_wav(self, batch_idx, gt_wav, wav_out, is_mel=False, gt_f0=Non...
FILE: NeuralSeq/tasks/svs/task.py
class DiffFsTask (line 15) | class DiffFsTask(FastSpeech2Task):
method build_tts_model (line 16) | def build_tts_model(self):
method run_model (line 26) | def run_model(self, model, sample, return_output=False, infer=False):
method _training_step (line 56) | def _training_step(self, sample, batch_idx, _):
method validation_step (line 63) | def validation_step(self, sample, batch_idx):
method build_scheduler (line 75) | def build_scheduler(self, optimizer):
method optimizer_step (line 78) | def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx):
FILE: NeuralSeq/tasks/tts/dataset_utils.py
class BaseTTSDataset (line 19) | class BaseTTSDataset(BaseDataset):
method __init__ (line 20) | def __init__(self, prefix, shuffle=False, test_items=None, test_sizes=...
method _get_item (line 52) | def _get_item(self, index):
method __getitem__ (line 59) | def __getitem__(self, index):
method collater (line 82) | def collater(self, samples):
class FastSpeechDataset (line 114) | class FastSpeechDataset(BaseTTSDataset):
method __init__ (line 115) | def __init__(self, prefix, shuffle=False, test_items=None, test_sizes=...
method __getitem__ (line 137) | def __getitem__(self, index):
method collater (line 187) | def collater(self, samples):
method load_test_inputs (line 212) | def load_test_inputs(self):
class FastSpeechWordDataset (line 238) | class FastSpeechWordDataset(FastSpeechDataset):
method __getitem__ (line 239) | def __getitem__(self, index):
method collater (line 249) | def collater(self, samples):
FILE: NeuralSeq/tasks/tts/fs2.py
class FastSpeech2Task (line 27) | class FastSpeech2Task(TtsTask):
method __init__ (line 28) | def __init__(self):
method train_dataloader (line 47) | def train_dataloader(self):
method val_dataloader (line 53) | def val_dataloader(self):
method test_dataloader (line 58) | def test_dataloader(self):
method build_tts_model (line 63) | def build_tts_model(self):
method build_model (line 66) | def build_model(self):
method _training_step (line 73) | def _training_step(self, sample, batch_idx, _):
method validation_step (line 79) | def validation_step(self, sample, batch_idx):
method _validation_end (line 96) | def _validation_end(self, outputs):
method run_model (line 109) | def run_model(self, model, sample, return_output=False):
method add_mel_loss (line 141) | def add_mel_loss(self, mel_out, target, losses, postfix='', mel_mix_lo...
method l1_loss (line 156) | def l1_loss(self, decoder_output, target):
method ssim_loss (line 164) | def ssim_loss(self, decoder_output, target, bias=6.0):
method add_dur_loss (line 175) | def add_dur_loss(self, dur_pred, mel2ph, txt_tokens, losses=None):
method add_pitch_loss (line 219) | def add_pitch_loss(self, output, sample, losses):
method add_f0_loss (line 252) | def add_f0_loss(self, p_pred, f0, uv, losses, nonpadding):
method cwt_loss (line 269) | def cwt_loss(self, cwt_p, cwt_g):
method add_energy_loss (line 277) | def add_energy_loss(self, energy_pred, energy, losses):
method plot_mel (line 287) | def plot_mel(self, batch_idx, spec, spec_out, name=None):
method plot_dur (line 294) | def plot_dur(self, batch_idx, sample, model_out):
method plot_pitch (line 303) | def plot_pitch(self, batch_idx, sample, model_out):
method test_step (line 338) | def test_step(self, sample, batch_idx):
method after_infer (line 369) | def after_infer(self, predictions):
method save_result (line 459) | def save_result(wav_out, mel, prefix, item_name, text, gen_dir, str_ph...
method expand_f0_ph (line 501) | def expand_f0_ph(f0, mel2ph):
FILE: NeuralSeq/tasks/tts/fs2_adv.py
class FastSpeech2AdvTask (line 11) | class FastSpeech2AdvTask(FastSpeech2Task):
method build_model (line 12) | def build_model(self):
method build_disc_model (line 22) | def build_disc_model(self):
method _training_step (line 32) | def _training_step(self, sample, batch_idx, optimizer_idx):
method configure_optimizers (line 95) | def configure_optimizers(self):
method build_scheduler (line 111) | def build_scheduler(self, optimizer):
method on_before_optimization (line 119) | def on_before_optimization(self, opt_idx):
method on_after_optimization (line 125) | def on_after_optimization(self, epoch, batch_idx, optimizer, optimizer...
FILE: NeuralSeq/tasks/tts/fs2_utils.py
class FastSpeechDataset (line 23) | class FastSpeechDataset(BaseDataset):
method __init__ (line 24) | def __init__(self, prefix, shuffle=False):
method _get_item (line 53) | def _get_item(self, index):
method __getitem__ (line 60) | def __getitem__(self, index):
method collater (line 105) | def collater(self, samples):
method load_test_inputs (line 154) | def load_test_inputs(self, test_input_dir, spk_id=0):
FILE: NeuralSeq/tasks/tts/pe.py
class PeDataset (line 18) | class PeDataset(BaseDataset):
method __init__ (line 19) | def __init__(self, prefix, shuffle=False):
method _get_item (line 41) | def _get_item(self, index):
method __getitem__ (line 48) | def __getitem__(self, index):
method collater (line 70) | def collater(self, samples):
class PitchExtractionTask (line 101) | class PitchExtractionTask(FastSpeech2Task):
method __init__ (line 102) | def __init__(self):
method build_tts_model (line 106) | def build_tts_model(self):
method _training_step (line 111) | def _training_step(self, sample, batch_idx, _):
method validation_step (line 117) | def validation_step(self, sample, batch_idx):
method run_model (line 128) | def run_model(self, model, sample, return_output=False, infer=False):
method plot_pitch (line 139) | def plot_pitch(self, batch_idx, model_out, sample):
method add_pitch_loss (line 146) | def add_pitch_loss(self, output, sample, losses):
FILE: NeuralSeq/tasks/tts/ps.py
class PortaSpeechTask (line 18) | class PortaSpeechTask(FastSpeech2Task):
method __init__ (line 19) | def __init__(self):
method build_tts_model (line 24) | def build_tts_model(self):
method on_train_start (line 29) | def on_train_start(self):
method run_model (line 37) | def run_model(self, sample, infer=False, *args, **kwargs):
method add_dur_loss (line 85) | def add_dur_loss(self, dur_pred, mel2token, word_len, txt_tokens, loss...
method validation_step (line 101) | def validation_step(self, sample, batch_idx):
method save_valid_result (line 104) | def save_valid_result(self, sample, batch_idx, model_out):
method get_attn_stats (line 109) | def get_attn_stats(self, attn, sample, logging_outputs, prefix=''):
method get_plot_dur_info (line 127) | def get_plot_dur_info(self, sample, model_out):
method build_optimizer (line 141) | def build_optimizer(self, model):
method build_scheduler (line 149) | def build_scheduler(self, optimizer):
method test_start (line 155) | def test_start(self):
method test_step (line 161) | def test_step(self, sample, batch_idx):
FILE: NeuralSeq/tasks/tts/ps_adv.py
class PortaSpeechAdvTask (line 21) | class PortaSpeechAdvTask(FastSpeech2Task):
method __init__ (line 22) | def __init__(self):
method build_tts_model (line 29) | def build_tts_model(self):
method build_disc_model (line 42) | def build_disc_model(self):
method on_train_start (line 51) | def on_train_start(self):
method _training_step (line 59) | def _training_step(self, sample, batch_idx, optimizer_idx):
method run_model (line 104) | def run_model(self, sample, infer=False, *args, **kwargs):
method add_dur_loss (line 162) | def add_dur_loss(self, dur_pred, mel2token, word_len, txt_tokens, loss...
method validation_step (line 192) | def validation_step(self, sample, batch_idx):
method save_valid_result (line 213) | def save_valid_result(self, sample, batch_idx, model_out):
method get_attn_stats (line 253) | def get_attn_stats(self, attn, sample, logging_outputs, prefix=''):
method get_plot_dur_info (line 271) | def get_plot_dur_info(self, sample, model_out):
method build_optimizer (line 285) | def build_optimizer(self, model):
method build_scheduler (line 301) | def build_scheduler(self, optimizer):
method on_before_optimization (line 308) | def on_before_optimization(self, opt_idx):
method on_after_optimization (line 319) | def on_after_optimization(self, epoch, batch_idx, optimizer, optimizer...
method test_start (line 327) | def test_start(self):
method test_step (line 333) | def test_step(self, sample, batch_idx):
FILE: NeuralSeq/tasks/tts/ps_flow.py
class PortaSpeechFlowTask (line 9) | class PortaSpeechFlowTask(PortaSpeechTask):
method __init__ (line 10) | def __init__(self):
method build_tts_model (line 14) | def build_tts_model(self):
method _training_step (line 19) | def _training_step(self, sample, batch_idx, opt_idx):
method run_model (line 32) | def run_model(self, sample, infer=False, *args, **kwargs):
method validation_step (line 94) | def validation_step(self, sample, batch_idx):
method save_valid_result (line 99) | def save_valid_result(self, sample, batch_idx, model_out):
method build_optimizer (line 113) | def build_optimizer(self, model):
method build_scheduler (line 134) | def build_scheduler(self, optimizer):
FILE: NeuralSeq/tasks/tts/synta.py
class SyntaSpeechTask (line 11) | class SyntaSpeechTask(PortaSpeechAdvTask):
method build_tts_model (line 12) | def build_tts_model(self):
FILE: NeuralSeq/tasks/tts/tts.py
class TtsTask (line 28) | class TtsTask(BaseTask):
method __init__ (line 29) | def __init__(self, *args, **kwargs):
method build_scheduler (line 40) | def build_scheduler(self, optimizer):
method build_optimizer (line 43) | def build_optimizer(self, model):
method build_dataloader (line 49) | def build_dataloader(self, dataset, shuffle, max_tokens=None, max_sent...
method build_phone_encoder (line 95) | def build_phone_encoder(self, data_dir):
method build_optimizer (line 101) | def build_optimizer(self, model):
method test_start (line 107) | def test_start(self):
method test_end (line 115) | def test_end(self, outputs):
method weights_nonzero_speech (line 124) | def weights_nonzero_speech(self, target):
FILE: NeuralSeq/tasks/tts/tts_base.py
class TTSBaseTask (line 34) | class TTSBaseTask(BaseTask):
method __init__ (line 35) | def __init__(self, *args, **kwargs):
method train_dataloader (line 56) | def train_dataloader(self):
method val_dataloader (line 80) | def val_dataloader(self):
method test_dataloader (line 85) | def test_dataloader(self):
method build_dataloader (line 92) | def build_dataloader(self, dataset, shuffle, max_tokens=None, max_sent...
method build_phone_encoder (line 138) | def build_phone_encoder(self, data_dir):
method build_scheduler (line 143) | def build_scheduler(self, optimizer):
method build_optimizer (line 149) | def build_optimizer(self, model):
method plot_mel (line 157) | def plot_mel(self, batch_idx, spec, spec_out, name=None):
method test_start (line 164) | def test_start(self):
method after_infer (line 173) | def after_infer(self, predictions, sil_start_frame=0):
method save_result (line 247) | def save_result(wav_out, mel, base_fn, gen_dir, str_phs=None, mel2ph=N...
method test_end (line 284) | def test_end(self, outputs):
method weights_nonzero_speech (line 294) | def weights_nonzero_speech(self, target):
method make_stop_target (line 300) | def make_stop_target(self, target):
FILE: NeuralSeq/tasks/tts/tts_utils.py
function parse_dataset_configs (line 9) | def parse_dataset_configs():
function parse_mel_losses (line 21) | def parse_mel_losses():
function load_data_preprocessor (line 37) | def load_data_preprocessor():
function load_data_binarizer (line 47) | def load_data_binarizer():
FILE: NeuralSeq/tasks/vocoder/dataset_utils.py
class EndlessDistributedSampler (line 15) | class EndlessDistributedSampler(DistributedSampler):
method __init__ (line 16) | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
method __iter__ (line 42) | def __iter__(self):
method __len__ (line 45) | def __len__(self):
class VocoderDataset (line 49) | class VocoderDataset(BaseDataset):
method __init__ (line 50) | def __init__(self, prefix, shuffle=False):
method _get_item (line 74) | def _get_item(self, index):
method __getitem__ (line 80) | def __getitem__(self, index):
method collater (line 100) | def collater(self, batch):
method _assert_ready_for_upsampling (line 163) | def _assert_ready_for_upsampling(x, c, hop_size, context_window):
method load_test_inputs (line 167) | def load_test_inputs(self, test_input_dir, spk_id=0):
method load_mel_inputs (line 186) | def load_mel_inputs(self, test_input_dir, spk_id=0):
FILE: NeuralSeq/tasks/vocoder/vocoder_base.py
class VocoderBaseTask (line 13) | class VocoderBaseTask(BaseTask):
method __init__ (line 14) | def __init__(self):
method train_dataloader (line 23) | def train_dataloader(self):
method val_dataloader (line 28) | def val_dataloader(self):
method test_dataloader (line 33) | def test_dataloader(self):
method build_dataloader (line 37) | def build_dataloader(self, dataset, shuffle, max_sentences, endless=Fa...
method test_start (line 60) | def test_start(self):
method test_end (line 65) | def test_end(self, outputs):
FILE: NeuralSeq/utils/__init__.py
function tensors_to_scalars (line 17) | def tensors_to_scalars(metrics):
class AvgrageMeter (line 28) | class AvgrageMeter(object):
method __init__ (line 30) | def __init__(self):
method reset (line 33) | def reset(self):
method update (line 38) | def update(self, val, n=1):
function collate_1d (line 44) | def collate_1d(values, pad_idx=0, left_pad=False, shift_right=False, max...
function collate_2d (line 62) | def collate_2d(values, pad_idx=0, left_pad=False, shift_right=False, max...
function _is_batch_full (line 79) | def _is_batch_full(batch, num_tokens, max_tokens, max_sentences):
function batch_by_size (line 89) | def batch_by_size(
function make_positions (line 145) | def make_positions(tensor, padding_idx):
function softmax (line 160) | def softmax(x, dim):
function unpack_dict_to_list (line 164) | def unpack_dict_to_list(samples):
function load_ckpt (line 178) | def load_ckpt(cur_model, ckpt_base_dir, prefix_in_ckpt='model', force=Tr...
function remove_padding (line 212) | def remove_padding(x, padding_idx=0):
class Timer (line 222) | class Timer:
method __init__ (line 225) | def __init__(self, name, print_time=False):
method __enter__ (line 231) | def __enter__(self):
method __exit__ (line 234) | def __exit__(self, exc_type, exc_val, exc_tb):
function print_arch (line 240) | def print_arch(model, model_name='model'):
function num_params (line 245) | def num_params(model, print_out=True, model_name="model"):
FILE: NeuralSeq/utils/audio.py
function save_wav (line 12) | def save_wav(wav, path, sr, norm=False):
function get_hop_size (line 20) | def get_hop_size(hparams):
function _stft (line 29) | def _stft(y, hparams):
function _istft (line 34) | def _istft(y, hparams):
function librosa_pad_lr (line 38) | def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
function amp_to_db (line 51) | def amp_to_db(x):
function normalize (line 55) | def normalize(S, hparams):
function denormalize (line 58) | def denormalize(D, hparams):
function rnnoise (line 60) | def rnnoise(filename, out_fn=None, verbose=False, out_sample_rate=22050):
FILE: NeuralSeq/utils/ckpt_utils.py
function get_last_checkpoint (line 8) | def get_last_checkpoint(work_dir, steps=None):
function get_all_ckpts (line 19) | def get_all_ckpts(work_dir, steps=None):
function load_ckpt (line 28) | def load_ckpt(cur_model, ckpt_base_dir, model_name='model', force=True, ...
FILE: NeuralSeq/utils/cwt.py
function load_wav (line 7) | def load_wav(wav_file, sr):
function convert_continuos_f0 (line 12) | def convert_continuos_f0(f0):
function get_cont_lf0 (line 46) | def get_cont_lf0(f0, frame_period=5.0):
function get_lf0_cwt (line 53) | def get_lf0_cwt(lf0):
function norm_scale (line 72) | def norm_scale(Wavelet_lf0):
function normalize_cwt_lf0 (line 80) | def normalize_cwt_lf0(f0, mean, std):
function get_lf0_cwt_norm (line 89) | def get_lf0_cwt_norm(f0s, mean, std):
function inverse_cwt_torch (line 118) | def inverse_cwt_torch(Wavelet_lf0, scales):
function inverse_cwt (line 127) | def inverse_cwt(Wavelet_lf0, scales):
function cwt2f0 (line 135) | def cwt2f0(cwt_spec, mean, std, cwt_scales):
FILE: NeuralSeq/utils/dtw.py
function dtw (line 6) | def dtw(x, y, dist, warp=1, w=inf, s=1.0):
function accelerated_dtw (line 58) | def accelerated_dtw(x, y, dist, warp=1):
function _traceback (line 100) | def _traceback(D):
FILE: NeuralSeq/utils/hparams.py
class Args (line 9) | class Args:
method __init__ (line 10) | def __init__(self, **kwargs):
function override_config (line 15) | def override_config(old_config: dict, new_config: dict):
function set_hparams (line 23) | def set_hparams(config='', exp_name='', hparams_str='', print_hparams=Tr...
FILE: NeuralSeq/utils/indexed_datasets.py
class IndexedDataset (line 7) | class IndexedDataset:
method __init__ (line 8) | def __init__(self, path, num_cache=1):
method check_index (line 17) | def check_index(self, i):
method __del__ (line 21) | def __del__(self):
method __getitem__ (line 25) | def __getitem__(self, i):
method __len__ (line 38) | def __len__(self):
class IndexedDatasetBuilder (line 41) | class IndexedDatasetBuilder:
method __init__ (line 42) | def __init__(self, path):
method add_item (line 47) | def add_item(self, item):
method finalize (line 52) | def finalize(self):
FILE: NeuralSeq/utils/multiprocess_utils.py
function chunked_worker (line 6) | def chunked_worker(worker_id, map_func, args, results_queue=None, init_c...
function chunked_multiprocess_run (line 19) | def chunked_multiprocess_run(map_func, args, num_workers=None, ordered=T...
function multiprocess_run_tqdm (line 49) | def multiprocess_run_tqdm(map_func, args, num_workers=None, ordered=True...
FILE: NeuralSeq/utils/os_utils.py
function link_file (line 5) | def link_file(from_file, to_file):
function move_file (line 10) | def move_file(from_file, to_file):
function copy_file (line 14) | def copy_file(from_file, to_file):
function remove_file (line 18) | def remove_file(*fns):
FILE: NeuralSeq/utils/pitch_utils.py
function f0_to_coarse (line 22) | def f0_to_coarse(f0):
function norm_f0 (line 34) | def norm_f0(f0, uv, hparams):
function norm_interp_f0 (line 45) | def norm_interp_f0(f0, hparams):
function denorm_f0 (line 63) | def denorm_f0(f0, uv, hparams, pitch_padding=None, min=None, max=None):
FILE: NeuralSeq/utils/pl_utils.py
function get_a_var (line 32) | def get_a_var(obj): # pragma: no cover
function data_loader (line 47) | def data_loader(fn):
function parallel_apply (line 80) | def parallel_apply(modules, inputs, kwargs_tup=None, devices=None): # p...
function _find_tensors (line 166) | def _find_tensors(obj): # pragma: no cover
class DDP (line 179) | class DDP(DistributedDataParallel):
method parallel_apply (line 184) | def parallel_apply(self, replicas, inputs, kwargs):
method forward (line 187) | def forward(self, *inputs, **kwargs): # pragma: no cover
class DP (line 224) | class DP(DataParallel):
method forward (line 229) | def forward(self, *inputs, **kwargs):
method parallel_apply (line 253) | def parallel_apply(self, replicas, inputs, kwargs):
class GradientAccumulationScheduler (line 257) | class GradientAccumulationScheduler:
method __init__ (line 258) | def __init__(self, scheduling: dict):
method on_epoch_begin (line 276) | def on_epoch_begin(self, epoch, trainer):
class LatestModelCheckpoint (line 284) | class LatestModelCheckpoint(ModelCheckpoint):
method __init__ (line 285) | def __init__(self, filepath, monitor='val_loss', verbose=0, num_ckpt_k...
method get_all_ckpts (line 323) | def get_all_ckpts(self):
method on_epoch_end (line 327) | def on_epoch_end(self, epoch, logs=None):
class BaseTrainer (line 354) | class BaseTrainer:
method __init__ (line 355) | def __init__(
method num_gpus (line 458) | def num_gpus(self):
method data_parallel (line 466) | def data_parallel(self):
method get_model (line 469) | def get_model(self):
method fit (line 477) | def fit(self, model):
method init_optimizers (line 492) | def init_optimizers(self, optimizers):
method run_pretrain_routine (line 507) | def run_pretrain_routine(self, model):
method test (line 584) | def test(self, model):
method training_tqdm_dict (line 589) | def training_tqdm_dict(self):
method restore_weights (line 599) | def restore_weights(self, model):
method restore_state_if_checkpoint_exists (line 628) | def restore_state_if_checkpoint_exists(self, model):
method restore (line 661) | def restore(self, checkpoint_path, on_gpu):
method restore_training_state (line 683) | def restore_training_state(self, checkpoint):
method _atomic_save (line 722) | def _atomic_save(self, checkpoint, filepath):
method save_checkpoint (line 739) | def save_checkpoint(self, filepath):
method dump_checkpoint (line 743) | def dump_checkpoint(self):
method copy_trainer_model_properties (line 776) | def copy_trainer_model_properties(self, model):
method transfer_batch_to_gpu (line 792) | def transfer_batch_to_gpu(self, batch, gpu_id):
method set_distributed_mode (line 823) | def set_distributed_mode(self, distributed_backend):
method ddp_train (line 847) | def ddp_train(self, gpu_idx, model):
method resolve_root_node_address (line 905) | def resolve_root_node_address(self, root_node):
method log_metrics (line 917) | def log_metrics(self, metrics, grad_norm_dic, step=None):
method add_tqdm_metrics (line 938) | def add_tqdm_metrics(self, metrics):
method metrics_to_scalars (line 945) | def metrics_to_scalars(self, metrics):
method process_output (line 958) | def process_output(self, output, train=False):
method reduce_distributed_output (line 1050) | def reduce_distributed_output(self, output, num_gpus):
method clip_gradients (line 1074) | def clip_gradients(self):
method print_nan_gradients (line 1079) | def print_nan_gradients(self):
method configure_accumulated_gradients (line 1085) | def configure_accumulated_gradients(self, accumulate_grad_batches):
method get_dataloaders (line 1096) | def get_dataloaders(self, model):
method init_train_dataloader (line 1111) | def init_train_dataloader(self, model):
method init_val_dataloader (line 1127) | def init_val_dataloader(self, model):
method init_test_dataloader (line 1137) | def init_test_dataloader(self, model):
method evaluate (line 1146) | def evaluate(self, model, dataloaders, max_batches, test=False):
method run_evaluation (line 1221) | def run_evaluation(self, test=False):
method evaluation_forward (line 1281) | def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, ...
method train (line 1313) | def train(self):
method run_training_epoch (line 1371) | def run_training_epoch(self):
method run_training_batch (line 1436) | def run_training_batch(self, batch, batch_idx):
method training_forward (line 1564) | def training_forward(self, batch, batch_idx, opt_idx, hiddens):
method is_function_implemented (line 1606) | def is_function_implemented(self, f_name):
method _percent_range_check (line 1611) | def _percent_range_check(self, name):
FILE: NeuralSeq/utils/plot.py
function spec_to_figure (line 8) | def spec_to_figure(spec, vmin=None, vmax=None):
function spec_f0_to_figure (line 16) | def spec_f0_to_figure(spec, f0s, figsize=None):
function dur_to_figure (line 30) | def dur_to_figure(dur_gt, dur_pred, txt):
function f0_to_figure (line 45) | def f0_to_figure(f0_gt, f0_cwt=None, f0_pred=None):
FILE: NeuralSeq/utils/text_encoder.py
function strip_ids (line 28) | def strip_ids(ids, ids_to_strip):
class TextEncoder (line 36) | class TextEncoder(object):
method __init__ (line 39) | def __init__(self, num_reserved_ids=NUM_RESERVED_TOKENS):
method num_reserved_ids (line 43) | def num_reserved_ids(self):
method encode (line 46) | def encode(self, s):
method decode (line 62) | def decode(self, ids, strip_extraneous=False):
method decode_list (line 79) | def decode_list(self, ids):
method vocab_size (line 101) | def vocab_size(self):
class ByteTextEncoder (line 105) | class ByteTextEncoder(TextEncoder):
method encode (line 108) | def encode(self, s):
method decode (line 117) | def decode(self, ids, strip_extraneous=False):
method decode_list (line 133) | def decode_list(self, ids):
method vocab_size (line 146) | def vocab_size(self):
class ByteTextEncoderWithEos (line 150) | class ByteTextEncoderWithEos(ByteTextEncoder):
method encode (line 153) | def encode(self, s):
class TokenTextEncoder (line 157) | class TokenTextEncoder(TextEncoder):
method __init__ (line 160) | def __init__(self,
method encode (line 197) | def encode(self, s):
method decode (line 207) | def decode(self, ids, strip_eos=False, strip_padding=False):
method decode_list (line 216) | def decode_list(self, ids):
method vocab_size (line 221) | def vocab_size(self):
method __len__ (line 224) | def __len__(self):
method _safe_id_to_token (line 227) | def _safe_id_to_token(self, idx):
method _init_vocab_from_file (line 230) | def _init_vocab_from_file(self, filename):
method _init_vocab_from_list (line 245) | def _init_vocab_from_list(self, vocab_list):
method _init_vocab (line 261) | def _init_vocab(self, token_generator, add_reserved_tokens=True):
method pad (line 278) | def pad(self):
method eos (line 281) | def eos(self):
method unk (line 284) | def unk(self):
method seg (line 287) | def seg(self):
method store_to_file (line 290) | def store_to_file(self, filename):
method sil_phonemes (line 303) | def sil_phonemes(self):
FILE: NeuralSeq/utils/text_norm.py
class ChineseChar (line 57) | class ChineseChar(object):
method __init__ (line 65) | def __init__(self, simplified, traditional):
method __str__ (line 70) | def __str__(self):
method __repr__ (line 73) | def __repr__(self):
class ChineseNumberUnit (line 77) | class ChineseNumberUnit(ChineseChar):
method __init__ (line 84) | def __init__(self, power, simplified, traditional, big_s, big_t):
method __str__ (line 90) | def __str__(self):
method create (line 94) | def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small...
class ChineseNumberDigit (line 113) | class ChineseNumberDigit(ChineseChar):
method __init__ (line 118) | def __init__(self, value, simplified, traditional, big_s, big_t, alt_s...
method __str__ (line 126) | def __str__(self):
method create (line 130) | def create(cls, i, v):
class ChineseMath (line 134) | class ChineseMath(ChineseChar):
method __init__ (line 139) | def __init__(self, simplified, traditional, symbol, expression=None):
class NumberSystem (line 150) | class NumberSystem(object):
class MathSymbol (line 157) | class MathSymbol(object):
method __init__ (line 165) | def __init__(self, positive, negative, point):
method __iter__ (line 170) | def __iter__(self):
function create_system (line 191) | def create_system(numbering_type=NUMBERING_TYPES[1]):
function chn2num (line 233) | def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
function num2chn (line 319) | def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
class Cardinal (line 419) | class Cardinal:
method __init__ (line 424) | def __init__(self, cardinal=None, chntext=None):
method chntext2cardinal (line 428) | def chntext2cardinal(self):
method cardinal2chntext (line 431) | def cardinal2chntext(self):
class Digit (line 435) | class Digit:
method __init__ (line 440) | def __init__(self, digit=None, chntext=None):
method digit2chntext (line 447) | def digit2chntext(self):
class TelePhone (line 451) | class TelePhone:
method __init__ (line 456) | def __init__(self, telephone=None, raw_chntext=None, chntext=None):
method telephone2chntext (line 468) | def telephone2chntext(self, fixed=False):
class Fraction (line 485) | class Fraction:
method __init__ (line 490) | def __init__(self, fraction=None, chntext=None):
method chntext2fraction (line 494) | def chntext2fraction(self):
method fraction2chntext (line 498) | def fraction2chntext(self):
class Date (line 503) | class Date:
method __init__ (line 508) | def __init__(self, date=None, chntext=None):
method date2chntext (line 536) | def date2chntext(self):
class Money (line 561) | class Money:
method __init__ (line 566) | def __init__(self, money=None, chntext=None):
method money2chntext (line 573) | def money2chntext(self):
class Percentage (line 584) | class Percentage:
method __init__ (line 589) | def __init__(self, percentage=None, chntext=None):
method chntext2percentage (line 593) | def chntext2percentage(self):
method percentage2chntext (line 596) | def percentage2chntext(self):
class NSWNormalizer (line 603) | class NSWNormalizer:
method __init__ (line 604) | def __init__(self, raw_text):
method _particular (line 608) | def _particular(self):
method normalize (line 619) | def normalize(self, remove_punc=True):
function nsw_test_case (line 712) | def nsw_test_case(raw_text):
function nsw_test (line 718) | def nsw_test():
FILE: NeuralSeq/utils/training_utils.py
class RSQRTSchedule (line 4) | class RSQRTSchedule(object):
method __init__ (line 5) | def __init__(self, optimizer):
method step (line 16) | def step(self, num_updates):
method get_lr (line 26) | def get_lr(self):
FILE: NeuralSeq/utils/tts_utils.py
function make_positions (line 6) | def make_positions(tensor, padding_idx):
function softmax (line 21) | def softmax(x, dim):
function sequence_mask (line 25) | def sequence_mask(lengths, maxlen, dtype=torch.bool):
function _get_full_incremental_state_key (line 36) | def _get_full_incremental_state_key(module_instance, key):
function get_incremental_state (line 48) | def get_incremental_state(module, incremental_state, key):
function set_incremental_state (line 56) | def set_incremental_state(module, incremental_state, key, value):
function fill_with_neg_inf (line 63) | def fill_with_neg_inf(t):
function fill_with_neg_inf2 (line 68) | def fill_with_neg_inf2(t):
function get_focus_rate (line 73) | def get_focus_rate(attn, src_padding_mask=None, tgt_padding_mask=None):
function get_phone_coverage_rate (line 88) | def get_phone_coverage_rate(attn, src_padding_mask=None, src_seg_mask=No...
function get_diagonal_focus_rate (line 108) | def get_diagonal_focus_rate(attn, attn_ks, target_len, src_padding_mask=...
function select_attn (line 146) | def select_attn(attn_logits, type='best'):
function make_pad_mask (line 164) | def make_pad_mask(lengths, xs=None, length_dim=-1):
function make_non_pad_mask (line 270) | def make_non_pad_mask(lengths, xs=None, length_dim=-1):
function get_mask_from_lengths (line 350) | def get_mask_from_lengths(lengths):
function group_hidden_by_segs (line 357) | def group_hidden_by_segs(h, seg_ids, max_len):
function mel2token_to_dur (line 373) | def mel2token_to_dur(mel2token, T_txt=None, max_dur=None):
function expand_word2ph (line 394) | def expand_word2ph(word_encoding, ph2word):
FILE: NeuralSeq/vocoders/base_vocoder.py
function register_vocoder (line 5) | def register_vocoder(cls):
function get_vocoder_cls (line 11) | def get_vocoder_cls(hparams):
class BaseVocoder (line 22) | class BaseVocoder:
method spec2wav (line 23) | def spec2wav(self, mel):
method wav2spec (line 33) | def wav2spec(wav_fn):
FILE: NeuralSeq/vocoders/hifigan.py
function load_model (line 17) | def load_model(config_path, checkpoint_path):
class HifiGAN (line 40) | class HifiGAN(PWG):
method __init__ (line 41) | def __init__(self):
method spec2wav (line 55) | def spec2wav(self, mel, **kwargs):
FILE: NeuralSeq/vocoders/pwg.py
function load_pwg_model (line 16) | def load_pwg_model(config_path, checkpoint_path, stats_path):
class PWG (line 54) | class PWG(BaseVocoder):
method __init__ (line 55) | def __init__(self):
method spec2wav (line 82) | def spec2wav(self, mel, **kwargs):
method wav2spec (line 106) | def wav2spec(wav_fn, return_linear=False):
method wav2mfcc (line 125) | def wav2mfcc(wav_fn):
FILE: NeuralSeq/vocoders/vocoder_utils.py
function denoise (line 7) | def denoise(wav, v=0.1):
FILE: audio-chatgpt.py
function cut_dialogue_history (line 77) | def cut_dialogue_history(history_memory, keep_last_n_words = 500):
function merge_audio (line 92) | def merge_audio(audio_path_1, audio_path_2):
class T2I (line 104) | class T2I:
method __init__ (line 105) | def __init__(self, device):
method inference (line 117) | def inference(self, text):
class ImageCaptioning (line 126) | class ImageCaptioning:
method __init__ (line 127) | def __init__(self, device):
method inference (line 134) | def inference(self, image_path):
class T2A (line 140) | class T2A:
method __init__ (line 141) | def __init__(self, device):
method _initialize_model (line 147) | def _initialize_model(self, config, ckpt, device):
method txt2audio (line 158) | def txt2audio(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_...
method select_best_audio (line 185) | def select_best_audio(self, prompt, wav_list):
method inference (line 201) | def inference(self, text, seed = 55, scale = 1.5, ddim_steps = 100, n_...
class I2A (line 214) | class I2A:
method __init__ (line 215) | def __init__(self, device):
method _initialize_model (line 221) | def _initialize_model(self, config, ckpt, device):
method img2audio (line 232) | def img2audio(self, image, seed = 55, scale = 3, ddim_steps = 100, W =...
method inference (line 262) | def inference(self, image, seed = 55, scale = 3, ddim_steps = 100, W =...
class TTS (line 275) | class TTS:
method __init__ (line 276) | def __init__(self, device=None):
method set_model_hparams (line 286) | def set_model_hparams(self):
method inference (line 290) | def inference(self, text):
class T2S (line 298) | class T2S:
method __init__ (line 299) | def __init__(self, device= None):
method set_model_hparams (line 315) | def set_model_hparams(self):
method inference (line 319) | def inference(self, inputs):
class t2s_VISinger (line 341) | class t2s_VISinger:
method __init__ (line 342) | def __init__(self, device=None):
method inference (line 368) | def inference(self, inputs):
class TTS_OOD (line 383) | class TTS_OOD:
method __init__ (line 384) | def __init__(self, device):
method set_model_hparams (line 395) | def set_model_hparams(self):
method inference (line 405) | def inference(self, inputs):
class Inpaint (line 418) | class Inpaint:
method __init__ (line 419) | def __init__(self, device):
method _initialize_model_inpaint (line 426) | def _initialize_model_inpaint(self, config, ckpt):
method make_batch_sd (line 436) | def make_batch_sd(self, mel, mask, num_samples=1):
method gen_mel (line 452) | def gen_mel(self, input_audio_path):
method gen_mel_audio (line 472) | def gen_mel_audio(self, input_audio):
method show_mel_fn (line 492) | def show_mel_fn(self, input_audio_path):
method inpaint (line 500) | def inpaint(self, batch, seed, ddim_steps, num_samples=1, W=512, H=512):
method inference (line 529) | def inference(self, input_audio, mel_and_mask, seed = 55, ddim_steps =...
class ASR (line 560) | class ASR:
method __init__ (line 561) | def __init__(self, device):
method inference (line 566) | def inference(self, audio_path):
method translate_english (line 574) | def translate_english(self, audio_path):
class A2T (line 578) | class A2T:
method __init__ (line 579) | def __init__(self, device):
method inference (line 584) | def inference(self, audio_path):
class GeneFace (line 589) | class GeneFace:
method __init__ (line 590) | def __init__(self, device=None):
method inference (line 599) | def inference(self, audio_path):
class SoundDetection (line 612) | class SoundDetection:
method __init__ (line 613) | def __init__(self, device):
method inference (line 634) | def inference(self, audio_path):
class SoundExtraction (line 675) | class SoundExtraction:
method __init__ (line 676) | def __init__(self, device):
method inference (line 689) | def inference(self, inputs):
class Binaural (line 713) | class Binaural:
method __init__ (line 714) | def __init__(self, device):
method inference (line 729) | def inference(self, audio_path):
class TargetSoundDetection (line 775) | class TargetSoundDetection:
method __init__ (line 776) | def __init__(self, device):
method extract_feature (line 807) | def extract_feature(self, fname):
method build_clip (line 818) | def build_clip(self, text):
method cal_similarity (line 823) | def cal_similarity(self, target, retrievals):
method inference (line 833) | def inference(self, text, audio_path):
class Speech_Enh_SS_SC (line 957) | class Speech_Enh_SS_SC:
method __init__ (line 963) | def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chi...
method _initialize_model (line 969) | def _initialize_model(self):
method inference (line 989) | def inference(self, speech_path, ref_channel=0):
class Speech_SS (line 1009) | class Speech_SS:
method __init__ (line 1010) | def __init__(self, device="cuda", model_name="lichenda/wsj0_2mix_skim_...
method _initialize_model (line 1016) | def _initialize_model(self):
method inference (line 1036) | def inference(self, speech_path):
class ConversationBot (line 1051) | class ConversationBot:
method __init__ (line 1052) | def __init__(self):
method init_tools (line 1075) | def init_tools(self, interaction_type):
method run_text (line 1197) | def run_text(self, text, state):
method run_image_or_audio (line 1250) | def run_image_or_audio(self, file, state, txt):
method speech (line 1294) | def speech(self, speech_input, state):
method inpainting (line 1351) | def inpainting(self, state, audio_filename, image_filename):
method clear_audio (line 1364) | def clear_audio(self):
method clear_input_audio (line 1366) | def clear_input_audio(self):
method clear_image (line 1368) | def clear_image(self):
method clear_video (line 1370) | def clear_video(self):
method clear_button (line 1372) | def clear_button(self):
FILE: audio_detection/audio_infer/pytorch/evaluate.py
class Evaluator (line 6) | class Evaluator(object):
method __init__ (line 7) | def __init__(self, model):
method evaluate (line 15) | def evaluate(self, data_loader):
FILE: audio_detection/audio_infer/pytorch/finetune_template.py
class Transfer_Cnn14 (line 25) | class Transfer_Cnn14(nn.Module):
method __init__ (line 26) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 46) | def init_weights(self):
method load_from_pretrain (line 49) | def load_from_pretrain(self, pretrained_checkpoint_path):
method forward (line 53) | def forward(self, input, mixup_lambda=None):
function train (line 65) | def train(args):
FILE: audio_detection/audio_infer/pytorch/inference.py
function audio_tagging (line 15) | def audio_tagging(args):
function sound_event_detection (line 80) | def sound_event_detection(args):
FILE: audio_detection/audio_infer/pytorch/losses.py
function clip_bce (line 5) | def clip_bce(output_dict, target_dict):
function get_loss_func (line 12) | def get_loss_func(loss_type):
FILE: audio_detection/audio_infer/pytorch/main.py
function train (line 50) | def train(args):
FILE: audio_detection/audio_infer/pytorch/models.py
function load_checkpoint (line 43) | def load_checkpoint(model,
function init_layer (line 97) | def init_layer(layer):
function init_bn (line 106) | def init_bn(bn):
class TimeShift (line 114) | class TimeShift(nn.Module):
method __init__ (line 115) | def __init__(self, mean, std):
method forward (line 120) | def forward(self, x):
class LinearSoftPool (line 126) | class LinearSoftPool(nn.Module):
method __init__ (line 133) | def __init__(self, pooldim=1):
method forward (line 137) | def forward(self, logits, time_decision):
class PVT (line 141) | class PVT(nn.Module):
method __init__ (line 142) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 195) | def init_weights(self):
method forward (line 199) | def forward(self, input, mixup_lambda=None):
class PVT2 (line 239) | class PVT2(nn.Module):
method __init__ (line 240) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 292) | def init_weights(self):
method forward (line 296) | def forward(self, input, mixup_lambda=None):
class PVT_2layer (line 333) | class PVT_2layer(nn.Module):
method __init__ (line 334) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 387) | def init_weights(self):
method forward (line 391) | def forward(self, input, mixup_lambda=None):
class PVT_lr (line 431) | class PVT_lr(nn.Module):
method __init__ (line 432) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 484) | def init_weights(self):
method forward (line 488) | def forward(self, input, mixup_lambda=None):
class PVT_nopretrain (line 525) | class PVT_nopretrain(nn.Module):
method __init__ (line 526) | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
method init_weights (line 578) | def init_weights(self):
method forward (line 582) | def forward(self, input, mixup_lambda=None):
class Mlp (line 619) | class Mlp(nn.Module):
method __init__ (line 620) | def __init__(self, in_features, hidden_features=None, out_features=Non...
method _init_weights (line 634) | def _init_weights(self, m):
method forward (line 649) | def forward(self, x, H, W):
class Attention (line 661) | class Attention(nn.Module):
method __init__ (line 662) | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, at...
method _init_weights (line 690) | def _init_weights(self, m):
method forward (line 705) | def forward(self, x, H, W):
class Pooling (line 736) | class Pooling(nn.Module):
method __init__ (line 741) | def __init__(self, pool_size=3):
method forward (line 746) | def forward(self, x):
class Block (line 749) | class Block(nn.Module):
method __init__ (line 751) | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_sc...
method _init_weights (line 768) | def _init_weights(self, m):
method forward (line 783) | def forward(self, x, H, W):
class OverlapPatchEmbed (line 789) | class OverlapPatchEmbed(nn.Module):
method __init__ (line 793) | def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, emb...
method _init_weights (line 808) | def _init_weights(self, m):
method forward (line 823) | def forward(self, x):
class PyramidVisionTransformerV2 (line 832) | class PyramidVisionTransformerV2(nn.Module):
method __init__ (line 833) | def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_cha...
method _init_weights (line 871) | def _init_weights(self, m):
method init_weights (line 886) | def init_weights(self, pretrained=None):
method freeze_patch_emb (line 891) | def freeze_patch_emb(self):
method no_weight_decay (line 895) | def no_weight_decay(self):
method get_classifier (line 898) | def get_classifier(self):
method reset_classifier (line 901) | def reset_classifier(self, num_classes, global_pool=''):
method forward_features (line 905) | def forward_features(self, x):
method forward (line 923) | def forward(self, x):
class DWConv (line 929) | class DWConv(nn.Module):
method __init__ (line 930) | def __init__(self, dim=768):
method forward (line 934) | def forward(self, x, H, W):
function _conv_filter (line 943) | def _conv_filter(state_dict, patch_size=16):
FILE: audio_detection/audio_infer/pytorch/pytorch_utils.py
function move_data_to_device (line 7) | def move_data_to_device(x, device):
function do_mixup (line 18) | def do_mixup(x, mixup_lambda):
function append_to_dict (line 34) | def append_to_dict(dict, key, value):
function forward (line 41) | def forward(model, generator, return_input=False,
function interpolate (line 103) | def interpolate(x, ratio):
function pad_framewise_output (line 120) | def pad_framewise_output(framewise_output, frames_num):
function count_parameters (line 140) | def count_parameters(model):
function count_flops (line 144) | def count_flops(model, audio_length):
FILE: audio_detection/audio_infer/utils/crash.py
class ExceptionHook (line 3) | class ExceptionHook:
method __call__ (line 5) | def __call__(self, *args, **kwargs):
FILE: audio_detection/audio_infer/utils/create_black_list.py
function dcase2017task4 (line 8) | def dcase2017task4(args):
FILE: audio_detection/audio_infer/utils/create_indexes.py
function create_indexes (line 16) | def create_indexes(args):
function combine_full_indexes (line 40) | def combine_full_indexes(args):
FILE: audio_detection/audio_infer/utils/data_generator.py
function read_black_list (line 10) | def read_black_list(black_list_csv):
class AudioSetDataset (line 21) | class AudioSetDataset(object):
method __init__ (line 22) | def __init__(self, sample_rate=32000):
method __getitem__ (line 28) | def __getitem__(self, meta):
method resample (line 55) | def resample(self, waveform):
class Base (line 74) | class Base(object):
method __init__ (line 75) | def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, rand...
class TrainSampler (line 109) | class TrainSampler(Base):
method __init__ (line 110) | def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
method __iter__ (line 130) | def __iter__(self):
method state_dict (line 163) | def state_dict(self):
method load_state_dict (line 169) | def load_state_dict(self, state):
class BalancedTrainSampler (line 174) | class BalancedTrainSampler(Base):
method __init__ (line 175) | def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
method expand_queue (line 208) | def expand_queue(self, queue):
method __iter__ (line 214) | def __iter__(self):
method state_dict (line 252) | def state_dict(self):
method load_state_dict (line 259) | def load_state_dict(self, state):
class AlternateTrainSampler (line 265) | class AlternateTrainSampler(Base):
method __init__ (line 266) | def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
method __iter__ (line 286) | def __iter__(self):
method state_dict (line 348) | def state_dict(self):
method load_state_dict (line 354) | def load_state_dict(self, state):
class EvaluateSampler (line 359) | class EvaluateSampler(object):
method __init__ (line 360) | def __init__(self, indexes_hdf5_path, batch_size):
method __iter__ (line 377) | def __iter__(self):
function collate_fn (line 406) | def collate_fn(list_data_dict):
FILE: audio_detection/audio_infer/utils/dataset.py
function split_unbalanced_csv_to_partial_csvs (line 17) | def split_unbalanced_csv_to_partial_csvs(args):
function download_wavs (line 51) | def download_wavs(args):
function pack_waveforms_to_hdf5 (line 125) | def pack_waveforms_to_hdf5(args):
FILE: audio_detection/audio_infer/utils/plot_for_paper.py
function load_statistics (line 16) | def load_statistics(statistics_path):
function crop_label (line 27) | def crop_label(label):
function add_comma (line 42) | def add_comma(integer):
function plot_classwise_iteration_map (line 52) | def plot_classwise_iteration_map(args):
function plot_six_figures (line 92) | def plot_six_figures(args):
function plot_complexity_map (line 298) | def plot_complexity_map(args):
function plot_long_fig (line 345) | def plot_long_fig(args):
function prepare_plot_long_4_rows (line 436) | def prepare_plot_long_4_rows(sorted_lbs):
function _scatter_4_rows (line 523) | def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
function _plot_4_rows (line 531) | def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, labe...
FILE: audio_detection/audio_infer/utils/plot_statistics.py
function _load_metrics0 (line 17) | def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bin...
function _load_metrics0_classwise (line 40) | def _load_metrics0_classwise(filename, sample_rate, window_size, hop_siz...
function _load_metrics0_classwise2 (line 56) | def _load_metrics0_classwise2(filename, sample_rate, window_size, hop_si...
function _load_metrics_classwise (line 76) | def _load_metrics_classwise(filename, sample_rate, window_size, hop_size...
function plot (line 96) | def plot(args):
function plot_for_paper (line 705) | def plot_for_paper(args):
function plot_for_paper2 (line 965) | def plot_for_paper2(args):
function table_values (line 1260) | def table_values(args):
function crop_label (line 1410) | def crop_label(label):
function add_comma (line 1424) | def add_comma(integer):
function plot_class_iteration (line 1432) | def plot_class_iteration(args):
function _load_old_metrics (line 1490) | def _load_old_metrics(workspace, filename, iteration, data_type):
function _sort (line 1510) | def _sort(ys):
function load_data (line 1517) | def load_data(hdf5_path):
function get_avg_stats (line 1524) | def get_avg_stats(workspace, bgn_iter, fin_iter, interval_iter, filename...
function _samples_num_per_class (line 1576) | def _samples_num_per_class():
function get_label_quality (line 1593) | def get_label_quality():
function summary_stats (line 1614) | def summary_stats(args):
function prepare_plot_long_4_rows (line 1679) | def prepare_plot_long_4_rows(sorted_lbs):
function _scatter_4_rows (line 1767) | def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
function _plot_4_rows (line 1774) | def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, labe...
function plot_long_fig (line 1782) | def plot_long_fig(args):
function plot_flops (line 1852) | def plot_flops(args):
function spearman (line 1900) | def spearman(args):
function print_results (line 1924) | def print_results(args):
FILE: audio_detection/audio_infer/utils/utilities.py
function create_folder (line 13) | def create_folder(fd):
function get_filename (line 18) | def get_filename(path):
function get_sub_filepaths (line 25) | def get_sub_filepaths(folder):
function create_logging (line 34) | def create_logging(log_dir, filemode):
function read_metadata (line 59) | def read_metadata(csv_path, classes_num, id_to_ix):
function float32_to_int16 (line 95) | def float32_to_int16(x):
function int16_to_float32 (line 100) | def int16_to_float32(x):
function pad_or_truncate (line 104) | def pad_or_truncate(x, audio_length):
function d_prime (line 112) | def d_prime(auc):
class Mixup (line 117) | class Mixup(object):
method __init__ (line 118) | def __init__(self, mixup_alpha, random_seed=1234):
method get_lambda (line 124) | def get_lambda(self, batch_size):
class StatisticsContainer (line 140) | class StatisticsContainer(object):
method __init__ (line 141) | def __init__(self, statistics_path):
method append (line 152) | def append(self, iteration, statistics, data_type):
method dump (line 156) | def dump(self):
method load_state_dict (line 162) | def load_state_dict(self, resume_iteration):
FILE: audio_detection/target_sound_detection/src/models.py
function load_checkpoint (line 61) | def load_checkpoint(model,
function init_weights (line 124) | def init_weights(m):
function init_layer (line 137) | def init_layer(layer):
function init_bn (line 145) | def init_bn(bn):
class MaxPool (line 150) | class MaxPool(nn.Module):
method __init__ (line 151) | def __init__(self, pooldim=1):
method forward (line 155) | def forward(self, logits, decision):
class LinearSoftPool (line 159) | class LinearSoftPool(nn.Module):
method __init__ (line 167) | def __init__(self, pooldim=1):
method forward (line 171) | def forward(self, logits, time_decision):
class ConvBlock (line 175) | class ConvBlock(nn.Module):
method __init__ (line 176) | def __init__(self, in_channels, out_channels):
method init_weight (line 195) | def init_weight(self):
method forward (line 202) | def forward(self, input, pool_size=(2, 2), pool_type='avg'):
class ConvBlock_GLU (line 220) | class ConvBlock_GLU(nn.Module):
method __init__ (line 221) | def __init__(self, in_channels, out_channels,kernel_size=(3,3)):
method init_weight (line 231) | def init_weight(self):
method forward (line 235) | def forward(self, input, pool_size=(2, 2), pool_type='avg'):
class Mul_scale_GLU (line 258) | class Mul_scale_GLU(nn.Module):
method __init__ (line 259) | def __init__(self):
method forward (line 273) | def forward(self, input, fi=None):
class Cnn14 (line 304) | class Cnn14(nn.Module):
method __init__ (line 305) | def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, ...
method init_weight (line 345) | def init_weight(self):
method forward (line 349) | def forward(self, input_, mixup_lambda=None):
class Cnn10_fi (line 379) | class Cnn10_fi(nn.Module):
method __init__ (line 380) | def __init__(self):
method forward (line 392) | def forward(self, input, fi=None):
class Cnn10_mul_scale (line 422) | class Cnn10_mul_scale(nn.Module):
method __init__ (line 423) | def __init__(self,scale=8):
method forward (line 433) | def forward(self, input, pool_size=(2, 2), pool_type='avg'):
class Cnn10 (line 482) | class Cnn10(nn.Module):
method __init__ (line 483) | def __init__(self,scale=8):
method forward (line 490) | def forward(self, input, pool_size=(2, 2), pool_type='avg'):
class MeanPool (line 523) | class MeanPool(nn.Module):
method __init__ (line 524) | def __init__(self, pooldim=1):
method forward (line 528) | def forward(self, logits, decision):
class ResPool (line 531) | class ResPool(nn.Module):
method __init__ (line 532) | def __init__(self, pooldim=1):
class AutoExpPool (line 537) | class AutoExpPool(nn.Module):
method __init__ (line 538) | def __init__(self, outputdim=10, pooldim=1):
method forward (line 544) | def forward(self, logits, decision):
class SoftPool (line 550) | class SoftPool(nn.Module):
method __init__ (line 551) | def __init__(self, T=1, pooldim=1):
method forward (line 556) | def forward(self, logits, decision):
class AutoPool (line 561) | class AutoPool(nn.Module):
method __init__ (line 563) | def __init__(self, outputdim=10, pooldim=1):
method forward (line 569) | def forward(self, logits, decision):
class ExtAttentionPool (line 575) | class ExtAttentionPool(nn.Module):
method __init__ (line 576) | def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
method forward (line 586) | def forward(self, logits, decision):
class AttentionPool (line 594) | class AttentionPool(nn.Module):
method __init__ (line 596) | def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
method forward (line 605) | def forward(self, logits, decision):
class Block2D (line 614) | class Block2D(nn.Module):
method __init__ (line 615) | def __init__(self, cin, cout, kernel_size=3, padding=1):
method forward (line 626) | def forward(self, x):
class AudioCNN (line 629) | class AudioCNN(nn.Module):
method __init__ (line 630) | def __init__(self, classes_num):
method init_weights (line 640) | def init_weights(self):
method forward (line 643) | def forward(self, input):
method extract (line 660) | def extract(self,input):
function parse_poolingfunction (line 673) | def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
class conv1d (line 698) | class conv1d(nn.Module):
method __init__ (line 699) | def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID'...
method init_layer (line 711) | def init_layer(self, layer, nonlinearity='relu'):
method forward (line 716) | def forward(self, x):
class Atten_1 (line 720) | class Atten_1(nn.Module):
method __init__ (line 721) | def __init__(self, input_dim, context=2, dropout_rate=0.2):
method init_layer (line 731) | def init_layer(self, layer, nonlinearity='leaky_relu'):
method forward (line 738) | def forward(self, input_x):
class Fusion (line 770) | class Fusion(nn.Module):
method __init__ (line 771) | def __init__(self, inputdim, inputdim2, n_fac):
method forward (line 777) | def forward(self,embedding,mix_embed):
class CDur_fusion (line 790) | class CDur_fusion(nn.Module):
method __init__ (line 791) | def __init__(self, inputdim, outputdim, **kwargs):
method forward (line 815) | def forward(self, x, embedding): #
class CDur (line 836) | class CDur(nn.Module):
method __init__ (line 837) | def __init__(self, inputdim, outputdim,time_resolution, **kwargs):
method forward (line 860) | def forward(self, x, embedding,one_hot=None): #
class CDur_big (line 880) | class CDur_big(nn.Module):
method __init__ (line 881) | def __init__(self, inputdim, outputdim, **kwargs):
method forward (line 906) | def forward(self, x, embedding): #
class CDur_GLU (line 926) | class CDur_GLU(nn.Module):
method __init__ (line 927) | def __init__(self, inputdim, outputdim, **kwargs):
method forward (line 940) | def forward(self, x, embedding,one_hot=None): #
class CDur_CNN14 (line 964) | class CDur_CNN14(nn.Module):
method __init__ (line 965) | def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
method forward (line 987) | def forward(self, x, embedding,one_hot=None):
class CDur_CNN_mul_scale (line 1010) | class CDur_CNN_mul_scale(nn.Module):
method __init__ (line 1011) | def __init__(self, inputdim, outputdim,time_resolution,**kwargs):
method forward (line 1033) | def forward(self, x, embedding,one_hot=None):
class CDur_CNN_mul_scale_fusion (line 1058) | class CDur_CNN_mul_scale_fusion(nn.Module):
method __init__ (line 1059) | def __init__(self, inputdim, outputdim, time_resolution,**kwargs):
method forward (line 1082) | def forward(self, x, embedding,one_hot=None):
class RaDur_fusion (line 1109) | class RaDur_fusion(nn.Module):
method __init__ (line 1110) | def __init__(self, model_config, inputdim, outputdim, time_resolution,...
method get_w (line 1132) | def get_w(self,q,k):
method get_w_ee (line 1141) | def get_w_ee(self,q,k):
method attention_pooling (line 1150) | def attention_pooling(self, embeddings, mean_embedding):
method select_topk_embeddings (line 1159) | def select_topk_embeddings(self, scores, embeddings, k):
method sum_with_attention (line 1170) | def sum_with_attention(self, embedding, top_k, selected_embeddings):
method orcal_EE (line 1188) | def orcal_EE(self, x, embedding, label):
method forward (line 1250) | def forward(self, x, ref, label=None):
FILE: audio_detection/target_sound_detection/src/utils.py
function parse_config_or_kwargs (line 23) | def parse_config_or_kwargs(config_file, **kwargs):
function find_contiguous_regions (line 34) | def find_contiguous_regions(activity_array): # in this part, if you cann...
function split_train_cv (line 55) | def split_train_cv(
function pprint_dict (line 95) | def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): ...
function getfile_outlogger (line 108) | def getfile_outlogger(outputfile):
function train_labelencoder (line 116) | def train_labelencoder(labels: pd.Series, sparse=True):
function encode_labels (line 139) | def encode_labels(labels: pd.Series, encoder=None, sparse=True):
function decode_with_timestamps (line 169) | def decode_with_timestamps(events,labels: np.array):
function median_filter (line 189) | def median_filter(x, window_size, threshold=0.5):
function _decode_with_timestamps (line 210) | def _decode_with_timestamps(events,labels):
function inverse_transform_labels (line 222) | def inverse_transform_labels(encoder, pred):
function binarize (line 229) | def binarize(pred, threshold=0.5):
function double_threshold (line 238) | def double_threshold(x, high_thres, low_thres, n_connect=1):
function _double_threshold (line 263) | def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=...
function connect_clusters (line 293) | def connect_clusters(x, n=1):
function connect_clusters_ (line 300) | def connect_clusters_(x, n=1):
function connect_ (line 316) | def connect_(pairs, n=1):
function predictions_to_time (line 338) | def predictions_to_time(df, ratio):
function upgrade_resolution (line 343) | def upgrade_resolution(arr, scale):
FILE: audio_to_text/captioning/models/base_model.py
class CaptionModel (line 11) | class CaptionModel(nn.Module):
method __init__ (line 21) | def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
method check_decoder_compatibility (line 34) | def check_decoder_compatibility(self):
method set_index (line 41) | def set_index(cls, start_idx, end_idx):
method forward (line 45) | def forward(self, input_dict: Dict):
method prepare_output (line 108) | def prepare_output(self, input_dict):
method train_forward (line 127) | def train_forward(self, input_dict):
method seq_forward (line 135) | def seq_forward(self, input_dict):
method train_process (line 138) | def train_process(self, output, input_dict):
method inference_forward (line 141) | def inference_forward(self, input_dict):
method stepwise_forward (line 148) | def stepwise_forward(self, input_dict):
method decode_step (line 168) | def decode_step(self, input_dict, output):
method prepare_decoder_input (line 194) | def prepare_decoder_input(self, input_dict, output):
method stepwise_process_step (line 198) | def stepwise_process_step(self, output, output_t):
method stepwise_process (line 206) | def stepwise_process(self, output):
method sample_next_word (line 210) | def sample_next_word(self, logit, method, temp):
method beam_search (line 250) | def beam_search(self, input_dict):
method prepare_beamsearch_output (line 320) | def prepare_beamsearch_output(self, input_dict):
method beamsearch_step (line 332) | def beamsearch_step(self, input_dict, output_i):
method prepare_beamsearch_decoder_input (line 338) | def prepare_beamsearch_decoder_input(self, input_dict, output_i):
method beamsearch_process_step (line 341) | def beamsearch_process_step(self, output_i, output_t):
method beamsearch_process (line 344) | def beamsearch_process(self, output, output_i, input_dict):
method diverse_beam_search (line 356) | def diverse_beam_search(self, input_dict):
method prepare_dbs_decoder_input (line 466) | def prepare_dbs_decoder_input(self, input_dict, output_i):
method dbs_process_step (line 469) | def dbs_process_step(self, output_i, output_t):
class CaptionSequenceModel (line 473) | class CaptionSequenceModel(nn.Module):
method __init__ (line 475) | def __init__(self, model, seq_output_size):
method forward (line 483) | def forward(self, input_dict):
FILE: audio_to_text/captioning/models/decoder.py
class BaseDecoder (line 13) | class BaseDecoder(nn.Module):
method __init__ (line 20) | def __init__(self, emb_dim, vocab_size, fc_emb_dim,
method forward (line 30) | def forward(self, x):
method load_word_embedding (line 33) | def load_word_embedding(self, weight, freeze=True):
class RnnDecoder (line 46) | class RnnDecoder(BaseDecoder):
method __init__ (line 48) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 59) | def forward(self, x):
method init_hidden (line 62) | def init_hidden(self, bs, device):
class RnnFcDecoder (line 73) | class RnnFcDecoder(RnnDecoder):
method __init__ (line 75) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, drop...
method forward (line 86) | def forward(self, input_dict):
class Seq2SeqAttention (line 110) | class Seq2SeqAttention(nn.Module):
method __init__ (line 112) | def __init__(self, hs_enc, hs_dec, attn_size):
method forward (line 124) | def forward(self, h_dec, h_enc, src_lens):
class AttentionProj (line 151) | class AttentionProj(nn.Module):
method __init__ (line 153) | def __init__(self, hs_enc, hs_dec, embed_dim, attn_size):
method init (line 160) | def init(self, m):
method forward (line 166) | def forward(self, h_dec, h_enc, src_lens):
class BahAttnDecoder (line 195) | class BahAttnDecoder(RnnDecoder):
method __init__ (line 197) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 219) | def forward(self, input_dict):
class BahAttnDecoder2 (line 254) | class BahAttnDecoder2(RnnDecoder):
method __init__ (line 256) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 278) | def forward(self, input_dict):
class ConditionalBahAttnDecoder (line 312) | class ConditionalBahAttnDecoder(RnnDecoder):
method __init__ (line 314) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 336) | def forward(self, input_dict):
class StructBahAttnDecoder (line 375) | class StructBahAttnDecoder(RnnDecoder):
method __init__ (line 377) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, struct_vocab_size,
method forward (line 399) | def forward(self, input_dict):
class StyleBahAttnDecoder (line 436) | class StyleBahAttnDecoder(RnnDecoder):
method __init__ (line 438) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 459) | def forward(self, input_dict):
class BahAttnDecoder3 (line 494) | class BahAttnDecoder3(RnnDecoder):
method __init__ (line 496) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 517) | def forward(self, input_dict):
class SpecificityBahAttnDecoder (line 555) | class SpecificityBahAttnDecoder(RnnDecoder):
method __init__ (line 557) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 578) | def forward(self, input_dict):
class TransformerDecoder (line 614) | class TransformerDecoder(BaseDecoder):
method __init__ (line 616) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim, drop...
method init_params (line 640) | def init_params(self):
method generate_square_subsequent_mask (line 645) | def generate_square_subsequent_mask(self, max_length):
method forward (line 650) | def forward(self, input_dict):
class EventTransformerDecoder (line 678) | class EventTransformerDecoder(TransformerDecoder):
method forward (line 680) | def forward(self, input_dict):
class KeywordProbTransformerDecoder (line 709) | class KeywordProbTransformerDecoder(TransformerDecoder):
method __init__ (line 711) | def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
method forward (line 718) | def forward(self, input_dict):
FILE: audio_to_text/captioning/models/encoder.py
function init_layer (line 16) | def init_layer(layer):
function init_bn (line 25) | def init_bn(bn):
class BaseEncoder (line 31) | class BaseEncoder(nn.Module):
method __init__ (line 39) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
method forward (line 46) | def forward(self, x):
class Block2D (line 58) | class Block2D(nn.Module):
method __init__ (line 60) | def __init__(self, cin, cout, kernel_size=3, padding=1):
method forward (line 71) | def forward(self, x):
class LinearSoftPool (line 75) | class LinearSoftPool(nn.Module):
method __init__ (line 82) | def __init__(self, pooldim=1):
method forward (line 86) | def forward(self, logits, time_decision):
class MeanPool (line 91) | class MeanPool(nn.Module):
method __init__ (line 93) | def __init__(self, pooldim=1):
method forward (line 97) | def forward(self, logits, decision):
class AttentionPool (line 101) | class AttentionPool(nn.Module):
method __init__ (line 103) | def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs):
method forward (line 112) | def forward(self, logits, decision):
class MMPool (line 122) | class MMPool(nn.Module):
method __init__ (line 124) | def __init__(self, dims):
method forward (line 129) | def forward(self, x):
function parse_poolingfunction (line 133) | def parse_poolingfunction(poolingfunction_name='mean', **kwargs):
function embedding_pooling (line 150) | def embedding_pooling(x, lens, pooling="mean"):
class Cdur5Encoder (line 168) | class Cdur5Encoder(BaseEncoder):
method __init__ (line 170) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
method forward (line 195) | def forward(self, input_dict):
function conv_conv_block (line 223) | def conv_conv_block(in_channel, out_channel):
class Cdur8Encoder (line 242) | class Cdur8Encoder(BaseEncoder):
method __init__ (line 244) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, pooling="mean"):
method forward (line 267) | def forward(self, input_dict):
class Cnn10Encoder (line 290) | class Cnn10Encoder(BaseEncoder):
method __init__ (line 292) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
method forward (line 313) | def forward(self, input_dict):
class ConvBlock (line 336) | class ConvBlock(nn.Module):
method __init__ (line 337) | def __init__(self, in_channels, out_channels):
method init_weight (line 356) | def init_weight(self):
method forward (line 363) | def forward(self, input, pool_size=(2, 2), pool_type='avg'):
class Cnn14Encoder (line 382) | class Cnn14Encoder(nn.Module):
method __init__ (line 383) | def __init__(self, sample_rate=32000):
method init_weight (line 422) | def init_weight(self):
method load_pretrained (line 426) | def load_pretrained(self, pretrained):
method forward (line 464) | def forward(self, input_dict):
class RnnEncoder (line 519) | class RnnEncoder(BaseEncoder):
method __init__ (line 521) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim,
method forward (line 543) | def forward(self, input_dict):
class Cnn14RnnEncoder (line 561) | class Cnn14RnnEncoder(nn.Module):
method __init__ (line 562) | def __init__(self, sample_rate=32000, pretrained=None,
method train (line 576) | def train(self, mode):
method forward (line 586) | def forward(self, input_dict):
class TransformerEncoder (line 595) | class TransformerEncoder(BaseEncoder):
method __init__ (line 597) | def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim, d_model, **kw...
method init_params (line 619) | def init_params(self):
method forward (line 624) | def forward(self, input_dict):
class Cnn14TransformerEncoder (line 650) | class Cnn14TransformerEncoder(nn.Module):
method __init__ (line 651) | def __init__(self, sample_rate=32000, pretrained=None,
method train (line 665) | def train(self, mode):
method forward (line 675) | def forward(self, input_dict):
FILE: audio_to_text/captioning/models/transformer_model.py
class TransformerModel (line 11) | class TransformerModel(CaptionModel):
method __init__ (line 13) | def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
method seq_forward (line 20) | def seq_forward(self, input_dict):
method prepare_decoder_input (line 34) | def prepare_decoder_input(self, input_dict, output):
method prepare_beamsearch_decoder_input (line 59) | def prepare_beamsearch_decoder_input(self, input_dict, output_i):
class M2TransformerModel (line 89) | class M2TransformerModel(CaptionModel):
method __init__ (line 91) | def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
method check_encoder_compatibility (line 99) | def check_encoder_compatibility(self):
method seq_forward (line 104) | def seq_forward(self, input_dict):
method prepare_decoder_input (line 115) | def prepare_decoder_input(self, input_dict, output):
method prepare_beamsearch_decoder_input (line 138) | def prepare_beamsearch_decoder_input(self, input_dict, output_i):
class EventEncoder (line 166) | class EventEncoder(nn.Module):
method __init__ (line 170) | def __init__(self, emb_dim, vocab_size=527):
method forward (line 175) | def forward(self, word_idxs):
class EventCondTransformerModel (line 181) | class EventCondTransformerModel(TransformerModel):
method __init__ (line 183) | def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
method prepare_decoder_input (line 207) | def prepare_decoder_input(self, input_dict, output):
method prepare_beamsearch_decoder_input (line 212) | def prepare_beamsearch_decoder_input(self, input_dict, output_i):
class KeywordCondTransformerModel (line 223) | class KeywordCondTransformerModel(TransformerModel):
method __init__ (line 225) | def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
method seq_forward (line 234) | def seq_forward(self, input_dict):
method prepare_decoder_input (line 250) | def prepare_decoder_input(self, input_dict, output):
method prepare_beamsearch_decoder_input (line 255) | def prepare_beamsearch_decoder_input(self, input_dict, output_i):
FILE: audio_to_text/captioning/models/utils.py
function sort_pack_padded_sequence (line 10) | def sort_pack_padded_sequence(input, lengths):
function pad_unsort_packed_sequence (line 17) | def pad_unsort_packed_sequence(input, inv_ix):
function pack_wrapper (line 22) | def pack_wrapper(module, attn_feats, attn_feat_lens):
function generate_length_mask (line 29) | def generate_length_mask(lens, max_length=None):
function mean_with_lens (line 39) | def mean_with_lens(features, lens):
function max_with_lens (line 63) | def max_with_lens(features, lens):
function repeat_tensor (line 76) | def repeat_tensor(x, n):
function init (line 79) | def init(m, method="kaiming"):
class PositionalEncoding (line 113) | class PositionalEncoding(nn.Module):
method __init__ (line 115) | def __init__(self, d_model, dropout=0.1, max_len=100):
method forward (line 129) | def forward(self, x):
FILE: audio_to_text/captioning/utils/bert/create_sent_embedding.py
class EmbeddingExtractor (line 8) | class EmbeddingExtractor(object):
method extract_sentbert (line 10) | def extract_sentbert(self, caption_file: str, output: str, dev: bool=T...
method extract_originbert (line 21) | def extract_originbert(self, caption_file: str, output: str, dev: bool...
method extract (line 27) | def extract(self, caption_file: str, model, output, dev: bool):
method extract_sbert (line 66) | def extract_sbert(self,
FILE: audio_to_text/captioning/utils/bert/create_word_embedding.py
function main (line 15) | def main(vocab_file: str, output: str, server_hostname: str):
FILE: audio_to_text/captioning/utils/build_vocab.py
class Vocabulary (line 10) | class Vocabulary(object):
method __init__ (line 12) | def __init__(self):
method add_word (line 17) | def add_word(self, word):
method __call__ (line 23) | def __call__(self, word):
method __getitem__ (line 28) | def __getitem__(self, word_id):
method __len__ (line 31) | def __len__(self):
function build_vocab (line 35) | def build_vocab(input_json: str,
function process (line 134) | def process(input_json: str,
FILE: audio_to_text/captioning/utils/build_vocab_ltp.py
class Vocabulary (line 9) | class Vocabulary(object):
method __init__ (line 11) | def __init__(self):
method add_word (line 16) | def add_word(self, word):
method __call__ (line 22) | def __call__(self, word):
method __len__ (line 27) | def __len__(self):
function build_vocab (line 30) | def build_vocab(input_json: str,
function process (line 131) | def process(input_json: str,
FILE: audio_to_text/captioning/utils/build_vocab_spacy.py
class Vocabulary (line 9) | class Vocabulary(object):
method __init__ (line 11) | def __init__(self):
method add_word (line 16) | def add_word(self, word):
method __call__ (line 22) | def __call__(self, word):
method __len__ (line 27) | def __len__(self):
function build_vocab (line 31) | def build_vocab(input_json: str,
function process (line 130) | def process(input_json: str,
FILE: audio_to_text/captioning/utils/eval_round_robin.py
function evaluate_annotation (line 8) | def evaluate_annotation(key2refs, scorer):
function evaluate_prediction (line 30) | def evaluate_prediction(key2pred, key2refs, scorer):
class Evaluator (line 52) | class Evaluator(object):
method eval_annotation (line 54) | def eval_annotation(self, annotation, output):
method eval_prediction (line 108) | def eval_prediction(self, prediction, annotation, output):
FILE: audio_to_text/captioning/utils/fasttext/create_word_embedding.py
function create_embedding (line 16) | def create_embedding(caption_file: str,
FILE: audio_to_text/captioning/utils/lr_scheduler.py
class ExponentialDecayScheduler (line 5) | class ExponentialDecayScheduler(torch.optim.lr_scheduler._LRScheduler):
method __init__ (line 7) | def __init__(self, optimizer, total_iters, final_lrs,
method _get_closed_form_lr (line 22) | def _get_closed_form_lr(self):
method get_lr (line 44) | def get_lr(self):
class NoamScheduler (line 48) | class NoamScheduler(torch.optim.lr_scheduler._LRScheduler):
method __init__ (line 50) | def __init__(self, optimizer, model_size=512, factor=1, warmup_iters=3...
method _get_closed_form_lr (line 58) | def _get_closed_form_lr(self):
method get_lr (line 68) | def get_lr(self):
class CosineWithWarmup (line 72) | class CosineWithWarmup(torch.optim.lr_scheduler._LRScheduler):
method __init__ (line 74) | def __init__(self, optimizer, total_iters, warmup_iters,
method lr_lambda (line 81) | def lr_lambda(self, iteration):
method _get_closed_form_lr (line 89) | def _get_closed_form_lr(self):
method get_lr (line 97) | def get_lr(self):
FILE: audio_to_text/captioning/utils/model_eval_diff.py
function coco_score (line 13) | def coco_score(refs, pred, scorer):
function embedding_score (line 40) | def embedding_score(refs, pred, scorer):
function main (line 58) | def main(output_file, eval_caption_file, eval_embedding_file, output, zh...
FILE: audio_to_text/captioning/utils/remove_optimizer.py
function main (line 5) | def main(checkpoint):
FILE: audio_to_text/captioning/utils/tokenize_caption.py
function tokenize_caption (line 7) | def tokenize_caption(input_json: str,
FILE: audio_to_text/captioning/utils/train_util.py
function load_dict_from_csv (line 15) | def load_dict_from_csv(csv, cols):
function init_logger (line 21) | def init_logger(filename, level="INFO"):
function init_obj (line 37) | def init_obj(module, config, **kwargs):# 'captioning.models.encoder'
function pprint_dict (line 43) | def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'):
function merge_a_into_b (line 57) | def merge_a_into_b(a, b):
function load_config (line 69) | def load_config(config_file):
function parse_config_or_kwargs (line 86) | def parse_config_or_kwargs(config_file, **kwargs):
function store_yaml (line 93) | def store_yaml(config, config_file):
class MetricImprover (line 98) | class MetricImprover:
method __init__ (line 100) | def __init__(self, mode):
method compare (line 106) | def compare(self, x, best_x):
method __call__ (line 109) | def __call__(self, x):
method state_dict (line 115) | def state_dict(self):
method load_state_dict (line 118) | def load_state_dict(self, state_dict):
function fix_batchnorm (line 122) | def fix_batchnorm(model: torch.nn.Module):
function load_pretrained_model (line 130) | def load_pretrained_model(model: torch.nn.Module,
class AveragedModel (line 158) | class AveragedModel(torch_average_model):
method update_parameters (line 160) | def update_parameters(self, model):
FILE: audio_to_text/captioning/utils/word2vec/create_word_embedding.py
function create_embedding (line 17) | def create_embedding(vocab_file: str,
FILE: audio_to_text/inference_waveform.py
function load_model (line 12) | def load_model(config, checkpoint):
function decode_caption (line 48) | def decode_caption(word_ids, vocabulary):
class AudioCapModel (line 61) | class AudioCapModel(object):
method __init__ (line 62) | def __init__(self,weight_dir,device='cuda'):
method caption (line 72) | def caption(self,audio_list):
method __call__ (line 98) | def __call__(self, audio_list):
FILE: mono2binaural/src/models.py
class GeometricWarper (line 11) | class GeometricWarper(nn.Module):
method __init__ (line 12) | def __init__(self, sampling_rate=48000):
method _transmitter_mouth (line 16) | def _transmitter_mouth(self, view):
method _3d_displacements (line 31) | def _3d_displacements(self, view):
method _warpfield (line 42) | def _warpfield(self, view, seq_length):
method forward (line 45) | def forward(self, mono, view):
class Warpnet (line 54) | class Warpnet(nn.Module):
method __init__ (line 55) | def __init__(self, layers=4, channels=64, view_dim=7):
method neural_warpfield (line 63) | def neural_warpfield(self, view, seq_length):
method forward (line 72) | def forward(self, mono, view):
class BinauralNetwork (line 86) | class BinauralNetwork(Net):
method __init__ (line 87) | def __init__(self,
method forward (line 98) | def forward(self, mono, view):
FILE: mono2binaural/src/utils.py
class Net (line 14) | class Net(th.nn.Module):
method __init__ (line 16) | def __init__(self, model_name="network", use_cuda=True):
method save (line 21) | def save(self, model_dir, suffix=''):
method load_from_file (line 39) | def load_from_file(self, model_file):
method load (line 54) | def load(self, model_dir, suffix=''):
method num_trainable_parameters (line 66) | def num_trainable_parameters(self):
FILE: mono2binaural/src/warping.py
class TimeWarperFunction (line 14) | class TimeWarperFunction(th.autograd.Function):
method forward (line 17) | def forward(ctx, input, warpfield):
method backward (line 35) | def backward(ctx, grad_output):
class TimeWarper (line 51) | class TimeWarper(nn.Module):
method __init__ (line 53) | def __init__(self):
method _to_absolute_positions (line 57) | def _to_absolute_positions(self, warpfield, seq_length):
method forward (line 63) | def forward(self, input, warpfield):
class MonotoneTimeWarper (line 74) | class MonotoneTimeWarper(TimeWarper):
method forward (line 76) | def forward(self, input, warpfield):
class GeometricTimeWarper (line 91) | class GeometricTimeWarper(TimeWarper):
method __init__ (line 93) | def __init__(self, sampling_rate=48000):
method displacements2warpfield (line 97) | def displacements2warpfield(self, displacements, seq_length):
method forward (line 103) | def forward(self, input, displacements):
FILE: sound_extraction/model/LASSNet.py
class LASSNet (line 7) | class LASSNet(nn.Module):
method __init__ (line 8) | def __init__(self, device='cuda'):
method forward (line 13) | def forward(self, x, caption):
method get_tokenizer (line 24) | def get_tokenizer(self):
FILE: sound_extraction/model/film.py
class Film (line 4) | class Film(nn.Module):
method __init__ (line 5) | def __init__(self, channels, cond_embedding_dim):
method forward (line 14) | def forward(self, data, cond_vec):
FILE: sound_extraction/model/modules.py
class ConvBlock (line 7) | class ConvBlock(nn.Module):
method __init__ (line 8) | def __init__(self, in_channels, out_channels, kernel_size, activation,...
method init_weights (line 40) | def init_weights(self):
method forward (line 46) | def forward(self, x):
class EncoderBlock (line 52) | class EncoderBlock(nn.Module):
method __init__ (line 53) | def __init__(self, in_channels, out_channels, kernel_size, downsample,...
method forward (line 61) | def forward(self, x):
class DecoderBlock (line 67) | class DecoderBlock(nn.Module):
method __init__ (line 68) | def __init__(self, in_channels, out_channels, kernel_size, upsample, a...
method init_weights (line 90) | def init_weights(self):
method prune (line 94) | def prune(self, x):
method forward (line 104) | def forward(self, input_tensor, concat_tensor):
class EncoderBlockRes1B (line 113) | class EncoderBlockRes1B(nn.Module):
method __init__ (line 114) | def __init__(self, in_channels, out_channels, downsample, activation, ...
method forward (line 124) | def forward(self, x):
class DecoderBlockRes1B (line 132) | class DecoderBlockRes1B(nn.Module):
method __init__ (line 133) | def __init__(self, in_channels, out_channels, stride, activation, mome...
method init_weights (line 148) | def init_weights(self):
method prune (line 151) | def prune(self, x, both=False):
method forward (line 158) | def forward(self, input_tensor, concat_tensor,both=False):
class EncoderBlockRes2BCond (line 169) | class EncoderBlockRes2BCond(nn.Module):
method __init__ (line 170) | def __init__(self, in_channels, out_channels, downsample, activation, ...
method forward (line 178) | def forward(self, x, cond_vec):
class DecoderBlockRes2BCond (line 184) | class DecoderBlockRes2BCond(nn.Module):
method __init__ (line 185) | def __init__(self, in_channels, out_channels, stride, activation, mome...
method init_weights (line 198) | def init_weights(self):
method prune (line 201) | def prune(self, x, both=False):
method forward (line 208) | def forward(self, input_tensor, concat_tensor, cond_vec, both=False):
class EncoderBlockRes4BCond (line 216) | class EncoderBlockRes4BCond(nn.Module):
method __init__ (line 217) | def __init__(self, in_channels, out_channels, downsample, activation, ...
method forward (line 227) | def forward(self, x, cond_vec):
class DecoderBlockRes4BCond (line 235) | class DecoderBlockRes4BCond(nn.Module):
method __init__ (line 236) | def __init__(self, in_channels, out_channels, stride, activation, mome...
method init_weights (line 251) | def init_weights(self):
method prune (line 254) | def prune(self, x, both=False):
method forward (line 261) | def forward(self, input_tensor, concat_tensor, cond_vec, both=False):
class EncoderBlockRes4B (line 271) | class EncoderBlockRes4B(nn.Module):
method __init__ (line 272) | def __init__(self, in_channels, out_channels, downsample, activation, ...
method forward (line 282) | def forward(self, x):
class DecoderBlockRes4B (line 290) | class DecoderBlockRes4B(nn.Module):
method __init__ (line 291) | def __init__(self, in_channels, out_channels, stride, activation, mome...
method init_weights (line 306) | def init_weights(self):
method prune (line 309) | def prune(self, x, both=False):
method forward (line 316) | def forward(self, input_tensor, concat_tensor,both=False):
class ConvBlockResCond (line 326) | class ConvBlockResCond(nn.Module):
method __init__ (line 327) | def __init__(self, in_channels, out_channels, kernel_size, activation,...
method init_weights (line 359) | def init_weights(self):
method forward (line 368) | def forward(self, x, cond_vec):
class ConvBlockRes (line 381) | class ConvBlockRes(nn.Module):
method __init__ (line 382) | def __init__(self, in_channels, out_channels, kernel_size, activation,...
method init_weights (line 412) | def init_weights(self):
method forward (line 421) | def forward(self, x):
function init_layer (line 431) | def init_layer(layer):
function init_bn (line 439) | def init_bn(bn):
function init_gru (line 444) | def init_gru(rnn):
function act (line 472) | def act(x, activation):
FILE: sound_extraction/model/resunet_film.py
class UNetRes_FiLM (line 4) | class UNetRes_FiLM(nn.Module):
method __init__ (line 5) | def __init__(self, channels, cond_embedding_dim, nsrc=1):
method init_weights (line 63) | def init_weights(self):
method forward (line 66) | def forward(self, sp, cond_vec, dec_cond_vec):
FILE: sound_extraction/model/text_encoder.py
class Text_Encoder (line 11) | class Text_Encoder(nn.Module):
method __init__ (line 12) | def __init__(self, device):
method tokenize (line 29) | def tokenize(self, caption):
method forward (line 39) | def forward(self, input_ids, attns_mask):
FILE: sound_extraction/utils/create_mixtures.py
function add_noise_and_scale (line 4) | def add_noise_and_scale(front, noise, snr_l=0, snr_h=0, scale_lower=1.0,...
function _random_scale (line 34) | def _random_scale(lower=0.3, upper=0.9):
function _random_noise (line 37) | def _random_noise(clean, noise, snr_l=None, snr_h=None):
function _to_numpy (line 42) | def _to_numpy(wav):
function normalize_energy (line 45) | def normalize_energy(audio, alpha = 1):
function normalize_energy_torch (line 54) | def normalize_energy_torch(audio, alpha = 1):
function unify_energy (line 64) | def unify_energy(*args):
function unify_energy_torch (line 69) | def unify_energy_torch(*args):
function activelev (line 74) | def activelev(*args):
function activelev_torch (line 80) | def activelev_torch(*args):
function uniform_torch (line 90) | def uniform_torch(lower, upper):
FILE: sound_extraction/utils/stft.py
function window_sumsquare (line 10) | def window_sumsquare(window, n_frames, hop_length=512, win_length=1024,
class STFT (line 53) | class STFT(torch.nn.Module):
method __init__ (line 55) | def __init__(self, filter_length=1024, hop_length=512, win_length=1024,
method transform (line 88) | def transform(self, input_data):
method inverse (line 118) | def inverse(self, magnitude, phase):
method forward (line 149) | def forward(self, input_data):
FILE: sound_extraction/utils/wav_io.py
function load_wav (line 7) | def load_wav(path):
function save_wav (line 21) | def save_wav(wav, path):
FILE: text_to_audio/Make_An_Audio/ldm/data/extract_mel_spectrogram.py
class MelSpectrogram (line 15) | class MelSpectrogram(object):
method __init__ (line 16) | def __init__(self, sr, nfft, fmin, fmax, nmels, hoplen, spec_power, in...
method __call__ (line 28) | def __call__(self, x):
class LowerThresh (line 40) | class LowerThresh(object):
method __init__ (line 41) | def __init__(self, min_val, inverse=False):
method __call__ (line 45) | def __call__(self, x):
class Add (line 51) | class Add(object):
method __init__ (line 52) | def __init__(self, val, inverse=False):
method __call__ (line 56) | def __call__(self, x):
class Subtract (line 62) | class Subtract(Add):
method __init__ (line 63) | def __init__(self, val, inverse=False):
method __call__ (line 67) | def __call__(self, x):
class Multiply (line 73) | class Multiply(object):
method __init__ (line 74) | def __init__(self, val, inverse=False) -> None:
method __call__ (line 78) | def __call__(self, x):
class Divide (line 84) | class Divide(Multiply):
method __init__ (line 85) | def __init__(self, val, inverse=False):
method __call__ (line 89) | def __call__(self, x):
class Log10 (line 95) | class Log10(object):
method __init__ (line 96) | def __init__(self, inverse=False):
method __call__ (line 99) | def __call__(self, x):
class Clip (line 105) | class Clip(object):
method __init__ (line 106) | def __init__(self, min_val, max_val, inverse=False):
method __call__ (line 111) | def __call__(self, x):
class TrimSpec (line 117) | class TrimSpec(object):
method __init__ (line 118) | def __init__(self, max_len, inverse=False):
method __call__ (line 122) | def __call__(self, x):
class MaxNorm (line 128) | class MaxNorm(object):
method __init__ (line 129) | def __init__(self, inverse=False):
method __call__ (line 133) | def __call__(self, x):
FILE: text_to_audio/Make_An_Audio/ldm/lr_scheduler.py
class LambdaWarmUpCosineScheduler (line 4) | class LambdaWarmUpCosineScheduler:
method __init__ (line 8) | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_...
method schedule (line 17) | def schedule(self, n, **kwargs):
method __call__ (line 32) | def __call__(self, n, **kwargs):
class LambdaWarmUpCosineScheduler2 (line 36) | class LambdaWarmUpCosineScheduler2:
method __init__ (line 41) | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths...
method find_in_interval (line 52) | def find_in_interval(self, n):
method schedule (line 59) | def schedule(self, n, **kwargs):
method __call__ (line 77) | def __call__(self, n, **kwargs):
class LambdaLinearScheduler (line 81) | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
method schedule (line 83) | def schedule(self, n, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/autoencoder.py
class VQModel (line 14) | class VQModel(pl.LightningModule):
method __init__ (line 15) | def __init__(self,
method ema_scope (line 64) | def ema_scope(self, context=None):
method init_from_ckpt (line 78) | def init_from_ckpt(self, path, ignore_keys=list()):
method on_train_batch_end (line 92) | def on_train_batch_end(self, *args, **kwargs):
method encode (line 96) | def encode(self, x):
method encode_to_prequant (line 102) | def encode_to_prequant(self, x):
method decode (line 107) | def decode(self, quant):
method decode_code (line 112) | def decode_code(self, code_b):
method forward (line 117) | def forward(self, input, return_pred_indices=False):
method get_input (line 124) | def get_input(self, batch, k):
method training_step (line 142) | def training_step(self, batch, batch_idx, optimizer_idx):
method validation_step (line 164) | def validation_step(self, batch, batch_idx):
method _validation_step (line 170) | def _validation_step(self, batch, batch_idx, suffix=""):
method test_step (line 197) | def test_step(self, batch, batch_idx):
method configure_optimizers (line 217) | def configure_optimizers(self):
method get_last_layer (line 250) | def get_last_layer(self):
method log_images (line 253) | def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
method to_rgb (line 275) | def to_rgb(self, x):
class VQModelInterface (line 284) | class VQModelInterface(VQModel):
method __init__ (line 285) | def __init__(self, embed_dim, *args, **kwargs):
method encode (line 289) | def encode(self, x):# VQModel的quantize写在encoder里,VQModelInterface则将其写在...
method decode (line 294) | def decode(self, h, force_not_quantize=False):
class AutoencoderKL (line 305) | class AutoencoderKL(pl.LightningModule):
method __init__ (line 306) | def __init__(self,
method init_from_ckpt (line 334) | def init_from_ckpt(self, path, ignore_keys=list()):
method encode (line 345) | def encode(self, x):
method decode (line 351) | def decode(self, z):
method forward (line 356) | def forward(self, input, sample_posterior=True):
method get_input (line 365) | def get_input(self, batch, k):
method training_step (line 372) | def training_step(self, batch, batch_idx, optimizer_idx):
method validation_step (line 393) | def validation_step(self, batch, batch_idx):
method test_step (line 397) | def test_step(self, batch, batch_idx):
method configure_optimizers (line 417) | def configure_optimizers(self):
method get_last_layer (line 428) | def get_last_layer(self):
method log_images (line 432) | def log_images(self, batch, only_inputs=False,save_dir = 'mel_result_a...
method to_rgb (line 448) | def to_rgb(self, x):
class IdentityFirstStage (line 457) | class IdentityFirstStage(torch.nn.Module):
method __init__ (line 458) | def __init__(self, *args, vq_interface=False, **kwargs):
method encode (line 462) | def encode(self, x, *args, **kwargs):
method decode (line 465) | def decode(self, x, *args, **kwargs):
method quantize (line 468) | def quantize(self, x, *args, **kwargs):
method forward (line 473) | def forward(self, x, *args, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/autoencoder_multi.py
class AutoencoderKL (line 23) | class AutoencoderKL(pl.LightningModule):
method __init__ (line 24) | def __init__(self,
method init_from_ckpt (line 51) | def init_from_ckpt(self, path, ignore_keys=list()):
method encode (line 62) | def encode(self, x):
method decode (line 68) | def decode(self, z):
method forward (line 73) | def forward(self, input, sample_posterior=True):
method get_input (line 82) | def get_input(self, batch, k):
method training_step (line 89) | def training_step(self, batch, batch_idx, optimizer_idx):
method validation_step (line 110) | def validation_step(self, batch, batch_idx):
method test_step (line 124) | def test_step(self, batch, batch_idx):
method configure_optimizers (line 144) | def configure_optimizers(self):
method get_last_layer (line 155) | def get_last_layer(self):
method log_images (line 159) | def log_images(self, batch, only_inputs=False, **kwargs):
method to_rgb (line 175) | def to_rgb(self, x):
class IdentityFirstStage (line 184) | class IdentityFirstStage(torch.nn.Module):
method __init__ (line 185) | def __init__(self, *args, vq_interface=False, **kwargs):
method encode (line 189) | def encode(self, x, *args, **kwargs):
method decode (line 192) | def decode(self, x, *args, **kwargs):
method quantize (line 195) | def quantize(self, x, *args, **kwargs):
method forward (line 200) | def forward(self, x, *args, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/classifier.py
function disabled_train (line 22) | def disabled_train(self, mode=True):
class NoisyLatentImageClassifier (line 28) | class NoisyLatentImageClassifier(pl.LightningModule):
method __init__ (line 30) | def __init__(self,
method init_from_ckpt (line 70) | def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
method load_diffusion (line 88) | def load_diffusion(self):
method load_classifier (line 95) | def load_classifier(self, ckpt_path, pool):
method get_x_noisy (line 110) | def get_x_noisy(self, x, t, noise=None):
method forward (line 120) | def forward(self, x_noisy, t, *args, **kwargs):
method get_input (line 124) | def get_input(self, batch, k):
method get_conditioning (line 133) | def get_conditioning(self, batch, k=None):
method compute_top_k (line 150) | def compute_top_k(self, logits, labels, k, reduction="mean"):
method on_train_epoch_start (line 157) | def on_train_epoch_start(self):
method write_logs (line 162) | def write_logs(self, loss, logits, targets):
method shared_step (line 179) | def shared_step(self, batch, t=None):
method training_step (line 198) | def training_step(self, batch, batch_idx):
method reset_noise_accs (line 202) | def reset_noise_accs(self):
method on_validation_start (line 206) | def on_validation_start(self):
method validation_step (line 210) | def validation_step(self, batch, batch_idx):
method configure_optimizers (line 220) | def configure_optimizers(self):
method log_images (line 238) | def log_images(self, batch, N=8, *args, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/ddim.py
class DDIMSampler (line 12) | class DDIMSampler(object):
method __init__ (line 13) | def __init__(self, model, schedule="linear", **kwargs):
method register_buffer (line 20) | def register_buffer(self, name, attr):
method make_schedule (line 27) | def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddi...
method sample (line 59) | def sample(self,
method ddim_sampling (line 118) | def ddim_sampling(self, cond, shape,
method p_sample_ddim (line 169) | def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_origin...
method stochastic_encode (line 228) | def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
method decode (line 244) | def decode(self, x_latent, cond, t_start, unconditional_guidance_scale...
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/ddpm.py
function disabled_train (line 33) | def disabled_train(self, mode=True):
function uniform_on_device (line 39) | def uniform_on_device(r1, r2, shape, device):
class DDPM (line 43) | class DDPM(pl.LightningModule):
method __init__ (line 45) | def __init__(self,
method register_schedule (line 115) | def register_schedule(self, given_betas=None, beta_schedule="linear", ...
method ema_scope (line 170) | def ema_scope(self, context=None):
method init_from_ckpt (line 184) | def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
method q_mean_variance (line 202) | def q_mean_variance(self, x_start, t):
method predict_start_from_noise (line 214) | def predict_start_from_noise(self, x_t, t, noise):
method q_posterior (line 220) | def q_posterior(self, x_start, x_t, t):
method p_mean_variance (line 229) | def p_mean_variance(self, x, t, clip_denoised: bool):
method p_sample (line 242) | def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
method p_sample_loop (line 251) | def p_sample_loop(self, shape, return_intermediates=False):
method sample (line 266) | def sample(self, batch_size=16, return_intermediates=False):
method q_sample (line 272) | def q_sample(self, x_start, t, noise=None):
method get_loss (line 277) | def get_loss(self, pred, target, mean=True):
method p_losses (line 292) | def p_losses(self, x_start, t, noise=None):
method forward (line 321) | def forward(self, x, *args, **kwargs):
method get_input (line 327) | def get_input(self, batch, k):
method shared_step (line 335) | def shared_step(self, batch):
method training_step (line 340) | def training_step(self, batch, batch_idx):
method validation_step (line 356) | def validation_step(self, batch, batch_idx):
method on_train_batch_end (line 364) | def on_train_batch_end(self, *args, **kwargs):
method _get_rows_from_list (line 368) | def _get_rows_from_list(self, samples):
method log_images (line 376) | def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=Non...
method configure_optimizers (line 413) | def configure_optimizers(self):
class LatentDiffusion (line 422) | class LatentDiffusion(DDPM):
method __init__ (line 424) | def __init__(self,
method make_cond_schedule (line 469) | def make_cond_schedule(self, ):
method on_train_batch_start (line 476) | def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
method register_schedule (line 491) | def register_schedule(self,
method instantiate_first_stage (line 500) | def instantiate_first_stage(self, config):
method instantiate_cond_stage (line 507) | def instantiate_cond_stage(self, config):
method _get_denoise_row_from_list (line 528) | def _get_denoise_row_from_list(self, samples, desc='', force_no_decode...
method get_first_stage_encoding (line 540) | def get_first_stage_encoding(self, encoder_posterior):
method get_learned_conditioning (line 549) | def get_learned_conditioning(self, c):
method meshgrid (line 562) | def meshgrid(self, h, w):
method delta_border (line 569) | def delta_border(self, h, w):
method get_weighting (line 583) | def get_weighting(self, h, w, Ly, Lx, device):
method get_fold_unfold (line 599) | def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo...
method get_input (line 652) | def get_input(self, batch, k, return_first_stage_outputs=False, force_...
method decode_first_stage (line 704) | def decode_first_stage(self, z, predict_cids=False, force_not_quantize...
method differentiable_decode_first_stage (line 764) | def differentiable_decode_first_stage(self, z, predict_cids=False, for...
method encode_first_stage (line 824) | def encode_first_stage(self, x):
method shared_step (line 863) | def shared_step(self, batch, **kwargs):
method forward (line 868) | def forward(self, x, c, *args, **kwargs):
method _rescale_annotations (line 879) | def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: mov...
method apply_model (line 889) | def apply_model(self, x_noisy, t, cond, return_ids=False):
method _predict_eps_from_xstart (line 992) | def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
method _prior_bpd (line 996) | def _prior_bpd(self, x_start):
method p_losses (line 1010) | def p_losses(self, x_start, cond, t, noise=None):
method p_mean_variance (line 1045) | def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codeboo...
method p_sample (line 1077) | def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
method progressive_denoising (line 1108) | def progressive_denoising(self, cond, shape, verbose=True, callback=No...
method p_sample_loop (line 1164) | def p_sample_loop(self, cond, shape, return_intermediates=False,
method sample (line 1215) | def sample(self, cond, batch_size=16, return_intermediates=False, x_T=...
method sample_log (line 1233) | def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
method log_images (line 1249) | def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200,...
method configure_optimizers (line 1359) | def configure_optimizers(self):
method to_rgb (line 1384) | def to_rgb(self, x):
class DiffusionWrapper (line 1393) | class DiffusionWrapper(pl.LightningModule):
method __init__ (line 1394) | def __init__(self, diff_model_config, conditioning_key):
method forward (line 1400) | def forward(self, x, t, c_concat: list = None, c_crossattn: list = None):
class Layout2ImgDiffusion (line 1423) | class Layout2ImgDiffusion(LatentDiffusion):
method __init__ (line 1425) | def __init__(self, cond_stage_key, *args, **kwargs):
method log_images (line 1429) | def log_images(self, batch, N=8, *args, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/ddpm_audio.py
class LatentDiffusion_audio (line 35) | class LatentDiffusion_audio(DDPM):
method __init__ (line 37) | def __init__(self,
method make_cond_schedule (line 86) | def make_cond_schedule(self, ):
method on_train_batch_start (line 93) | def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
method register_schedule (line 108) | def register_schedule(self,
method instantiate_first_stage (line 117) | def instantiate_first_stage(self, config):
method instantiate_cond_stage (line 124) | def instantiate_cond_stage(self, config):
method _get_denoise_row_from_list (line 145) | def _get_denoise_row_from_list(self, samples, desc='', force_no_decode...
method get_first_stage_encoding (line 157) | def get_first_stage_encoding(self, encoder_posterior):
method get_learned_conditioning (line 166) | def get_learned_conditioning(self, c):
method get_unconditional_conditioning (line 181) | def get_unconditional_conditioning(self, batch_size, null_label=None):
method meshgrid (line 205) | def meshgrid(self, h, w):
method delta_border (line 212) | def delta_border(self, h, w):
method get_weighting (line 226) | def get_weighting(self, h, w, Ly, Lx, device):
method get_fold_unfold (line 242) | def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo...
method get_input (line 295) | def get_input(self, batch, k, return_first_stage_outputs=False, force_...
method decode_first_stage (line 352) | def decode_first_stage(self, z, predict_cids=False, force_not_quantize...
method differentiable_decode_first_stage (line 412) | def differentiable_decode_first_stage(self, z, predict_cids=False, for...
method encode_first_stage (line 472) | def encode_first_stage(self, x):
method shared_step (line 511) | def shared_step(self, batch, **kwargs):
method test_step (line 516) | def test_step(self,batch,batch_idx):
method forward (line 540) | def forward(self, x, c, *args, **kwargs):
method _rescale_annotations (line 551) | def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: mov...
method apply_model (line 561) | def apply_model(self, x_noisy, t, cond, return_ids=False):
method _predict_eps_from_xstart (line 664) | def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
method _prior_bpd (line 668) | def _prior_bpd(self, x_start):
method p_losses (line 682) | def p_losses(self, x_start, cond, t, noise=None):
method p_mean_variance (line 717) | def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codeboo...
method p_sample (line 749) | def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
method progressive_denoising (line 780) | def progressive_denoising(self, cond, shape, verbose=True, callback=No...
method p_sample_loop (line 836) | def p_sample_loop(self, cond, shape, return_intermediates=False,
method sample (line 887) | def sample(self, cond, batch_size=16, return_intermediates=False, x_T=...
method sample_log (line 905) | def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
method log_images (line 921) | def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200,...
method configure_optimizers (line 1037) | def configure_optimizers(self):
method to_rgb (line 1062) | def to_rgb(self, x):
class LatentFinetuneDiffusion (line 1071) | class LatentFinetuneDiffusion(LatentDiffusion_audio):
method __init__ (line 1077) | def __init__(self,
method init_from_ckpt (line 1101) | def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
method log_images (line 1133) | def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200,...
class LatentInpaintDiffusion (line 1213) | class LatentInpaintDiffusion(LatentFinetuneDiffusion):
method __init__ (line 1220) | def __init__(self,
method get_input (line 1230) | def get_input(self, batch, k, cond_key=None, bs=None, return_first_sta...
method log_images (line 1258) | def log_images(self, *args, **kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/ddpm_audio_inpaint.py
class LatentDiffusion_audioinpaint (line 34) | class LatentDiffusion_audioinpaint(DDPM):
method __init__ (line 36) | def __init__(self,
method make_cond_schedule (line 90) | def make_cond_schedule(self, ):
method on_train_batch_start (line 97) | def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
method register_schedule (line 112) | def register_schedule(self,
method instantiate_first_stage (line 121) | def instantiate_first_stage(self, config):
method instantiate_cond_stage (line 128) | def instantiate_cond_stage(self, config):
method _get_denoise_row_from_list (line 149) | def _get_denoise_row_from_list(self, samples, desc='', force_no_decode...
method get_first_stage_encoding (line 161) | def get_first_stage_encoding(self, encoder_posterior):# encode_emb fro...
method get_learned_conditioning (line 170) | def get_learned_conditioning(self, c):
method meshgrid (line 183) | def meshgrid(self, h, w):
method delta_border (line 190) | def delta_border(self, h, w):
method get_weighting (line 204) | def get_weighting(self, h, w, Ly, Lx, device):
method get_fold_unfold (line 220) | def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo...
method get_input (line 273) | def get_input(self, batch, k, return_first_stage_outputs=False, force_...
method decode_first_stage (line 348) | def decode_first_stage(self, z, predict_cids=False, force_not_quantize...
method differentiable_decode_first_stage (line 408) | def differentiable_decode_first_stage(self, z, predict_cids=False, for...
method encode_first_stage (line 468) | def encode_first_stage(self, x):
method shared_step (line 507) | def shared_step(self, batch, **kwargs):
method test_step (line 512) | def test_step(self,batch,batch_idx):
method forward (line 546) | def forward(self, x, c, *args, **kwargs):
method _rescale_annotations (line 561) | def _rescale_annotations(self, bboxes, crop_coordinates): # TODO: mov...
method apply_model (line 571) | def apply_model(self, x_noisy, t, cond, return_ids=False):
method _predict_eps_from_xstart (line 682) | def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
method _prior_bpd (line 686) | def _prior_bpd(self, x_start):
method p_losses (line 700) | def p_losses(self, x_start, cond, t, noise=None):
method p_mean_variance (line 735) | def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codeboo...
method p_sample (line 767) | def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
method progressive_denoising (line 798) | def progressive_denoising(self, cond, shape, verbose=True, callback=No...
method p_sample_loop (line 854) | def p_sample_loop(self, cond, shape, return_intermediates=False,
method sample (line 905) | def sample(self, cond, batch_size=16, return_intermediates=False, x_T=...
method sample_log (line 923) | def sample_log(self,cond,batch_size,ddim, ddim_steps,**kwargs):
method log_images (line 937) | def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200,...
method configure_optimizers (line 1049) | def configure_optimizers(self):
method to_rgb (line 1074) | def to_rgb(self, x):
FILE: text_to_audio/Make_An_Audio/ldm/models/diffusion/plms.py
class PLMSSampler (line 11) | class PLMSSampler(object):
method __init__ (line 12) | def __init__(self, model, schedule="linear", **kwargs):
method register_buffer (line 18) | def register_buffer(self, name, attr):
method make_schedule (line 24) | def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddi...
method sample (line 58) | def sample(self,
method plms_sampling (line 115) | def plms_sampling(self, cond, shape,
method p_sample_plms (line 173) | def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_origin...
FILE: text_to_audio/Make_An_Audio/ldm/modules/attention.py
function exists (line 11) | def exists(val):
function uniq (line 15) | def uniq(arr):
function default (line 19) | def default(val, d):
function max_neg_value (line 25) | def max_neg_value(t):
function init_ (line 29) | def init_(tensor):
class GEGLU (line 37) | class GEGLU(nn.Module):
method __init__ (line 38) | def __init__(self, dim_in, dim_out):
method forward (line 42) | def forward(self, x):
class FeedForward (line 47) | class FeedForward(nn.Module):
method __init__ (line 48) | def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
method forward (line 63) | def forward(self, x):
function zero_module (line 67) | def zero_module(module):
function Normalize (line 76) | def Normalize(in_channels):
class LinearAttention (line 80) | class LinearAttention(nn.Module):
method __init__ (line 81) | def __init__(self, dim, heads=4, dim_head=32):
method forward (line 88) | def forward(self, x):
class SpatialSelfAttention (line 99) | class SpatialSelfAttention(nn.Module):
method __init__ (line 100) | def __init__(self, in_channels):
method forward (line 126) | def forward(self, x):
class CrossAttention (line 152) | class CrossAttention(nn.Module):
method __init__ (line 153) | def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, ...
method forward (line 170) | def forward(self, x, context=None, mask=None):# x:(b,h*w,c), context:(...
class BasicTransformerBlock (line 196) | class BasicTransformerBlock(nn.Module):
method __init__ (line 197) | def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None,...
method forward (line 208) | def forward(self, x, context=None):
method _forward (line 211) | def _forward(self, x, context=None):
class SpatialTransformer (line 218) | class SpatialTransformer(nn.Module):
method __init__ (line 226) | def __init__(self, in_channels, n_heads, d_head,
method forward (line 250) | def forward(self, x, context=None):
FILE: text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/custom_openaimodel.py
class UNetModel (line 26) | class UNetModel(nn.Module):
method __init__ (line 56) | def __init__(
method convert_to_fp16 (line 315) | def convert_to_fp16(self):
method convert_to_fp32 (line 323) | def convert_to_fp32(self):
method forward (line 331) | def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
FILE: text_to_audio/Make_An_Audio/ldm/modules/diffusionmodules/model.py
function get_timestep_embedding (line 12) | def get_timestep_embedding(timesteps, embedding_dim):
function nonlinearity (line 33) | def nonlinearity(x):
function Normalize (line 38) | def Normalize(in_channels, num_groups=32):
class Upsample (line 42) | class Upsample(nn.Module):
method __init__ (line 43) | def __init__(self, in_channels, with_conv):
method forward (line 53) | def forward(self, x):
class Downsample (line 60) | class Downsample(nn.Module):
method __init__ (line 61) | def __init__(self, in_channels, with_conv):
method forward (line 72) | def forward(self, x):
class ResnetBlock (line 82) | class ResnetBlock(nn.Module):
method __init__ (line 83) | def __init__(self, *, in_channels, out_channels=None, conv_shortcut=Fa...
method forward (line 121) | def forward(self, x, temb):
class LinAttnBlock (line 144) | class LinAttnBlock(LinearAttention):
method __init__ (line 146) | def __init__(self, in_channels):
class AttnBlock (line 150) | class AttnBlock(nn.Module):
method __init__ (line 151) | def __init__(self, in_channels):
method forward (line 178) | def forward(self, x):
function make_attn (line 205) | def make_attn(in_channels, attn_type="vanilla"):
class Model (line 216) | class Model(nn.Module):
method __init__ (line 217) | def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
method forward (line 316) | def forward(self, x, t=None, context=None):
method get_last_layer (line 364) | def get_last_layer(self):
class Encoder (line 368) | class Encoder(nn.Module):
method __init__ (line 369) | def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
method forward (line 434) | def forward(self, x):
class Decoder (line 462) | class Decoder(nn.Module):
method __init__ (line 463) | def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
method forward (line 535) | def forward(self, z):
class SimpleDecoder (line 571) | class SimpleDecoder(nn.Module):
method __init__ (line 572) | def __init__(self, in_channels, out_channels, *args, **kwargs):
method forward (line 594) | def forward(self, x):
class UpsampleDecoder (line 607) | class UpsampleDecoder(nn.Module):
method __init__ (line 608) | def __init__(self, in_channels, out_channels, ch, num_res_blocks, reso...
method forward (line 641) | def forward(self, x):
class LatentRescaler (line 655) | class LatentRescaler(nn.Module):
method __init__ (line 656) | def __init__(self, factor, in_channels, mid_channels, out_channels, de...
method forward (line 680) | def forward(self, x):
class MergedRescaleEncoder (line 692) | class MergedRescaleEncoder(nn.Module):
method __init__ (line 693) | def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
method forward (line 705) | def forward(self, x):
class MergedRescaleDecoder (line 711) | class MergedRescaleDecoder(nn.Module):
method __init__ (line 712) | def __init__(self, z_channels, out_ch, resolution, num_res_blocks, att...
method forward (line 722) | def forward(self, x):
class Upsampler (line 728) | class Upsampler(nn.M
Copy disabled (too large)
Download .json
Condensed preview — 362 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (15,444K chars).
[
{
"path": ".gitignore",
"chars": 1776,
"preview": "# JetBrains PyCharm IDE\n.idea/\n.github/\n.circleci/\n\n# Byte-compiled / optimized / DLL files\n*__pycache__/\n__pycache__/\n*"
},
{
"path": "LICENSE",
"chars": 0,
"preview": ""
},
{
"path": "NeuralSeq/LICENSE",
"chars": 1067,
"preview": "MIT License\n\nCopyright (c) 2021 Jinglin Liu\n\nPermission is hereby granted, free of charge, to any person obtaining a cop"
},
{
"path": "NeuralSeq/README.md",
"chars": 78,
"preview": "In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, DiffSinger"
},
{
"path": "NeuralSeq/configs/config_base.yaml",
"chars": 712,
"preview": "# task\nbinary_data_dir: ''\nwork_dir: '' # experiment directory.\ninfer: false # infer\nseed: 1234\ndebug: false\nsave_codes:"
},
{
"path": "NeuralSeq/configs/singing/base.yaml",
"chars": 917,
"preview": "base_config:\n - configs/tts/base.yaml\n - configs/tts/base_zh.yaml\n\n\ndatasets: []\ntest_prefixes: []\ntest_num: 0\nvalid_n"
},
{
"path": "NeuralSeq/configs/singing/fs2.yaml",
"chars": 68,
"preview": "base_config:\n - configs/tts/fs2.yaml\n - configs/singing/base.yaml\n"
},
{
"path": "NeuralSeq/configs/tts/base.yaml",
"chars": 1997,
"preview": "# task\nbase_config: configs/config_base.yaml\ntask_cls: ''\n#############\n# dataset\n#############\nraw_data_dir: ''\nprocess"
},
{
"path": "NeuralSeq/configs/tts/base_zh.yaml",
"chars": 93,
"preview": "pre_align_args:\n txt_processor: zh_g2pM\nbinarizer_cls: data_gen.tts.binarizer_zh.ZhBinarizer"
},
{
"path": "NeuralSeq/configs/tts/emotion/base_text2mel.yaml",
"chars": 458,
"preview": "raw_data_dir: 'data/raw/ESD'\nprocessed_data_dir: 'data/processed/emotion'\nbinary_data_dir: 'data/binary/emotion'\npre_ali"
},
{
"path": "NeuralSeq/configs/tts/emotion/pre_align.py",
"chars": 849,
"preview": "import os\n\nfrom data_gen.tts.base_preprocess import BasePreprocessor\nimport glob\nimport re\n\nclass EmoPreAlign(BasePrepro"
},
{
"path": "NeuralSeq/configs/tts/fs2.yaml",
"chars": 1644,
"preview": "base_config: configs/tts/base.yaml\ntask_cls: tasks.tts.fs2.FastSpeech2Task\n\n# model\nhidden_size: 256\ndropout: 0.1\nencode"
},
{
"path": "NeuralSeq/configs/tts/hifigan.yaml",
"chars": 593,
"preview": "base_config: configs/tts/pwg.yaml\ntask_cls: tasks.vocoder.hifigan.HifiGanTask\nresblock: \"1\"\nadam_b1: 0.8\nadam_b2: 0.99\nu"
},
{
"path": "NeuralSeq/configs/tts/libritts/base_text2mel.yaml",
"chars": 362,
"preview": "raw_data_dir: 'data/raw/LibriTTS'\nprocessed_data_dir: 'data/processed/libritts'\nbinary_data_dir: 'data/binary/libritts'\n"
},
{
"path": "NeuralSeq/configs/tts/libritts/fs2.yaml",
"chars": 63,
"preview": "base_config:\n - configs/tts/fs2.yaml\n - ./base_text2mel.yaml\n"
},
{
"path": "NeuralSeq/configs/tts/libritts/pre_align.py",
"chars": 919,
"preview": "import os\n\nfrom data_gen.tts.base_preprocess import BasePreprocessor\nimport glob\n\n\nclass LibrittsPreAlign(BasePreprocess"
},
{
"path": "NeuralSeq/configs/tts/libritts/pwg.yaml",
"chars": 239,
"preview": "base_config: egs/egs_bases/tts/vocoder/pwg.yaml\nraw_data_dir: 'data/raw/LibriTTS'\nprocessed_data_dir: 'data/processed/li"
},
{
"path": "NeuralSeq/configs/tts/lj/base_mel2wav.yaml",
"chars": 128,
"preview": "raw_data_dir: 'data/raw/LJSpeech-1.1'\nprocessed_data_dir: 'data/processed/ljspeech'\nbinary_data_dir: 'data/binary/ljspee"
},
{
"path": "NeuralSeq/configs/tts/lj/base_text2mel.yaml",
"chars": 400,
"preview": "raw_data_dir: 'data/raw/LJSpeech-1.1'\nprocessed_data_dir: 'data/processed/ljspeech'\nbinary_data_dir: 'data/binary/ljspee"
},
{
"path": "NeuralSeq/configs/tts/lj/fs2.yaml",
"chars": 75,
"preview": "base_config:\n - configs/tts/fs2.yaml\n - configs/tts/lj/base_text2mel.yaml"
},
{
"path": "NeuralSeq/configs/tts/lj/hifigan.yaml",
"chars": 78,
"preview": "base_config:\n - configs/tts/hifigan.yaml\n - configs/tts/lj/base_mel2wav.yaml"
},
{
"path": "NeuralSeq/configs/tts/lj/pwg.yaml",
"chars": 74,
"preview": "base_config:\n - configs/tts/pwg.yaml\n - configs/tts/lj/base_mel2wav.yaml"
},
{
"path": "NeuralSeq/configs/tts/pwg.yaml",
"chars": 5186,
"preview": "base_config: configs/tts/base.yaml\ntask_cls: tasks.vocoder.pwg.PwgTask\n\nbinarization_args:\n with_wav: true\n with_spk_e"
},
{
"path": "NeuralSeq/data_gen/tts/base_binarizer.py",
"chars": 9284,
"preview": "import os\nos.environ[\"OMP_NUM_THREADS\"] = \"1\"\n\nfrom utils.multiprocess_utils import chunked_multiprocess_run\nimport rand"
},
{
"path": "NeuralSeq/data_gen/tts/base_binarizer_emotion.py",
"chars": 14996,
"preview": "import os\n\nos.environ[\"OMP_NUM_THREADS\"] = \"1\"\nimport torch\nfrom collections import Counter\nfrom utils.text_encoder impo"
},
{
"path": "NeuralSeq/data_gen/tts/base_preprocess.py",
"chars": 11912,
"preview": "import json\nimport os\nimport random\nimport re\nimport traceback\nfrom collections import Counter\nfrom functools import par"
},
{
"path": "NeuralSeq/data_gen/tts/binarizer_zh.py",
"chars": 2202,
"preview": "import os\n\nos.environ[\"OMP_NUM_THREADS\"] = \"1\"\n\nfrom data_gen.tts.txt_processors.zh_g2pM import ALL_SHENMU\nfrom data_gen"
},
{
"path": "NeuralSeq/data_gen/tts/data_gen_utils.py",
"chars": 13412,
"preview": "import warnings\n\nwarnings.filterwarnings(\"ignore\")\n\nimport parselmouth\nimport os\nimport torch\nfrom skimage.transform imp"
},
{
"path": "NeuralSeq/data_gen/tts/emotion/audio.py",
"chars": 4358,
"preview": "from scipy.ndimage.morphology import binary_dilation\nfrom data_gen.tts.emotion.params_data import *\nfrom pathlib import "
},
{
"path": "NeuralSeq/data_gen/tts/emotion/inference.py",
"chars": 7885,
"preview": "from data_gen.tts.emotion.params_data import *\nfrom data_gen.tts.emotion.model import EmotionEncoder\nfrom data_gen.tts.e"
},
{
"path": "NeuralSeq/data_gen/tts/emotion/model.py",
"chars": 3114,
"preview": "\nfrom data_gen.tts.emotion.params_model import *\nfrom data_gen.tts.emotion.params_data import *\nfrom torch.nn.utils impo"
},
{
"path": "NeuralSeq/data_gen/tts/emotion/params_data.py",
"chars": 901,
"preview": "\n## Mel-filterbank\nmel_window_length = 25 # In milliseconds\nmel_window_step = 10 # In milliseconds\nmel_n_channels = "
},
{
"path": "NeuralSeq/data_gen/tts/emotion/params_model.py",
"chars": 195,
"preview": "\n## Model parameters\nmodel_hidden_size = 256\nmodel_embedding_size = 256\nmodel_num_layers = 3\n\n\n## Training parameters\nle"
},
{
"path": "NeuralSeq/data_gen/tts/emotion/test_emotion.py",
"chars": 5703,
"preview": "#!/usr/bin/env python3 -u\n# Copyright (c) Facebook, Inc. and its affiliates.\n#\n# This source code is licensed under the "
},
{
"path": "NeuralSeq/data_gen/tts/txt_processors/__init__.py",
"chars": 16,
"preview": "from . import en"
},
{
"path": "NeuralSeq/data_gen/tts/txt_processors/base_text_processor.py",
"chars": 1466,
"preview": "from data_gen.tts.data_gen_utils import is_sil_phoneme\n\nREGISTERED_TEXT_PROCESSORS = {}\n\ndef register_txt_processors(nam"
},
{
"path": "NeuralSeq/data_gen/tts/txt_processors/en.py",
"chars": 2552,
"preview": "import re\nimport unicodedata\n\nfrom g2p_en import G2p\nfrom g2p_en.expand import normalize_numbers\nfrom nltk import pos_ta"
},
{
"path": "NeuralSeq/data_gen/tts/txt_processors/zh.py",
"chars": 1680,
"preview": "import re\nimport jieba\nfrom pypinyin import pinyin, Style\nfrom data_gen.tts.data_gen_utils import PUNCS\nfrom data_gen.tt"
},
{
"path": "NeuralSeq/data_gen/tts/txt_processors/zh_g2pM.py",
"chars": 2636,
"preview": "import re\nimport jieba\nfrom pypinyin import pinyin, Style\nfrom data_gen.tts.data_gen_utils import PUNCS\nfrom data_gen.tt"
},
{
"path": "NeuralSeq/data_gen/tts/wav_processors/__init__.py",
"chars": 61,
"preview": "from . import base_processor\nfrom . import common_processors\n"
},
{
"path": "NeuralSeq/data_gen/tts/wav_processors/base_processor.py",
"chars": 557,
"preview": "REGISTERED_WAV_PROCESSORS = {}\n\n\ndef register_wav_processors(name):\n def _f(cls):\n REGISTERED_WAV_PROCESSORS[n"
},
{
"path": "NeuralSeq/data_gen/tts/wav_processors/common_processors.py",
"chars": 2951,
"preview": "import os\nimport subprocess\nimport librosa\nimport numpy as np\nfrom data_gen.tts.wav_processors.base_processor import Bas"
},
{
"path": "NeuralSeq/egs/datasets/audio/emotion/base_text2mel.yaml",
"chars": 465,
"preview": "raw_data_dir: 'data/raw/ESD'\nprocessed_data_dir: 'data/processed/emotion'\nbinary_data_dir: 'data/binary/emotion'\npre_ali"
},
{
"path": "NeuralSeq/egs/datasets/audio/emotion/pre_align.py",
"chars": 849,
"preview": "import os\n\nfrom data_gen.tts.base_preprocess import BasePreprocessor\nimport glob\nimport re\n\nclass EmoPreAlign(BasePrepro"
},
{
"path": "NeuralSeq/egs/datasets/audio/libritts/base_text2mel.yaml",
"chars": 369,
"preview": "raw_data_dir: 'data/raw/LibriTTS'\nprocessed_data_dir: 'data/processed/libritts'\nbinary_data_dir: 'data/binary/libritts'\n"
},
{
"path": "NeuralSeq/egs/datasets/audio/libritts/fs2.yaml",
"chars": 69,
"preview": "base_config:\n - egs/egs_bases/tts/fs2.yaml\n - ./base_text2mel.yaml\n"
},
{
"path": "NeuralSeq/egs/datasets/audio/libritts/pre_align.py",
"chars": 919,
"preview": "import os\n\nfrom data_gen.tts.base_preprocess import BasePreprocessor\nimport glob\n\n\nclass LibrittsPreAlign(BasePreprocess"
},
{
"path": "NeuralSeq/egs/datasets/audio/libritts/pwg.yaml",
"chars": 239,
"preview": "base_config: egs/egs_bases/tts/vocoder/pwg.yaml\nraw_data_dir: 'data/raw/LibriTTS'\nprocessed_data_dir: 'data/processed/li"
},
{
"path": "NeuralSeq/egs/datasets/audio/lj/base_mel2wav.yaml",
"chars": 170,
"preview": "raw_data_dir: 'data/raw/LJSpeech-1.1'\nprocessed_data_dir: 'data/processed/ljspeech'\nbinary_data_dir: 'data/binary/ljspee"
},
{
"path": "NeuralSeq/egs/datasets/audio/lj/preprocess.py",
"chars": 386,
"preview": "from data_gen.tts.base_preprocess import BasePreprocessor\n\n\nclass LJPreprocess(BasePreprocessor):\n def meta_data(self"
},
{
"path": "NeuralSeq/egs/datasets/audio/lj/pwg.yaml",
"chars": 75,
"preview": "base_config:\n - egs/egs_bases/tts/vocoder/pwg.yaml\n - ./base_mel2wav.yaml"
},
{
"path": "NeuralSeq/egs/datasets/audio/vctk/base_mel2wav.yaml",
"chars": 119,
"preview": "raw_data_dir: 'data/raw/VCTK-Corpus'\nprocessed_data_dir: 'data/processed/vctk'\nbinary_data_dir: 'data/binary/vctk_wav'\n"
},
{
"path": "NeuralSeq/egs/datasets/audio/vctk/fs2.yaml",
"chars": 320,
"preview": "base_config:\n - egs/egs_bases/tts/fs2.yaml\nraw_data_dir: 'data/raw/VCTK-Corpus'\nprocessed_data_dir: 'data/processed/vct"
},
{
"path": "NeuralSeq/egs/datasets/audio/vctk/pre_align.py",
"chars": 677,
"preview": "import os\n\nfrom data_gen.tts.base_pre_align import BasePreAlign\nimport glob\n\n\nclass VCTKPreAlign(BasePreAlign):\n def "
},
{
"path": "NeuralSeq/egs/datasets/audio/vctk/pwg.yaml",
"chars": 109,
"preview": "base_config:\n - egs/egs_bases/tts/vocoder/pwg.yaml\n - ./base_mel2wav.yaml\n\nnum_spk: 400\nmax_samples: 20480\n"
},
{
"path": "NeuralSeq/egs/egs_bases/config_base.yaml",
"chars": 831,
"preview": "# task\nbinary_data_dir: ''\nwork_dir: '' # experiment directory.\ninfer: false # inference\namp: false\nseed: 1234\ndebug: fa"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/base.yaml",
"chars": 468,
"preview": "task_cls: tasks.svs.task.DiffFsTask\npitch_type: frame\ntimesteps: 100\ndilation_cycle_length: 1\nresidual_layers: 20\nresidu"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml",
"chars": 2237,
"preview": "base_config:\n - configs/tts/lj/fs2.yaml\n - ./base.yaml\n# spec_min and spec_max are calculated on the training set.\nspe"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml",
"chars": 1371,
"preview": "base_config:\n - configs/singing/fs2.yaml\n - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml\n\naudio_sample_r"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml",
"chars": 861,
"preview": "base_config:\n - egs/egs_bases/svs/popcs_ds_beta6.yaml\n - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml\n\nb"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml",
"chars": 1970,
"preview": "spec_min: [-6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6., -6.,\n -6., -6., -6., -6., -6., -6., -6., -6."
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml",
"chars": 843,
"preview": "base_config:\n - egs/egs_bases/svs/popcs_ds_beta6.yaml\n - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml\n\nb"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml",
"chars": 886,
"preview": "base_config:\n - egs/egs_bases/svs/popcs_ds_beta6.yaml\n - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml\n\nb"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml",
"chars": 810,
"preview": "base_config:\n - egs/egs_bases/svs/popcs_ds_beta6.yaml\n - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml\n\nb"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml",
"chars": 806,
"preview": "base_config:\n - egs/egs_bases/svs/popcs_ds_beta6.yaml\n - egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml\n\nbinar"
},
{
"path": "NeuralSeq/egs/egs_bases/svs/midi/pe.yaml",
"chars": 454,
"preview": "base_config:\n - configs/tts/lj/fs2.yaml\n\nmax_frames: 8000\naudio_sample_rate: 24000\nhop_size: 128 # Hop size."
},
{
"path": "NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml",
"chars": 2733,
"preview": "base_config:\n - configs/tts/fs2.yaml\n - configs/singing/base.yaml\n - ./base.yaml\n\naudio_sample_rate: 24000\nhop_size: "
},
{
"path": "NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml",
"chars": 329,
"preview": "base_config:\n - ./popcs_ds_beta6.yaml\n\nfs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be "
},
{
"path": "NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml",
"chars": 916,
"preview": "base_config:\n - configs/singing/fs2.yaml\n\naudio_sample_rate: 24000\nhop_size: 128 # Hop size.\nfft_size: 512 "
},
{
"path": "NeuralSeq/egs/egs_bases/tts/base.yaml",
"chars": 2361,
"preview": "# task\nbase_config: ../config_base.yaml\ntask_cls: ''\n#############\n# dataset\n#############\nraw_data_dir: ''\nprocessed_da"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/base_zh.yaml",
"chars": 95,
"preview": "base_config: ./base.yaml\npreprocess_args:\n txt_processor: zh\n use_tone: true\n\nword_size: 3000"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/fs2.yaml",
"chars": 2175,
"preview": "base_config: ./base.yaml\ntask_cls: tasks.tts.fs2.FastSpeech2Task\n\n# model\nhidden_size: 256\ndropout: 0.1\nencoder_type: ff"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/fs2_adv.yaml",
"chars": 1165,
"preview": "base_config: ./fs2.yaml\ntask_cls: tasks.tts.fs2_adv.FastSpeech2AdvTask\n\ndisc_win_num: 3\ndisc_interval: 1\ndisc_reduction:"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/ps.yaml",
"chars": 1157,
"preview": "base_config: ./fs2.yaml\n\n###########################\n# models\n###########################\n# encoders\nhidden_size: 192\nff"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/ps_flow.yaml",
"chars": 459,
"preview": "base_config: ./ps2.yaml\ntask_cls: tasks.tts.ps_flow.PortaSpeechFlowTask\n\nuse_post_flow: true\ndetach_postflow_input: true"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/ps_flow_small.yaml",
"chars": 747,
"preview": "base_config: ./ps_flow.yaml\n\n###########################\n# models\n###########################\n# encoders\nhidden_size: 12"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/vocoder/base.yaml",
"chars": 1149,
"preview": "base_config: ../base.yaml\nbinarization_args:\n with_wav: true\n with_spk_embed: false\n with_align: false\n with_word: f"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/vocoder/hifigan.yaml",
"chars": 629,
"preview": "base_config: ./base.yaml\ntask_cls: tasks.vocoder.hifigan.HifiGanTask\nresblock: \"1\"\nadam_b1: 0.8\nadam_b2: 0.99\nupsample_r"
},
{
"path": "NeuralSeq/egs/egs_bases/tts/vocoder/pwg.yaml",
"chars": 4285,
"preview": "base_config: ./base.yaml\ntask_cls: tasks.vocoder.pwg.PwgTask\n\naux_context_window: 2 # Context window size for auxiliary "
},
{
"path": "NeuralSeq/gitattributes",
"chars": 1467,
"preview": "*.7z filter=lfs diff=lfs merge=lfs -text\n*.arrow filter=lfs diff=lfs merge=lfs -text\n*.bin filter=lfs diff=lfs merge=lfs"
},
{
"path": "NeuralSeq/inference/svs/base_svs_infer.py",
"chars": 11859,
"preview": "import os\n\nimport torch\nimport numpy as np\nfrom modules.hifigan.hifigan import HifiGanGenerator\nfrom vocoders.hifigan im"
},
{
"path": "NeuralSeq/inference/svs/ds_cascade.py",
"chars": 3026,
"preview": "import torch\nfrom inference.svs.base_svs_infer import BaseSVSInfer\nfrom utils import load_ckpt\nfrom utils.hparams import"
},
{
"path": "NeuralSeq/inference/svs/ds_e2e.py",
"chars": 3596,
"preview": "import torch\n# from inference.tts.fs import FastSpeechInfer\n# from modules.tts.fs2_orig import FastSpeech2Orig\nfrom infe"
},
{
"path": "NeuralSeq/inference/svs/opencpop/cpop_pinyin2ph.txt",
"chars": 9195,
"preview": "| a | a |\n| ai | ai |\n| an | an |\n| ang | ang |\n| ao | ao |\n| ba |"
},
{
"path": "NeuralSeq/inference/svs/opencpop/map.py",
"chars": 426,
"preview": "def cpop_pinyin2ph_func():\n # In the README file of opencpop dataset, they defined a \"pinyin to phoneme mapping table"
},
{
"path": "NeuralSeq/inference/tts/GenerSpeech.py",
"chars": 5502,
"preview": "import torch\nimport os\nimport importlib\nfrom inference.tts.base_tts_infer import BaseTTSInfer\nfrom utils.ckpt_utils impo"
},
{
"path": "NeuralSeq/inference/tts/PortaSpeech.py",
"chars": 3269,
"preview": "import torch\nfrom inference.tts.base_tts_infer import BaseTTSInfer\nfrom utils.ckpt_utils import load_ckpt\nfrom modules.p"
},
{
"path": "NeuralSeq/inference/tts/base_tts_infer.py",
"chars": 3680,
"preview": "from tasks.tts.dataset_utils import FastSpeechWordDataset\nfrom tasks.tts.tts_utils import load_data_preprocessor\nfrom vo"
},
{
"path": "NeuralSeq/modules/GenerSpeech/config/generspeech.yaml",
"chars": 2434,
"preview": "base_config:\n - egs/egs_bases/tts/fs2.yaml\n - egs/datasets/audio/emotion/base_text2mel.yaml\n\ntask_cls: modules.GenerSp"
},
{
"path": "NeuralSeq/modules/GenerSpeech/model/generspeech.py",
"chars": 13228,
"preview": "import torch\nfrom modules.GenerSpeech.model.glow_modules import Glow\nfrom modules.fastspeech.tts_modules import PitchPre"
},
{
"path": "NeuralSeq/modules/GenerSpeech/model/glow_modules.py",
"chars": 28849,
"preview": "import scipy\nfrom torch.nn import functional as F\nimport torch\nfrom torch import nn\nimport numpy as np\nfrom modules.comm"
},
{
"path": "NeuralSeq/modules/GenerSpeech/model/mixstyle.py",
"chars": 1916,
"preview": "from modules.commons.common_layers import *\nimport random\n\n\nclass MixStyle(nn.Module):\n \"\"\"MixStyle.\n Reference:\n "
},
{
"path": "NeuralSeq/modules/GenerSpeech/model/prosody_util.py",
"chars": 16054,
"preview": "from torch import nn\nimport copy\nimport torch\nfrom utils.hparams import hparams\nfrom modules.GenerSpeech.model.wavenet i"
},
{
"path": "NeuralSeq/modules/GenerSpeech/model/wavenet.py",
"chars": 3467,
"preview": "from modules.commons.common_layers import *\n\n\n# @torch.jit.script\ndef fused_add_tanh_sigmoid_multiply(input_a, input_b, "
},
{
"path": "NeuralSeq/modules/GenerSpeech/task/dataset.py",
"chars": 8833,
"preview": "import matplotlib\nmatplotlib.use('Agg')\nfrom tasks.base_task import data_loader\nfrom tasks.tts.fs2 import FastSpeech2Tas"
},
{
"path": "NeuralSeq/modules/GenerSpeech/task/generspeech.py",
"chars": 12988,
"preview": "import matplotlib\nmatplotlib.use('Agg')\nfrom data_gen.tts.data_gen_utils import get_pitch\nfrom modules.fastspeech.tts_mo"
},
{
"path": "NeuralSeq/modules/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "NeuralSeq/modules/commons/align_ops.py",
"chars": 719,
"preview": "import torch\nimport torch.nn.functional as F\n\n\ndef build_word_mask(x2word, y2word):\n return (x2word[:, :, None] == y2"
},
{
"path": "NeuralSeq/modules/commons/common_layers.py",
"chars": 26403,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import Parameter\nimport torch.onnx.operators\nimport torch.nn"
},
{
"path": "NeuralSeq/modules/commons/conv.py",
"chars": 6260,
"preview": "import math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom modules.commons.common_layers impor"
},
{
"path": "NeuralSeq/modules/commons/espnet_positional_embedding.py",
"chars": 3973,
"preview": "import math\nimport torch\n\n\nclass PositionalEncoding(torch.nn.Module):\n \"\"\"Positional encoding.\n Args:\n d_mo"
},
{
"path": "NeuralSeq/modules/commons/normalizing_flow/glow_modules.py",
"chars": 13972,
"preview": "import scipy\nfrom torch.nn import functional as F\nimport torch\nfrom torch import nn\nimport numpy as np\nfrom modules.comm"
},
{
"path": "NeuralSeq/modules/commons/normalizing_flow/res_flow.py",
"chars": 2219,
"preview": "import torch\nfrom torch import nn\nfrom modules.commons.conv import ConditionalConvBlocks\nfrom modules.commons.wavenet im"
},
{
"path": "NeuralSeq/modules/commons/normalizing_flow/utils.py",
"chars": 888,
"preview": "import torch\n\n\ndef squeeze(x, x_mask=None, n_sqz=2):\n b, c, t = x.size()\n\n t = (t // n_sqz) * n_sqz\n x = x[:, :"
},
{
"path": "NeuralSeq/modules/commons/rel_transformer.py",
"chars": 24198,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\nfrom utils.hparams import hparams\nfro"
},
{
"path": "NeuralSeq/modules/commons/ssim.py",
"chars": 14079,
"preview": "# '''\n# https://github.com/One-sixth/ms_ssim_pytorch/blob/master/ssim.py\n# '''\n#\n# import torch\n# import torch.jit\n# imp"
},
{
"path": "NeuralSeq/modules/commons/transformer.py",
"chars": 31019,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import Parameter, Linear\nfrom modules.commons.common_layers "
},
{
"path": "NeuralSeq/modules/commons/wavenet.py",
"chars": 3791,
"preview": "import torch\nfrom torch import nn\n\n\ndef fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):\n n_channels_in"
},
{
"path": "NeuralSeq/modules/diff/candidate_decoder.py",
"chars": 3789,
"preview": "from modules.fastspeech.tts_modules import FastspeechDecoder\n# from modules.fastspeech.fast_tacotron import DecoderRNN\n#"
},
{
"path": "NeuralSeq/modules/diff/diffusion.py",
"chars": 12137,
"preview": "import math\nimport random\nfrom functools import partial\nfrom inspect import isfunction\nfrom pathlib import Path\nimport n"
},
{
"path": "NeuralSeq/modules/diff/net.py",
"chars": 4361,
"preview": "import math\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom math import sqrt\n\nfrom .diffusion "
},
{
"path": "NeuralSeq/modules/diff/shallow_diffusion_tts.py",
"chars": 14247,
"preview": "import math\r\nimport random\r\nfrom collections import deque\r\nfrom functools import partial\r\nfrom inspect import isfunction"
},
{
"path": "NeuralSeq/modules/diffsinger_midi/fs2.py",
"chars": 5275,
"preview": "from modules.commons.common_layers import *\nfrom modules.commons.common_layers import Embedding\nfrom modules.fastspeech."
},
{
"path": "NeuralSeq/modules/fastspeech/fs2.py",
"chars": 11536,
"preview": "from utils.hparams import hparams\nfrom modules.commons.common_layers import *\nfrom modules.commons.common_layers import "
},
{
"path": "NeuralSeq/modules/fastspeech/pe.py",
"chars": 5538,
"preview": "from modules.commons.common_layers import *\nfrom utils.hparams import hparams\nfrom modules.fastspeech.tts_modules import"
},
{
"path": "NeuralSeq/modules/fastspeech/tts_modules.py",
"chars": 16394,
"preview": "import logging\nimport math\n\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\n\nfrom modules.common"
},
{
"path": "NeuralSeq/modules/hifigan/hifigan.py",
"chars": 14000,
"preview": "import torch\r\nimport torch.nn.functional as F\r\nimport torch.nn as nn\r\nfrom torch.nn import Conv1d, ConvTranspose1d, AvgP"
},
{
"path": "NeuralSeq/modules/hifigan/mel_utils.py",
"chars": 2935,
"preview": "import numpy as np\r\nimport torch\r\nimport torch.utils.data\r\nfrom librosa.filters import mel as librosa_mel_fn\r\nfrom scipy"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/__init__.py",
"chars": 202,
"preview": "from .causal_conv import * # NOQA\nfrom .pqmf import * # NOQA\nfrom .residual_block import * # NOQA\nfrom modules.parall"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/causal_conv.py",
"chars": 1771,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2020 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"Causal"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/pqmf.py",
"chars": 4252,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2020 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"Pseudo"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/residual_block.py",
"chars": 4425,
"preview": "# -*- coding: utf-8 -*-\n\n\"\"\"Residual block module in WaveNet.\n\nThis code is modified from https://github.com/r9y9/wavene"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/residual_stack.py",
"chars": 2968,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2020 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"Residu"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/tf_layers.py",
"chars": 3853,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2020 MINH ANH (@dathudeptrai)\n# MIT License (https://opensource.org/licenses/MIT)\n"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/layers/upsample.py",
"chars": 6474,
"preview": "# -*- coding: utf-8 -*-\n\n\"\"\"Upsampling module.\n\nThis code is modified from https://github.com/r9y9/wavenet_vocoder.\n\n\"\"\""
},
{
"path": "NeuralSeq/modules/parallel_wavegan/losses/__init__.py",
"chars": 33,
"preview": "from .stft_loss import * # NOQA\n"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/losses/stft_loss.py",
"chars": 4928,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2019 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"STFT-b"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/models/__init__.py",
"chars": 70,
"preview": "from .melgan import * # NOQA\nfrom .parallel_wavegan import * # NOQA\n"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/models/melgan.py",
"chars": 16573,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2020 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"MelGAN"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/models/parallel_wavegan.py",
"chars": 16909,
"preview": "# -*- coding: utf-8 -*-\r\n\r\n# Copyright 2019 Tomoki Hayashi\r\n# MIT License (https://opensource.org/licenses/MIT)\r\n\r\n\"\"\"P"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/models/source.py",
"chars": 21259,
"preview": "import torch\nimport numpy as np\nimport sys\nimport torch.nn.functional as torch_nn_func\n\n\nclass SineGen(torch.nn.Module):"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/optimizers/__init__.py",
"chars": 63,
"preview": "from torch.optim import * # NOQA\nfrom .radam import * # NOQA\n"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/optimizers/radam.py",
"chars": 3386,
"preview": "# -*- coding: utf-8 -*-\n\n\"\"\"RAdam optimizer.\n\nThis code is drived from https://github.com/LiyuanLucasLiu/RAdam.\n\"\"\"\n\nimp"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/stft_loss.py",
"chars": 3404,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2019 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"STFT-b"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/utils/__init__.py",
"chars": 29,
"preview": "from .utils import * # NOQA\n"
},
{
"path": "NeuralSeq/modules/parallel_wavegan/utils/utils.py",
"chars": 4750,
"preview": "# -*- coding: utf-8 -*-\n\n# Copyright 2019 Tomoki Hayashi\n# MIT License (https://opensource.org/licenses/MIT)\n\n\"\"\"Utilit"
},
{
"path": "NeuralSeq/modules/syntaspeech/multi_window_disc.py",
"chars": 4792,
"preview": "import numpy as np\nimport torch\nimport torch.nn as nn\n\n\nclass SingleWindowDisc(nn.Module):\n def __init__(self, time_l"
},
{
"path": "NeuralSeq/modules/syntaspeech/syntactic_graph_buider.py",
"chars": 13548,
"preview": "from copy import deepcopy\nimport torch\nimport dgl\nimport stanza \nimport networkx as nx\n\nclass Sentence2GraphParser:\n "
},
{
"path": "NeuralSeq/modules/syntaspeech/syntactic_graph_encoder.py",
"chars": 8359,
"preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nimport dgl\nfrom dgl.nn.pytorch import GatedGraphConv"
},
{
"path": "NeuralSeq/modules/syntaspeech/syntaspeech.py",
"chars": 13948,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import Linear\nfrom utils.hparams import hparams\nfrom modules"
},
{
"path": "NeuralSeq/tasks/base_task.py",
"chars": 11892,
"preview": "import glob\nimport re\nimport subprocess\nfrom datetime import datetime\n\nimport matplotlib\n\nmatplotlib.use('Agg')\n\nfrom ut"
},
{
"path": "NeuralSeq/tasks/run.py",
"chars": 371,
"preview": "import importlib\nfrom utils.hparams import set_hparams, hparams\n\n\ndef run_task():\n assert hparams['task_cls'] != ''\n "
},
{
"path": "NeuralSeq/tasks/svs/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "NeuralSeq/tasks/svs/diffsinger_task.py",
"chars": 22759,
"preview": "import torch\n\nimport utils\nfrom utils.hparams import hparams\nfrom modules.diff.net import DiffNet\nfrom modules.diff.shal"
},
{
"path": "NeuralSeq/tasks/svs/diffspeech_task.py",
"chars": 5367,
"preview": "import torch\n\nimport utils\nfrom utils.hparams import hparams\nfrom modules.diff.net import DiffNet\nfrom modules.diff.shal"
},
{
"path": "NeuralSeq/tasks/svs/task.py",
"chars": 3498,
"preview": "import torch\n\nimport utils\nfrom modules.diff.diffusion import GaussianDiffusion\nfrom modules.diff.net import DiffNet\nfro"
},
{
"path": "NeuralSeq/tasks/tts/dataset_utils.py",
"chars": 11866,
"preview": "from utils.cwt import get_lf0_cwt\nimport torch.optim\nimport torch.utils.data\nimport importlib\nfrom utils.indexed_dataset"
},
{
"path": "NeuralSeq/tasks/tts/fs2.py",
"chars": 24114,
"preview": "import matplotlib\nmatplotlib.use('Agg')\nfrom utils import audio\nimport matplotlib.pyplot as plt\nfrom data_gen.tts.data_g"
},
{
"path": "NeuralSeq/tasks/tts/fs2_adv.py",
"chars": 5935,
"preview": "from tasks.tts.fs2 import FastSpeech2Task\r\nfrom modules.syntaspeech.multi_window_disc import Discriminator\r\nfrom utils.h"
},
{
"path": "NeuralSeq/tasks/tts/fs2_utils.py",
"chars": 7251,
"preview": "import matplotlib\n\nmatplotlib.use('Agg')\n\nimport glob\nimport importlib\nfrom utils.cwt import get_lf0_cwt\nimport os\nimpor"
},
{
"path": "NeuralSeq/tasks/tts/pe.py",
"chars": 6153,
"preview": "import matplotlib\nmatplotlib.use('Agg')\n\nimport torch\nimport numpy as np\nimport os\n\nfrom tasks.base_task import BaseData"
},
{
"path": "NeuralSeq/tasks/tts/ps.py",
"chars": 9016,
"preview": "import os\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n\nfrom modules.portaspeech.portaspeech import"
},
{
"path": "NeuralSeq/tasks/tts/ps_adv.py",
"chars": 18572,
"preview": "import os\nimport torch\nimport torch.nn.functional as F\nimport torch.nn as nn\nimport numpy as np\n\nfrom modules.portaspeec"
},
{
"path": "NeuralSeq/tasks/tts/ps_flow.py",
"chars": 6954,
"preview": "import torch\nfrom modules.portaspeech.portaspeech_flow import PortaSpeechFlow\nfrom tasks.tts.fs2 import FastSpeech2Task\n"
},
{
"path": "NeuralSeq/tasks/tts/synta.py",
"chars": 1165,
"preview": "import os\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n\nfrom modules.tts.syntaspeech.syntaspeech im"
},
{
"path": "NeuralSeq/tasks/tts/tts.py",
"chars": 4709,
"preview": "from multiprocessing.pool import Pool\n\nimport matplotlib\n\nfrom utils.pl_utils import data_loader\nfrom utils.training_uti"
},
{
"path": "NeuralSeq/tasks/tts/tts_base.py",
"chars": 13642,
"preview": "import filecmp\n\nimport matplotlib\n\nfrom utils.plot import spec_to_figure\n\nmatplotlib.use('Agg')\n\nfrom data_gen.tts.data_"
},
{
"path": "NeuralSeq/tasks/tts/tts_utils.py",
"chars": 1917,
"preview": "import importlib\n\nfrom data_gen.tts.base_binarizer import BaseBinarizer\nfrom data_gen.tts.base_preprocess import BasePre"
},
{
"path": "NeuralSeq/tasks/vocoder/dataset_utils.py",
"chars": 9199,
"preview": "import glob\nimport importlib\nimport os\nfrom resemblyzer import VoiceEncoder\nimport numpy as np\nimport torch\nimport torch"
},
{
"path": "NeuralSeq/tasks/vocoder/vocoder_base.py",
"chars": 2386,
"preview": "import os\n\nimport torch\nimport torch.distributed as dist\nfrom torch.utils.data import DistributedSampler\n\nfrom tasks.bas"
},
{
"path": "NeuralSeq/utils/__init__.py",
"chars": 8094,
"preview": "import glob\nimport logging\nimport re\nimport time\nfrom collections import defaultdict\nimport os\nimport sys\nimport shutil\n"
},
{
"path": "NeuralSeq/utils/audio.py",
"chars": 3315,
"preview": "import subprocess\nimport matplotlib\nimport os\nmatplotlib.use('Agg')\nimport librosa\nimport librosa.filters\nimport numpy a"
},
{
"path": "NeuralSeq/utils/ckpt_utils.py",
"chars": 2715,
"preview": "import glob\nimport logging\nimport os\nimport re\nimport torch\n\n\ndef get_last_checkpoint(work_dir, steps=None):\n checkpo"
},
{
"path": "NeuralSeq/utils/cwt.py",
"chars": 4340,
"preview": "import librosa\nimport numpy as np\nfrom pycwt import wavelet\nfrom scipy.interpolate import interp1d\n\n\ndef load_wav(wav_fi"
},
{
"path": "NeuralSeq/utils/dtw.py",
"chars": 5932,
"preview": "from numpy import array, zeros, full, argmin, inf, ndim\nfrom scipy.spatial.distance import cdist\nfrom math import isinf\n"
},
{
"path": "NeuralSeq/utils/hparams.py",
"chars": 5210,
"preview": "import argparse\nimport os\nimport yaml\n\nglobal_print_hparams = True\nhparams = {}\n\n\nclass Args:\n def __init__(self, **k"
},
{
"path": "NeuralSeq/utils/indexed_datasets.py",
"chars": 2198,
"preview": "import pickle\nfrom copy import deepcopy\n\nimport numpy as np\n\n\nclass IndexedDataset:\n def __init__(self, path, num_cac"
},
{
"path": "NeuralSeq/utils/multiprocess_utils.py",
"chars": 2054,
"preview": "import os\nimport traceback\nfrom multiprocessing import Queue, Process\n\n\ndef chunked_worker(worker_id, map_func, args, re"
},
{
"path": "NeuralSeq/utils/os_utils.py",
"chars": 524,
"preview": "import os\nimport subprocess\n\n\ndef link_file(from_file, to_file):\n subprocess.check_call(\n f'ln -s \"`realpath -"
},
{
"path": "NeuralSeq/utils/pitch_utils.py",
"chars": 2178,
"preview": "#########\n# world\n##########\nimport librosa\nimport numpy as np\nimport torch\n\ngamma = 0\nmcepInput = 3 # 0 for dB, 3 for "
},
{
"path": "NeuralSeq/utils/pl_utils.py",
"chars": 58475,
"preview": "import matplotlib\nfrom torch.nn import DataParallel\nfrom torch.nn.parallel import DistributedDataParallel\n\nmatplotlib.us"
},
{
"path": "NeuralSeq/utils/plot.py",
"chars": 1830,
"preview": "import matplotlib.pyplot as plt\nimport numpy as np\nimport torch\n\nLINE_COLORS = ['w', 'r', 'y', 'cyan', 'm', 'b', 'lime']"
},
{
"path": "NeuralSeq/utils/text_encoder.py",
"chars": 10196,
"preview": "import re\nimport six\nfrom six.moves import range # pylint: disable=redefined-builtin\n\nPAD = \"<pad>\"\nEOS = \"<EOS>\"\nUNK ="
},
{
"path": "NeuralSeq/utils/text_norm.py",
"chars": 26999,
"preview": "# coding=utf-8\n# Authors:\n# 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)\n# 2019.9 Jiayu DU\n#\n"
},
{
"path": "NeuralSeq/utils/training_utils.py",
"chars": 976,
"preview": "from utils.hparams import hparams\n\n\nclass RSQRTSchedule(object):\n def __init__(self, optimizer):\n super().__in"
},
{
"path": "NeuralSeq/utils/tts_utils.py",
"chars": 14640,
"preview": "from collections import defaultdict\nimport torch\nimport torch.nn.functional as F\n\n\ndef make_positions(tensor, padding_id"
},
{
"path": "NeuralSeq/vocoders/__init__.py",
"chars": 29,
"preview": "from vocoders import hifigan\n"
},
{
"path": "NeuralSeq/vocoders/base_vocoder.py",
"chars": 841,
"preview": "import importlib\nVOCODERS = {}\n\n\ndef register_vocoder(cls):\n VOCODERS[cls.__name__.lower()] = cls\n VOCODERS[cls.__"
},
{
"path": "NeuralSeq/vocoders/hifigan.py",
"chars": 2871,
"preview": "import glob\nimport json\nimport os\nimport re\n\nimport librosa\nimport torch\n\nimport utils\nfrom modules.hifigan.hifigan impo"
},
{
"path": "NeuralSeq/vocoders/pwg.py",
"chars": 5604,
"preview": "import glob\nimport re\nimport librosa\nimport torch\nimport yaml\nfrom sklearn.preprocessing import StandardScaler\nfrom torc"
},
{
"path": "NeuralSeq/vocoders/vocoder_utils.py",
"chars": 516,
"preview": "import librosa\n\nfrom utils.hparams import hparams\nimport numpy as np\n\n\ndef denoise(wav, v=0.1):\n spec = librosa.stft("
},
{
"path": "README.md",
"chars": 3637,
"preview": "# AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head\n\n[))\nsys.path.append(os.path.dirname(os.pat"
},
{
"path": "audio_detection/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "audio_detection/audio_infer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv",
"chars": 74575,
"preview": "-JMT0mK0Dbg_30.000_40.000.wav\t30.000\t40.000\tTrain horn\n3ACjUf9QpAQ_30.000_40.000.wav\t30.000\t40.000\tTrain horn\n3S2-TODd__"
},
{
"path": "audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv",
"chars": 34805,
"preview": "-5QrBL6MzLg_60.000_70.000.wav\t60.000\t70.000\tTrain horn\r\n-E0shPRxAbo_30.000_40.000.wav\t30.000\t40.000\tTrain horn\r\n-GCwoyCn"
},
{
"path": "audio_detection/audio_infer/metadata/class_labels_indices.csv",
"chars": 14675,
"preview": "index,mid,display_name\n0,/m/09x0r,\"Speech\"\n1,/m/05zppz,\"Male speech, man speaking\"\n2,/m/02zsn,\"Female speech, woman spea"
},
{
"path": "audio_detection/audio_infer/pytorch/evaluate.py",
"chars": 1113,
"preview": "from sklearn import metrics\n\nfrom pytorch_utils import forward\n\n\nclass Evaluator(object):\n def __init__(self, model):"
},
{
"path": "audio_detection/audio_infer/pytorch/finetune_template.py",
"chars": 4049,
"preview": "import os\nimport sys\nsys.path.insert(1, os.path.join(sys.path[0], '../utils'))\nimport numpy as np\nimport argparse\nimport"
},
{
"path": "audio_detection/audio_infer/pytorch/inference.py",
"chars": 7355,
"preview": "import os\nimport sys\nsys.path.insert(1, os.path.join(sys.path[0], '../utils'))\nimport numpy as np\nimport argparse\nimport"
},
{
"path": "audio_detection/audio_infer/pytorch/losses.py",
"chars": 313,
"preview": "import torch\nimport torch.nn.functional as F\n\n\ndef clip_bce(output_dict, target_dict):\n \"\"\"Binary crossentropy loss.\n"
},
{
"path": "audio_detection/audio_infer/pytorch/main.py",
"chars": 15068,
"preview": "import os\nimport sys\nsys.path.insert(1, os.path.join(sys.path[0], '../utils'))\nimport numpy as np\nimport argparse\nimport"
},
{
"path": "audio_detection/audio_infer/pytorch/models.py",
"chars": 38543,
"preview": "import torch\r\nimport torch.nn as nn\r\nimport torch.nn.functional as F\r\nfrom torchlibrosa.stft import Spectrogram, LogmelF"
},
{
"path": "audio_detection/audio_infer/pytorch/pytorch_utils.py",
"chars": 8309,
"preview": "import numpy as np\nimport time\nimport torch\nimport torch.nn as nn\n\n\ndef move_data_to_device(x, device):\n if 'float' i"
},
{
"path": "audio_detection/audio_infer/utils/config.py",
"chars": 5434,
"preview": "import numpy as np\nimport csv\n\nsample_rate = 32000\nclip_samples = sample_rate * 10 # Audio clips are 10-second\n\n# Lo"
},
{
"path": "audio_detection/audio_infer/utils/crash.py",
"chars": 366,
"preview": "import sys\n\nclass ExceptionHook:\n instance = None\n def __call__(self, *args, **kwargs):\n if self.instance i"
},
{
"path": "audio_detection/audio_infer/utils/create_black_list.py",
"chars": 1887,
"preview": "import argparse\nimport csv\nimport os\n\nfrom utilities import create_folder\n\n\ndef dcase2017task4(args):\n \"\"\"Create blac"
}
]
// ... and 162 more files (download for full content)
About this extraction
This page contains the full source code of the AIGC-Audio/AudioGPT GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 362 files (14.2 MB), approximately 3.7M tokens, and a symbol index with 3295 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.