Repository: PaddlePaddle/PaddleSpeech
Branch: develop
Commit: 3afe871a8768
Files: 3420
Total size: 15.9 MB
Directory structure:
gitextract_h_rw1s1r/
├── .clang-format
├── .flake8
├── .gitconfig
├── .github/
│ ├── CODE_OF_CONDUCT.md
│ ├── CONTRIBUTING.md
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug-report-s2t.md
│ │ ├── bug-report-tts.md
│ │ ├── feature-request.md
│ │ ├── others.md
│ │ └── question.md
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── stale.yml
├── .gitignore
├── .mergify.yml
├── .pre-commit-config.yaml
├── .pre-commit-hooks/
│ ├── clang-format.hook
│ └── copyright-check.hook
├── .readthedocs.yml
├── .style.yapf
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_cn.md
├── audio/
│ ├── CMakeLists.txt
│ ├── README.md
│ ├── cmake/
│ │ ├── FindGFortranLibs.cmake
│ │ ├── external/
│ │ │ └── openblas.cmake
│ │ ├── pybind.cmake
│ │ └── summary.cmake
│ ├── paddleaudio/
│ │ ├── CMakeLists.txt
│ │ ├── __init__.py
│ │ ├── _extension.py
│ │ ├── _internal/
│ │ │ ├── __init__.py
│ │ │ └── module_utils.py
│ │ ├── backends/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── no_backend.py
│ │ │ ├── soundfile_backend.py
│ │ │ ├── sox_io_backend.py
│ │ │ └── utils.py
│ │ ├── compliance/
│ │ │ ├── __init__.py
│ │ │ ├── kaldi.py
│ │ │ └── librosa.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ ├── esc50.py
│ │ │ ├── gtzan.py
│ │ │ ├── hey_snips.py
│ │ │ ├── rirs_noises.py
│ │ │ ├── tess.py
│ │ │ ├── urban_sound.py
│ │ │ └── voxceleb.py
│ │ ├── features/
│ │ │ ├── __init__.py
│ │ │ └── layers.py
│ │ ├── functional/
│ │ │ ├── __init__.py
│ │ │ ├── functional.py
│ │ │ └── window.py
│ │ ├── kaldi/
│ │ │ ├── __init__.py
│ │ │ └── kaldi.py
│ │ ├── metric/
│ │ │ ├── __init__.py
│ │ │ └── eer.py
│ │ ├── sox_effects/
│ │ │ ├── __init__.py
│ │ │ └── sox_effects.py
│ │ ├── src/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── optional/
│ │ │ │ ├── COPYING
│ │ │ │ └── optional.hpp
│ │ │ ├── pybind/
│ │ │ │ ├── kaldi/
│ │ │ │ │ ├── feature_common.h
│ │ │ │ │ ├── feature_common_inl.h
│ │ │ │ │ ├── kaldi_feature.cc
│ │ │ │ │ ├── kaldi_feature.h
│ │ │ │ │ ├── kaldi_feature_wrapper.cc
│ │ │ │ │ └── kaldi_feature_wrapper.h
│ │ │ │ ├── pybind.cpp
│ │ │ │ └── sox/
│ │ │ │ ├── effects.cpp
│ │ │ │ ├── effects.h
│ │ │ │ ├── effects_chain.cpp
│ │ │ │ ├── effects_chain.h
│ │ │ │ ├── io.cpp
│ │ │ │ ├── io.h
│ │ │ │ ├── types.cpp
│ │ │ │ ├── types.h
│ │ │ │ ├── utils.cpp
│ │ │ │ └── utils.h
│ │ │ └── utils.cpp
│ │ ├── third_party/
│ │ │ ├── .gitignore
│ │ │ ├── CMakeLists.txt
│ │ │ ├── kaldi-native-fbank/
│ │ │ │ └── csrc/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── feature-fbank.cc
│ │ │ │ ├── feature-fbank.h
│ │ │ │ ├── feature-functions.cc
│ │ │ │ ├── feature-functions.h
│ │ │ │ ├── feature-window.cc
│ │ │ │ ├── feature-window.h
│ │ │ │ ├── fftsg.c
│ │ │ │ ├── log.cc
│ │ │ │ ├── log.h
│ │ │ │ ├── mel-computations.cc
│ │ │ │ ├── mel-computations.h
│ │ │ │ ├── rfft.cc
│ │ │ │ └── rfft.h
│ │ │ ├── patches/
│ │ │ │ ├── config.guess
│ │ │ │ ├── config.sub
│ │ │ │ ├── libmad.patch
│ │ │ │ └── sox.patch
│ │ │ └── sox/
│ │ │ └── CMakeLists.txt
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── download.py
│ │ ├── env.py
│ │ ├── error.py
│ │ ├── log.py
│ │ ├── numeric.py
│ │ ├── sox_utils.py
│ │ ├── tensor_utils.py
│ │ └── time.py
│ ├── setup.py
│ ├── tests/
│ │ ├── backends/
│ │ │ ├── base.py
│ │ │ ├── common.py
│ │ │ ├── soundfile/
│ │ │ │ ├── base.py
│ │ │ │ ├── common.py
│ │ │ │ ├── info_test.py
│ │ │ │ ├── load_test.py
│ │ │ │ ├── save_test.py
│ │ │ │ └── test_io.py
│ │ │ └── sox_io/
│ │ │ ├── common.py
│ │ │ ├── info_test.py
│ │ │ ├── load_test.py
│ │ │ ├── save_test.py
│ │ │ ├── smoke_test.py
│ │ │ ├── sox_effect_test.py
│ │ │ └── sox_effect_test_args.jsonl
│ │ ├── benchmark/
│ │ │ ├── README.md
│ │ │ ├── log_melspectrogram.py
│ │ │ ├── melspectrogram.py
│ │ │ └── mfcc.py
│ │ ├── common_utils/
│ │ │ ├── __init__.py
│ │ │ ├── case_utils.py
│ │ │ ├── data_utils.py
│ │ │ ├── parameterized_utils.py
│ │ │ ├── sox_utils.py
│ │ │ └── wav_utils.py
│ │ └── features/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── test_istft.py
│ │ ├── test_kaldi.py
│ │ ├── test_kaldi_feat.py
│ │ ├── test_librosa.py
│ │ ├── test_log_melspectrogram.py
│ │ ├── test_spectrogram.py
│ │ ├── test_stft.py
│ │ └── testdata/
│ │ ├── fbank_feat.ark
│ │ ├── fbank_feat_txt.ark
│ │ ├── pitch_feat.ark
│ │ └── pitch_feat_txt.ark
│ └── tools/
│ └── setup_helpers/
│ ├── __init__.py
│ └── extension.py
├── dataset/
│ ├── aishell/
│ │ ├── .gitignore
│ │ └── aishell.py
│ ├── aishell3/
│ │ └── README.md
│ ├── chime3_background/
│ │ └── chime3_background.py
│ ├── gigaspeech/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── gigaspeech.py
│ │ └── run.sh
│ ├── librispeech/
│ │ ├── .gitignore
│ │ └── librispeech.py
│ ├── magicdata/
│ │ └── README.md
│ ├── mini_librispeech/
│ │ ├── .gitignore
│ │ └── mini_librispeech.py
│ ├── multi_cn/
│ │ └── README.md
│ ├── primewords/
│ │ └── README.md
│ ├── rir_noise/
│ │ ├── .gitignore
│ │ └── rir_noise.py
│ ├── st-cmds/
│ │ └── README.md
│ ├── tal_cs/
│ │ ├── README.md
│ │ └── tal_cs.py
│ ├── ted_en_zh/
│ │ ├── .gitignore
│ │ └── ted_en_zh.py
│ ├── thchs30/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ └── thchs30.py
│ ├── timit/
│ │ ├── .gitignore
│ │ ├── timit.py
│ │ └── timit_kaldi_standard_split.py
│ ├── voxceleb/
│ │ ├── README.md
│ │ ├── voxceleb1.py
│ │ └── voxceleb2.py
│ └── voxforge/
│ ├── run_data.sh
│ └── voxforge.py
├── demos/
│ ├── README.md
│ ├── README_cn.md
│ ├── TTSAndroid/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── app/
│ │ │ ├── .gitignore
│ │ │ ├── build.gradle
│ │ │ ├── proguard-rules.pro
│ │ │ └── src/
│ │ │ ├── androidTest/
│ │ │ │ └── java/
│ │ │ │ └── com/
│ │ │ │ └── baidu/
│ │ │ │ └── paddle/
│ │ │ │ └── lite/
│ │ │ │ └── demo/
│ │ │ │ └── tts/
│ │ │ │ └── ExampleInstrumentedTest.java
│ │ │ ├── main/
│ │ │ │ ├── AndroidManifest.xml
│ │ │ │ ├── java/
│ │ │ │ │ └── com/
│ │ │ │ │ └── baidu/
│ │ │ │ │ └── paddle/
│ │ │ │ │ └── lite/
│ │ │ │ │ └── demo/
│ │ │ │ │ └── tts/
│ │ │ │ │ ├── AppCompatPreferenceActivity.java
│ │ │ │ │ ├── MainActivity.java
│ │ │ │ │ ├── Predictor.java
│ │ │ │ │ ├── SettingsActivity.java
│ │ │ │ │ └── Utils.java
│ │ │ │ └── res/
│ │ │ │ ├── drawable/
│ │ │ │ │ └── button_drawable.xml
│ │ │ │ ├── layout/
│ │ │ │ │ └── activity_main.xml
│ │ │ │ ├── menu/
│ │ │ │ │ └── menu_action_options.xml
│ │ │ │ ├── values/
│ │ │ │ │ ├── arrays.xml
│ │ │ │ │ ├── colors.xml
│ │ │ │ │ ├── strings.xml
│ │ │ │ │ └── styles.xml
│ │ │ │ └── xml/
│ │ │ │ └── settings.xml
│ │ │ └── test/
│ │ │ └── java/
│ │ │ └── com/
│ │ │ └── baidu/
│ │ │ └── paddle/
│ │ │ └── lite/
│ │ │ └── demo/
│ │ │ └── tts/
│ │ │ └── ExampleUnitTest.java
│ │ ├── build.gradle
│ │ ├── gradle/
│ │ │ └── wrapper/
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ │ ├── gradle.properties
│ │ ├── gradlew
│ │ ├── gradlew.bat
│ │ └── settings.gradle
│ ├── TTSArmLinux/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── build.sh
│ │ ├── clean.sh
│ │ ├── config.sh
│ │ ├── download.sh
│ │ ├── front.conf
│ │ ├── run.sh
│ │ └── src/
│ │ ├── CMakeLists.txt
│ │ ├── Predictor.hpp
│ │ └── main.cc
│ ├── TTSCppFrontend/
│ │ ├── .gitignore
│ │ ├── CMakeLists.txt
│ │ ├── README.md
│ │ ├── build-depends.sh
│ │ ├── build.sh
│ │ ├── clean.sh
│ │ ├── download.sh
│ │ ├── front_demo/
│ │ │ ├── front.conf
│ │ │ ├── front_demo.cpp
│ │ │ └── gentools/
│ │ │ ├── gen_dict_paddlespeech.py
│ │ │ ├── genid.py
│ │ │ └── word2phones.py
│ │ ├── run_front_demo.sh
│ │ ├── src/
│ │ │ ├── base/
│ │ │ │ ├── type_conv.cpp
│ │ │ │ └── type_conv.h
│ │ │ └── front/
│ │ │ ├── front_interface.cpp
│ │ │ ├── front_interface.h
│ │ │ ├── text_normalize.cpp
│ │ │ └── text_normalize.h
│ │ └── third-party/
│ │ └── CMakeLists.txt
│ ├── asr_deployment/
│ │ ├── README.md
│ │ └── README_cn.md
│ ├── audio_content_search/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── acs_clinet.py
│ │ ├── conf/
│ │ │ ├── acs_application.yaml
│ │ │ ├── words.txt
│ │ │ ├── ws_conformer_application.yaml
│ │ │ └── ws_conformer_wenetspeech_application.yaml
│ │ ├── requirements.txt
│ │ ├── run.sh
│ │ └── streaming_asr_server.py
│ ├── audio_searching/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── docker-compose.yaml
│ │ ├── requirements.txt
│ │ └── src/
│ │ ├── audio_search.py
│ │ ├── config.py
│ │ ├── encode.py
│ │ ├── logs.py
│ │ ├── milvus_helpers.py
│ │ ├── mysql_helpers.py
│ │ ├── operations/
│ │ │ ├── __init__.py
│ │ │ ├── count.py
│ │ │ ├── drop.py
│ │ │ ├── load.py
│ │ │ └── search.py
│ │ ├── test_audio_search.py
│ │ ├── test_vpr_search.py
│ │ └── vpr_search.py
│ ├── audio_tagging/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── automatic_video_subtitiles/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── recognize.py
│ │ └── run.sh
│ ├── custom_streaming_asr/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── path.sh
│ │ ├── setup_docker.sh
│ │ ├── websocket_client.sh
│ │ └── websocket_server.sh
│ ├── keyword_spotting/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── metaverse/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── path.sh
│ │ ├── run.sh
│ │ └── sentences.txt
│ ├── punctuation_restoration/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── speaker_verification/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── speech_recognition/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── speech_server/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── asr_client.sh
│ │ ├── cls_client.sh
│ │ ├── conf/
│ │ │ ├── application.yaml
│ │ │ └── conformer_talcs_application.yaml
│ │ ├── server.sh
│ │ ├── sid_client.sh
│ │ ├── start_multi_progress_server.py
│ │ ├── text_client.sh
│ │ └── tts_client.sh
│ ├── speech_ssl/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── speech_translation/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ ├── speech_web/
│ │ ├── .gitignore
│ │ ├── API.md
│ │ ├── README.md
│ │ ├── speech_server/
│ │ │ ├── conf/
│ │ │ │ ├── tts3_finetune.yaml
│ │ │ │ ├── tts_online_application.yaml
│ │ │ │ └── ws_conformer_wenetspeech_application_faster.yaml
│ │ │ ├── main.py
│ │ │ ├── requirements.txt
│ │ │ ├── src/
│ │ │ │ ├── AudioManeger.py
│ │ │ │ ├── SpeechBase/
│ │ │ │ │ ├── asr.py
│ │ │ │ │ ├── nlp.py
│ │ │ │ │ ├── sql_helper.py
│ │ │ │ │ ├── tts.py
│ │ │ │ │ ├── vpr.py
│ │ │ │ │ └── vpr_encode.py
│ │ │ │ ├── WebsocketManeger.py
│ │ │ │ ├── ernie_sat.py
│ │ │ │ ├── finetune.py
│ │ │ │ ├── ge2e_clone.py
│ │ │ │ ├── robot.py
│ │ │ │ ├── tdnn_clone.py
│ │ │ │ └── util.py
│ │ │ └── vc.py
│ │ └── web_client/
│ │ ├── .gitignore
│ │ ├── index.html
│ │ ├── package.json
│ │ ├── src/
│ │ │ ├── App.vue
│ │ │ ├── api/
│ │ │ │ ├── API.js
│ │ │ │ ├── ApiASR.js
│ │ │ │ ├── ApiNLP.js
│ │ │ │ ├── ApiTTS.js
│ │ │ │ ├── ApiVC.js
│ │ │ │ └── ApiVPR.js
│ │ │ ├── components/
│ │ │ │ ├── Content/
│ │ │ │ │ ├── Header/
│ │ │ │ │ │ ├── Header.vue
│ │ │ │ │ │ └── style.less
│ │ │ │ │ └── Tail/
│ │ │ │ │ ├── Tail.vue
│ │ │ │ │ └── style.less
│ │ │ │ ├── Experience.vue
│ │ │ │ ├── SubMenu/
│ │ │ │ │ ├── ASR/
│ │ │ │ │ │ ├── ASR.vue
│ │ │ │ │ │ ├── ASRT.vue
│ │ │ │ │ │ ├── AudioFile/
│ │ │ │ │ │ │ ├── AudioFileIdentification.vue
│ │ │ │ │ │ │ └── style.less
│ │ │ │ │ │ ├── EndToEnd/
│ │ │ │ │ │ │ ├── EndToEndIdentification.vue
│ │ │ │ │ │ │ └── style.less
│ │ │ │ │ │ ├── RealTime/
│ │ │ │ │ │ │ ├── RealTime.vue
│ │ │ │ │ │ │ └── style.less
│ │ │ │ │ │ └── style.less
│ │ │ │ │ ├── ChatBot/
│ │ │ │ │ │ ├── ChatT.vue
│ │ │ │ │ │ └── style.less
│ │ │ │ │ ├── ERNIE_SAT/
│ │ │ │ │ │ └── ERNIE_SAT.vue
│ │ │ │ │ ├── FineTune/
│ │ │ │ │ │ └── FineTune.vue
│ │ │ │ │ ├── IE/
│ │ │ │ │ │ ├── IET.vue
│ │ │ │ │ │ └── style.less
│ │ │ │ │ ├── TTS/
│ │ │ │ │ │ ├── TTST.vue
│ │ │ │ │ │ └── style.less
│ │ │ │ │ ├── VPR/
│ │ │ │ │ │ ├── VPRT.vue
│ │ │ │ │ │ └── style.less
│ │ │ │ │ └── VoiceClone/
│ │ │ │ │ └── VoiceClone.vue
│ │ │ │ └── style.less
│ │ │ └── main.js
│ │ └── vite.config.js
│ ├── story_talker/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── ocr.py
│ │ ├── path.sh
│ │ └── run.sh
│ ├── streaming_asr_server/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── conf/
│ │ │ ├── application.yaml
│ │ │ ├── punc_application.yaml
│ │ │ ├── ws_conformer_application.yaml
│ │ │ ├── ws_conformer_talcs_application.yaml
│ │ │ ├── ws_conformer_wenetspeech_application.yaml
│ │ │ ├── ws_conformer_wenetspeech_application_faster.yaml
│ │ │ └── ws_ds2_application.yaml
│ │ ├── local/
│ │ │ ├── punc_server.py
│ │ │ ├── rtf_from_log.py
│ │ │ ├── streaming_asr_server.py
│ │ │ ├── test.sh
│ │ │ ├── websocket_client.py
│ │ │ └── websocket_client_srt.py
│ │ ├── run.sh
│ │ ├── server.sh
│ │ ├── test.sh
│ │ └── web/
│ │ ├── index.html
│ │ └── readme.md
│ ├── streaming_tts_server/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── client.sh
│ │ ├── conf/
│ │ │ ├── tts_online_application.yaml
│ │ │ └── tts_online_ws_application.yaml
│ │ └── server.sh
│ ├── streaming_tts_serving_fastdeploy/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── streaming_tts_serving/
│ │ ├── 1/
│ │ │ └── model.py
│ │ ├── config.pbtxt
│ │ └── stream_client.py
│ ├── style_fs2/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── path.sh
│ │ ├── run.sh
│ │ ├── sentences.txt
│ │ └── style_syn.py
│ ├── text_to_speech/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ └── run.sh
│ └── whisper/
│ ├── README.md
│ ├── README_cn.md
│ └── run.sh
├── docker/
│ ├── ubuntu16-gpu/
│ │ └── Dockerfile
│ ├── ubuntu18-cpu/
│ │ └── Dockerfile
│ └── ubuntu20-cpu/
│ └── Dockerfile
├── docs/
│ ├── Makefile
│ ├── requirements.txt
│ ├── source/
│ │ ├── _static/
│ │ │ └── custom.css
│ │ ├── api/
│ │ │ ├── modules.rst
│ │ │ ├── paddlespeech.audio.features.layers.rst
│ │ │ ├── paddlespeech.audio.features.rst
│ │ │ ├── paddlespeech.audio.io.rst
│ │ │ ├── paddlespeech.audio.rst
│ │ │ ├── paddlespeech.audio.streamdata.autodecode.rst
│ │ │ ├── paddlespeech.audio.streamdata.cache.rst
│ │ │ ├── paddlespeech.audio.streamdata.compat.rst
│ │ │ ├── paddlespeech.audio.streamdata.extradatasets.rst
│ │ │ ├── paddlespeech.audio.streamdata.filters.rst
│ │ │ ├── paddlespeech.audio.streamdata.gopen.rst
│ │ │ ├── paddlespeech.audio.streamdata.handlers.rst
│ │ │ ├── paddlespeech.audio.streamdata.mix.rst
│ │ │ ├── paddlespeech.audio.streamdata.paddle_utils.rst
│ │ │ ├── paddlespeech.audio.streamdata.pipeline.rst
│ │ │ ├── paddlespeech.audio.streamdata.rst
│ │ │ ├── paddlespeech.audio.streamdata.shardlists.rst
│ │ │ ├── paddlespeech.audio.streamdata.tariterators.rst
│ │ │ ├── paddlespeech.audio.streamdata.utils.rst
│ │ │ ├── paddlespeech.audio.streamdata.writer.rst
│ │ │ ├── paddlespeech.audio.text.rst
│ │ │ ├── paddlespeech.audio.text.text_featurizer.rst
│ │ │ ├── paddlespeech.audio.text.utility.rst
│ │ │ ├── paddlespeech.audio.transform.add_deltas.rst
│ │ │ ├── paddlespeech.audio.transform.channel_selector.rst
│ │ │ ├── paddlespeech.audio.transform.cmvn.rst
│ │ │ ├── paddlespeech.audio.transform.functional.rst
│ │ │ ├── paddlespeech.audio.transform.perturb.rst
│ │ │ ├── paddlespeech.audio.transform.rst
│ │ │ ├── paddlespeech.audio.transform.spec_augment.rst
│ │ │ ├── paddlespeech.audio.transform.spectrogram.rst
│ │ │ ├── paddlespeech.audio.transform.transform_interface.rst
│ │ │ ├── paddlespeech.audio.transform.transformation.rst
│ │ │ ├── paddlespeech.audio.transform.wpe.rst
│ │ │ ├── paddlespeech.audio.utils.check_kwargs.rst
│ │ │ ├── paddlespeech.audio.utils.download.rst
│ │ │ ├── paddlespeech.audio.utils.dynamic_import.rst
│ │ │ ├── paddlespeech.audio.utils.error.rst
│ │ │ ├── paddlespeech.audio.utils.log.rst
│ │ │ ├── paddlespeech.audio.utils.numeric.rst
│ │ │ ├── paddlespeech.audio.utils.rst
│ │ │ ├── paddlespeech.audio.utils.tensor_utils.rst
│ │ │ ├── paddlespeech.audio.utils.time.rst
│ │ │ ├── paddlespeech.cli.asr.infer.rst
│ │ │ ├── paddlespeech.cli.asr.rst
│ │ │ ├── paddlespeech.cli.base_commands.rst
│ │ │ ├── paddlespeech.cli.cls.infer.rst
│ │ │ ├── paddlespeech.cli.cls.rst
│ │ │ ├── paddlespeech.cli.download.rst
│ │ │ ├── paddlespeech.cli.entry.rst
│ │ │ ├── paddlespeech.cli.executor.rst
│ │ │ ├── paddlespeech.cli.kws.infer.rst
│ │ │ ├── paddlespeech.cli.kws.rst
│ │ │ ├── paddlespeech.cli.log.rst
│ │ │ ├── paddlespeech.cli.rst
│ │ │ ├── paddlespeech.cli.st.infer.rst
│ │ │ ├── paddlespeech.cli.st.rst
│ │ │ ├── paddlespeech.cli.text.infer.rst
│ │ │ ├── paddlespeech.cli.text.rst
│ │ │ ├── paddlespeech.cli.tts.infer.rst
│ │ │ ├── paddlespeech.cli.tts.rst
│ │ │ ├── paddlespeech.cli.utils.rst
│ │ │ ├── paddlespeech.cli.vector.infer.rst
│ │ │ ├── paddlespeech.cli.vector.rst
│ │ │ ├── paddlespeech.cls.exps.panns.deploy.rst
│ │ │ ├── paddlespeech.cls.exps.panns.rst
│ │ │ ├── paddlespeech.cls.exps.rst
│ │ │ ├── paddlespeech.cls.models.panns.classifier.rst
│ │ │ ├── paddlespeech.cls.models.panns.panns.rst
│ │ │ ├── paddlespeech.cls.models.panns.rst
│ │ │ ├── paddlespeech.cls.models.rst
│ │ │ ├── paddlespeech.cls.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.collate.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.compute_det.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.plot_det_curve.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.score.rst
│ │ │ ├── paddlespeech.kws.exps.mdtc.train.rst
│ │ │ ├── paddlespeech.kws.exps.rst
│ │ │ ├── paddlespeech.kws.models.loss.rst
│ │ │ ├── paddlespeech.kws.models.mdtc.rst
│ │ │ ├── paddlespeech.kws.models.rst
│ │ │ ├── paddlespeech.kws.rst
│ │ │ ├── paddlespeech.resource.model_alias.rst
│ │ │ ├── paddlespeech.resource.pretrained_models.rst
│ │ │ ├── paddlespeech.resource.resource.rst
│ │ │ ├── paddlespeech.resource.rst
│ │ │ ├── paddlespeech.rst
│ │ │ ├── paddlespeech.s2t.decoders.beam_search.batch_beam_search.rst
│ │ │ ├── paddlespeech.s2t.decoders.beam_search.beam_search.rst
│ │ │ ├── paddlespeech.s2t.decoders.beam_search.rst
│ │ │ ├── paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated.rst
│ │ │ ├── paddlespeech.s2t.decoders.ctcdecoder.rst
│ │ │ ├── paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper.rst
│ │ │ ├── paddlespeech.s2t.decoders.recog.rst
│ │ │ ├── paddlespeech.s2t.decoders.rst
│ │ │ ├── paddlespeech.s2t.decoders.scorers.ctc.rst
│ │ │ ├── paddlespeech.s2t.decoders.scorers.ctc_prefix_score.rst
│ │ │ ├── paddlespeech.s2t.decoders.scorers.length_bonus.rst
│ │ │ ├── paddlespeech.s2t.decoders.scorers.rst
│ │ │ ├── paddlespeech.s2t.decoders.scorers.scorer_interface.rst
│ │ │ ├── paddlespeech.s2t.decoders.utils.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.deploy.server.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.export.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.test.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.test_export.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.test_wav.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.bin.train.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.model.rst
│ │ │ ├── paddlespeech.s2t.exps.deepspeech2.rst
│ │ │ ├── paddlespeech.s2t.exps.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.alignment.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.export.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.test.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.test_wav.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.bin.train.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.model.rst
│ │ │ ├── paddlespeech.s2t.exps.u2.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_kaldi.bin.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_kaldi.bin.test.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_kaldi.bin.train.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_kaldi.model.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_kaldi.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.bin.export.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.bin.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.bin.test.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.bin.train.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.model.rst
│ │ │ ├── paddlespeech.s2t.exps.u2_st.rst
│ │ │ ├── paddlespeech.s2t.frontend.audio.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.augmentation.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.base.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.impulse_response.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.noise_perturb.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.online_bayesian_normalization.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.resample.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.shift_perturb.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.spec_augment.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.speed_perturb.rst
│ │ │ ├── paddlespeech.s2t.frontend.augmentor.volume_perturb.rst
│ │ │ ├── paddlespeech.s2t.frontend.featurizer.audio_featurizer.rst
│ │ │ ├── paddlespeech.s2t.frontend.featurizer.rst
│ │ │ ├── paddlespeech.s2t.frontend.featurizer.speech_featurizer.rst
│ │ │ ├── paddlespeech.s2t.frontend.featurizer.text_featurizer.rst
│ │ │ ├── paddlespeech.s2t.frontend.normalizer.rst
│ │ │ ├── paddlespeech.s2t.frontend.rst
│ │ │ ├── paddlespeech.s2t.frontend.speech.rst
│ │ │ ├── paddlespeech.s2t.frontend.utility.rst
│ │ │ ├── paddlespeech.s2t.io.batchfy.rst
│ │ │ ├── paddlespeech.s2t.io.collator.rst
│ │ │ ├── paddlespeech.s2t.io.converter.rst
│ │ │ ├── paddlespeech.s2t.io.dataloader.rst
│ │ │ ├── paddlespeech.s2t.io.dataset.rst
│ │ │ ├── paddlespeech.s2t.io.reader.rst
│ │ │ ├── paddlespeech.s2t.io.rst
│ │ │ ├── paddlespeech.s2t.io.sampler.rst
│ │ │ ├── paddlespeech.s2t.io.utility.rst
│ │ │ ├── paddlespeech.s2t.models.asr_interface.rst
│ │ │ ├── paddlespeech.s2t.models.ds2.conv.rst
│ │ │ ├── paddlespeech.s2t.models.ds2.deepspeech2.rst
│ │ │ ├── paddlespeech.s2t.models.ds2.rst
│ │ │ ├── paddlespeech.s2t.models.lm.dataset.rst
│ │ │ ├── paddlespeech.s2t.models.lm.rst
│ │ │ ├── paddlespeech.s2t.models.lm.transformer.rst
│ │ │ ├── paddlespeech.s2t.models.lm_interface.rst
│ │ │ ├── paddlespeech.s2t.models.rst
│ │ │ ├── paddlespeech.s2t.models.st_interface.rst
│ │ │ ├── paddlespeech.s2t.models.u2.rst
│ │ │ ├── paddlespeech.s2t.models.u2.u2.rst
│ │ │ ├── paddlespeech.s2t.models.u2.updater.rst
│ │ │ ├── paddlespeech.s2t.models.u2_st.rst
│ │ │ ├── paddlespeech.s2t.models.u2_st.u2_st.rst
│ │ │ ├── paddlespeech.s2t.modules.activation.rst
│ │ │ ├── paddlespeech.s2t.modules.align.rst
│ │ │ ├── paddlespeech.s2t.modules.attention.rst
│ │ │ ├── paddlespeech.s2t.modules.cmvn.rst
│ │ │ ├── paddlespeech.s2t.modules.conformer_convolution.rst
│ │ │ ├── paddlespeech.s2t.modules.crf.rst
│ │ │ ├── paddlespeech.s2t.modules.ctc.rst
│ │ │ ├── paddlespeech.s2t.modules.decoder.rst
│ │ │ ├── paddlespeech.s2t.modules.decoder_layer.rst
│ │ │ ├── paddlespeech.s2t.modules.embedding.rst
│ │ │ ├── paddlespeech.s2t.modules.encoder.rst
│ │ │ ├── paddlespeech.s2t.modules.encoder_layer.rst
│ │ │ ├── paddlespeech.s2t.modules.initializer.rst
│ │ │ ├── paddlespeech.s2t.modules.loss.rst
│ │ │ ├── paddlespeech.s2t.modules.mask.rst
│ │ │ ├── paddlespeech.s2t.modules.positionwise_feed_forward.rst
│ │ │ ├── paddlespeech.s2t.modules.rst
│ │ │ ├── paddlespeech.s2t.modules.subsampling.rst
│ │ │ ├── paddlespeech.s2t.rst
│ │ │ ├── paddlespeech.s2t.training.cli.rst
│ │ │ ├── paddlespeech.s2t.training.extensions.evaluator.rst
│ │ │ ├── paddlespeech.s2t.training.extensions.extension.rst
│ │ │ ├── paddlespeech.s2t.training.extensions.plot.rst
│ │ │ ├── paddlespeech.s2t.training.extensions.rst
│ │ │ ├── paddlespeech.s2t.training.gradclip.rst
│ │ │ ├── paddlespeech.s2t.training.optimizer.rst
│ │ │ ├── paddlespeech.s2t.training.reporter.rst
│ │ │ ├── paddlespeech.s2t.training.rst
│ │ │ ├── paddlespeech.s2t.training.scheduler.rst
│ │ │ ├── paddlespeech.s2t.training.timer.rst
│ │ │ ├── paddlespeech.s2t.training.trainer.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.compare_value_trigger.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.interval_trigger.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.limit_trigger.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.time_trigger.rst
│ │ │ ├── paddlespeech.s2t.training.triggers.utils.rst
│ │ │ ├── paddlespeech.s2t.training.updaters.rst
│ │ │ ├── paddlespeech.s2t.training.updaters.standard_updater.rst
│ │ │ ├── paddlespeech.s2t.training.updaters.updater.rst
│ │ │ ├── paddlespeech.s2t.utils.asr_utils.rst
│ │ │ ├── paddlespeech.s2t.utils.bleu_score.rst
│ │ │ ├── paddlespeech.s2t.utils.check_kwargs.rst
│ │ │ ├── paddlespeech.s2t.utils.checkpoint.rst
│ │ │ ├── paddlespeech.s2t.utils.cli_readers.rst
│ │ │ ├── paddlespeech.s2t.utils.cli_utils.rst
│ │ │ ├── paddlespeech.s2t.utils.cli_writers.rst
│ │ │ ├── paddlespeech.s2t.utils.ctc_utils.rst
│ │ │ ├── paddlespeech.s2t.utils.dynamic_import.rst
│ │ │ ├── paddlespeech.s2t.utils.dynamic_pip_install.rst
│ │ │ ├── paddlespeech.s2t.utils.error_rate.rst
│ │ │ ├── paddlespeech.s2t.utils.layer_tools.rst
│ │ │ ├── paddlespeech.s2t.utils.log.rst
│ │ │ ├── paddlespeech.s2t.utils.mp_tools.rst
│ │ │ ├── paddlespeech.s2t.utils.profiler.rst
│ │ │ ├── paddlespeech.s2t.utils.rst
│ │ │ ├── paddlespeech.s2t.utils.socket_server.rst
│ │ │ ├── paddlespeech.s2t.utils.spec_augment.rst
│ │ │ ├── paddlespeech.s2t.utils.tensor_utils.rst
│ │ │ ├── paddlespeech.s2t.utils.text_grid.rst
│ │ │ ├── paddlespeech.s2t.utils.utility.rst
│ │ │ ├── paddlespeech.server.base_commands.rst
│ │ │ ├── paddlespeech.server.bin.paddlespeech_client.rst
│ │ │ ├── paddlespeech.server.bin.paddlespeech_server.rst
│ │ │ ├── paddlespeech.server.bin.rst
│ │ │ ├── paddlespeech.server.engine.acs.python.rst
│ │ │ ├── paddlespeech.server.engine.acs.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.ctc_endpoint.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.ctc_search.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.onnx.asr_engine.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.onnx.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.paddleinference.asr_engine.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.paddleinference.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.python.asr_engine.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.python.rst
│ │ │ ├── paddlespeech.server.engine.asr.online.rst
│ │ │ ├── paddlespeech.server.engine.asr.paddleinference.asr_engine.rst
│ │ │ ├── paddlespeech.server.engine.asr.paddleinference.rst
│ │ │ ├── paddlespeech.server.engine.asr.python.asr_engine.rst
│ │ │ ├── paddlespeech.server.engine.asr.python.rst
│ │ │ ├── paddlespeech.server.engine.asr.rst
│ │ │ ├── paddlespeech.server.engine.base_engine.rst
│ │ │ ├── paddlespeech.server.engine.cls.paddleinference.cls_engine.rst
│ │ │ ├── paddlespeech.server.engine.cls.paddleinference.rst
│ │ │ ├── paddlespeech.server.engine.cls.python.cls_engine.rst
│ │ │ ├── paddlespeech.server.engine.cls.python.rst
│ │ │ ├── paddlespeech.server.engine.cls.rst
│ │ │ ├── paddlespeech.server.engine.engine_factory.rst
│ │ │ ├── paddlespeech.server.engine.engine_pool.rst
│ │ │ ├── paddlespeech.server.engine.engine_warmup.rst
│ │ │ ├── paddlespeech.server.engine.rst
│ │ │ ├── paddlespeech.server.engine.text.python.rst
│ │ │ ├── paddlespeech.server.engine.text.python.text_engine.rst
│ │ │ ├── paddlespeech.server.engine.text.rst
│ │ │ ├── paddlespeech.server.engine.tts.online.onnx.rst
│ │ │ ├── paddlespeech.server.engine.tts.online.onnx.tts_engine.rst
│ │ │ ├── paddlespeech.server.engine.tts.online.python.rst
│ │ │ ├── paddlespeech.server.engine.tts.online.python.tts_engine.rst
│ │ │ ├── paddlespeech.server.engine.tts.online.rst
│ │ │ ├── paddlespeech.server.engine.tts.paddleinference.rst
│ │ │ ├── paddlespeech.server.engine.tts.paddleinference.tts_engine.rst
│ │ │ ├── paddlespeech.server.engine.tts.python.rst
│ │ │ ├── paddlespeech.server.engine.tts.python.tts_engine.rst
│ │ │ ├── paddlespeech.server.engine.tts.rst
│ │ │ ├── paddlespeech.server.engine.vector.python.rst
│ │ │ ├── paddlespeech.server.engine.vector.python.vector_engine.rst
│ │ │ ├── paddlespeech.server.engine.vector.rst
│ │ │ ├── paddlespeech.server.entry.rst
│ │ │ ├── paddlespeech.server.executor.rst
│ │ │ ├── paddlespeech.server.restful.acs_api.rst
│ │ │ ├── paddlespeech.server.restful.api.rst
│ │ │ ├── paddlespeech.server.restful.asr_api.rst
│ │ │ ├── paddlespeech.server.restful.cls_api.rst
│ │ │ ├── paddlespeech.server.restful.request.rst
│ │ │ ├── paddlespeech.server.restful.response.rst
│ │ │ ├── paddlespeech.server.restful.rst
│ │ │ ├── paddlespeech.server.restful.text_api.rst
│ │ │ ├── paddlespeech.server.restful.tts_api.rst
│ │ │ ├── paddlespeech.server.restful.vector_api.rst
│ │ │ ├── paddlespeech.server.rst
│ │ │ ├── paddlespeech.server.tests.asr.offline.http_client.rst
│ │ │ ├── paddlespeech.server.tests.asr.offline.rst
│ │ │ ├── paddlespeech.server.tests.asr.rst
│ │ │ ├── paddlespeech.server.tests.rst
│ │ │ ├── paddlespeech.server.util.rst
│ │ │ ├── paddlespeech.server.utils.audio_handler.rst
│ │ │ ├── paddlespeech.server.utils.audio_process.rst
│ │ │ ├── paddlespeech.server.utils.buffer.rst
│ │ │ ├── paddlespeech.server.utils.config.rst
│ │ │ ├── paddlespeech.server.utils.errors.rst
│ │ │ ├── paddlespeech.server.utils.exception.rst
│ │ │ ├── paddlespeech.server.utils.onnx_infer.rst
│ │ │ ├── paddlespeech.server.utils.paddle_predictor.rst
│ │ │ ├── paddlespeech.server.utils.rst
│ │ │ ├── paddlespeech.server.utils.util.rst
│ │ │ ├── paddlespeech.server.utils.vad.rst
│ │ │ ├── paddlespeech.server.ws.api.rst
│ │ │ ├── paddlespeech.server.ws.asr_api.rst
│ │ │ ├── paddlespeech.server.ws.rst
│ │ │ ├── paddlespeech.server.ws.tts_api.rst
│ │ │ ├── paddlespeech.t2s.audio.audio.rst
│ │ │ ├── paddlespeech.t2s.audio.codec.rst
│ │ │ ├── paddlespeech.t2s.audio.rst
│ │ │ ├── paddlespeech.t2s.audio.spec_normalizer.rst
│ │ │ ├── paddlespeech.t2s.datasets.am_batch_fn.rst
│ │ │ ├── paddlespeech.t2s.datasets.batch.rst
│ │ │ ├── paddlespeech.t2s.datasets.data_table.rst
│ │ │ ├── paddlespeech.t2s.datasets.dataset.rst
│ │ │ ├── paddlespeech.t2s.datasets.get_feats.rst
│ │ │ ├── paddlespeech.t2s.datasets.ljspeech.rst
│ │ │ ├── paddlespeech.t2s.datasets.preprocess_utils.rst
│ │ │ ├── paddlespeech.t2s.datasets.rst
│ │ │ ├── paddlespeech.t2s.datasets.sampler.rst
│ │ │ ├── paddlespeech.t2s.datasets.vocoder_batch_fn.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.align.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.train.rst
│ │ │ ├── paddlespeech.t2s.exps.ernie_sat.utils.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.gen_gta_mel.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.train.rst
│ │ │ ├── paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.hifigan.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.hifigan.train.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.train.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.synthesize_from_wav.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.train.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.style_melgan.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.style_melgan.train.rst
│ │ │ ├── paddlespeech.t2s.exps.gan_vocoder.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.inference.rst
│ │ │ ├── paddlespeech.t2s.exps.inference_streaming.rst
│ │ │ ├── paddlespeech.t2s.exps.ort_predict.rst
│ │ │ ├── paddlespeech.t2s.exps.ort_predict_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.ort_predict_streaming.rst
│ │ │ ├── paddlespeech.t2s.exps.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.gen_gta_mel.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.inference.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.synthesize_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.speedyspeech.train.rst
│ │ │ ├── paddlespeech.t2s.exps.stream_play_tts.rst
│ │ │ ├── paddlespeech.t2s.exps.syn_utils.rst
│ │ │ ├── paddlespeech.t2s.exps.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.synthesize_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.synthesize_streaming.rst
│ │ │ ├── paddlespeech.t2s.exps.tacotron2.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.tacotron2.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.tacotron2.rst
│ │ │ ├── paddlespeech.t2s.exps.tacotron2.train.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.synthesize_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.transformer_tts.train.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.normalize.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.synthesize_e2e.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.train.rst
│ │ │ ├── paddlespeech.t2s.exps.vits.voice_cloning.rst
│ │ │ ├── paddlespeech.t2s.exps.voice_cloning.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.config.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.ljspeech.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.preprocess.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.waveflow.train.rst
│ │ │ ├── paddlespeech.t2s.exps.wavernn.rst
│ │ │ ├── paddlespeech.t2s.exps.wavernn.synthesize.rst
│ │ │ ├── paddlespeech.t2s.exps.wavernn.train.rst
│ │ │ ├── paddlespeech.t2s.frontend.arpabet.rst
│ │ │ ├── paddlespeech.t2s.frontend.g2pw.dataset.rst
│ │ │ ├── paddlespeech.t2s.frontend.g2pw.onnx_api.rst
│ │ │ ├── paddlespeech.t2s.frontend.g2pw.rst
│ │ │ ├── paddlespeech.t2s.frontend.g2pw.utils.rst
│ │ │ ├── paddlespeech.t2s.frontend.generate_lexicon.rst
│ │ │ ├── paddlespeech.t2s.frontend.mix_frontend.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.abbrrviation.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.acronyms.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.normalizer.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.numbers.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.rst
│ │ │ ├── paddlespeech.t2s.frontend.normalizer.width.rst
│ │ │ ├── paddlespeech.t2s.frontend.phonectic.rst
│ │ │ ├── paddlespeech.t2s.frontend.punctuation.rst
│ │ │ ├── paddlespeech.t2s.frontend.rst
│ │ │ ├── paddlespeech.t2s.frontend.tone_sandhi.rst
│ │ │ ├── paddlespeech.t2s.frontend.vocab.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_frontend.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.char_convert.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.chronology.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.constants.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.num.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.phonecode.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.quantifier.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.rst
│ │ │ ├── paddlespeech.t2s.frontend.zh_normalization.text_normlization.rst
│ │ │ ├── paddlespeech.t2s.models.ernie_sat.ernie_sat.rst
│ │ │ ├── paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst
│ │ │ ├── paddlespeech.t2s.models.ernie_sat.rst
│ │ │ ├── paddlespeech.t2s.models.fastspeech2.fastspeech2.rst
│ │ │ ├── paddlespeech.t2s.models.fastspeech2.fastspeech2_updater.rst
│ │ │ ├── paddlespeech.t2s.models.fastspeech2.rst
│ │ │ ├── paddlespeech.t2s.models.hifigan.hifigan.rst
│ │ │ ├── paddlespeech.t2s.models.hifigan.hifigan_updater.rst
│ │ │ ├── paddlespeech.t2s.models.hifigan.rst
│ │ │ ├── paddlespeech.t2s.models.melgan.melgan.rst
│ │ │ ├── paddlespeech.t2s.models.melgan.multi_band_melgan_updater.rst
│ │ │ ├── paddlespeech.t2s.models.melgan.rst
│ │ │ ├── paddlespeech.t2s.models.melgan.style_melgan.rst
│ │ │ ├── paddlespeech.t2s.models.melgan.style_melgan_updater.rst
│ │ │ ├── paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan.rst
│ │ │ ├── paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan_updater.rst
│ │ │ ├── paddlespeech.t2s.models.parallel_wavegan.rst
│ │ │ ├── paddlespeech.t2s.models.rst
│ │ │ ├── paddlespeech.t2s.models.speedyspeech.rst
│ │ │ ├── paddlespeech.t2s.models.speedyspeech.speedyspeech.rst
│ │ │ ├── paddlespeech.t2s.models.speedyspeech.speedyspeech_updater.rst
│ │ │ ├── paddlespeech.t2s.models.tacotron2.rst
│ │ │ ├── paddlespeech.t2s.models.tacotron2.tacotron2.rst
│ │ │ ├── paddlespeech.t2s.models.tacotron2.tacotron2_updater.rst
│ │ │ ├── paddlespeech.t2s.models.transformer_tts.rst
│ │ │ ├── paddlespeech.t2s.models.transformer_tts.transformer_tts.rst
│ │ │ ├── paddlespeech.t2s.models.transformer_tts.transformer_tts_updater.rst
│ │ │ ├── paddlespeech.t2s.models.vits.duration_predictor.rst
│ │ │ ├── paddlespeech.t2s.models.vits.flow.rst
│ │ │ ├── paddlespeech.t2s.models.vits.generator.rst
│ │ │ ├── paddlespeech.t2s.models.vits.monotonic_align.core.rst
│ │ │ ├── paddlespeech.t2s.models.vits.monotonic_align.rst
│ │ │ ├── paddlespeech.t2s.models.vits.monotonic_align.setup.rst
│ │ │ ├── paddlespeech.t2s.models.vits.posterior_encoder.rst
│ │ │ ├── paddlespeech.t2s.models.vits.residual_coupling.rst
│ │ │ ├── paddlespeech.t2s.models.vits.rst
│ │ │ ├── paddlespeech.t2s.models.vits.text_encoder.rst
│ │ │ ├── paddlespeech.t2s.models.vits.transform.rst
│ │ │ ├── paddlespeech.t2s.models.vits.vits.rst
│ │ │ ├── paddlespeech.t2s.models.vits.vits_updater.rst
│ │ │ ├── paddlespeech.t2s.models.vits.wavenet.residual_block.rst
│ │ │ ├── paddlespeech.t2s.models.vits.wavenet.rst
│ │ │ ├── paddlespeech.t2s.models.vits.wavenet.wavenet.rst
│ │ │ ├── paddlespeech.t2s.models.waveflow.rst
│ │ │ ├── paddlespeech.t2s.models.wavernn.rst
│ │ │ ├── paddlespeech.t2s.models.wavernn.wavernn.rst
│ │ │ ├── paddlespeech.t2s.models.wavernn.wavernn_updater.rst
│ │ │ ├── paddlespeech.t2s.modules.activation.rst
│ │ │ ├── paddlespeech.t2s.modules.causal_conv.rst
│ │ │ ├── paddlespeech.t2s.modules.conformer.convolution.rst
│ │ │ ├── paddlespeech.t2s.modules.conformer.encoder_layer.rst
│ │ │ ├── paddlespeech.t2s.modules.conformer.rst
│ │ │ ├── paddlespeech.t2s.modules.conv.rst
│ │ │ ├── paddlespeech.t2s.modules.geometry.rst
│ │ │ ├── paddlespeech.t2s.modules.layer_norm.rst
│ │ │ ├── paddlespeech.t2s.modules.losses.rst
│ │ │ ├── paddlespeech.t2s.modules.masked_fill.rst
│ │ │ ├── paddlespeech.t2s.modules.nets_utils.rst
│ │ │ ├── paddlespeech.t2s.modules.normalizer.rst
│ │ │ ├── paddlespeech.t2s.modules.positional_encoding.rst
│ │ │ ├── paddlespeech.t2s.modules.pqmf.rst
│ │ │ ├── paddlespeech.t2s.modules.predictor.duration_predictor.rst
│ │ │ ├── paddlespeech.t2s.modules.predictor.length_regulator.rst
│ │ │ ├── paddlespeech.t2s.modules.predictor.rst
│ │ │ ├── paddlespeech.t2s.modules.predictor.variance_predictor.rst
│ │ │ ├── paddlespeech.t2s.modules.residual_block.rst
│ │ │ ├── paddlespeech.t2s.modules.residual_stack.rst
│ │ │ ├── paddlespeech.t2s.modules.rst
│ │ │ ├── paddlespeech.t2s.modules.style_encoder.rst
│ │ │ ├── paddlespeech.t2s.modules.tacotron2.attentions.rst
│ │ │ ├── paddlespeech.t2s.modules.tacotron2.decoder.rst
│ │ │ ├── paddlespeech.t2s.modules.tacotron2.encoder.rst
│ │ │ ├── paddlespeech.t2s.modules.tacotron2.rst
│ │ │ ├── paddlespeech.t2s.modules.tade_res_block.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.attention.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.decoder.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.decoder_layer.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.embedding.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.encoder.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.encoder_layer.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.lightconv.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.mask.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.multi_layer_conv.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.positionwise_feed_forward.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.repeat.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.rst
│ │ │ ├── paddlespeech.t2s.modules.transformer.subsampling.rst
│ │ │ ├── paddlespeech.t2s.modules.upsample.rst
│ │ │ ├── paddlespeech.t2s.rst
│ │ │ ├── paddlespeech.t2s.training.cli.rst
│ │ │ ├── paddlespeech.t2s.training.default_config.rst
│ │ │ ├── paddlespeech.t2s.training.experiment.rst
│ │ │ ├── paddlespeech.t2s.training.extension.rst
│ │ │ ├── paddlespeech.t2s.training.extensions.evaluator.rst
│ │ │ ├── paddlespeech.t2s.training.extensions.rst
│ │ │ ├── paddlespeech.t2s.training.extensions.snapshot.rst
│ │ │ ├── paddlespeech.t2s.training.extensions.visualizer.rst
│ │ │ ├── paddlespeech.t2s.training.optimizer.rst
│ │ │ ├── paddlespeech.t2s.training.reporter.rst
│ │ │ ├── paddlespeech.t2s.training.rst
│ │ │ ├── paddlespeech.t2s.training.seeding.rst
│ │ │ ├── paddlespeech.t2s.training.trainer.rst
│ │ │ ├── paddlespeech.t2s.training.trigger.rst
│ │ │ ├── paddlespeech.t2s.training.triggers.interval_trigger.rst
│ │ │ ├── paddlespeech.t2s.training.triggers.limit_trigger.rst
│ │ │ ├── paddlespeech.t2s.training.triggers.rst
│ │ │ ├── paddlespeech.t2s.training.triggers.time_trigger.rst
│ │ │ ├── paddlespeech.t2s.training.updater.rst
│ │ │ ├── paddlespeech.t2s.training.updaters.rst
│ │ │ ├── paddlespeech.t2s.training.updaters.standard_updater.rst
│ │ │ ├── paddlespeech.t2s.utils.checkpoint.rst
│ │ │ ├── paddlespeech.t2s.utils.display.rst
│ │ │ ├── paddlespeech.t2s.utils.error_rate.rst
│ │ │ ├── paddlespeech.t2s.utils.h5_utils.rst
│ │ │ ├── paddlespeech.t2s.utils.internals.rst
│ │ │ ├── paddlespeech.t2s.utils.layer_tools.rst
│ │ │ ├── paddlespeech.t2s.utils.mp_tools.rst
│ │ │ ├── paddlespeech.t2s.utils.profiler.rst
│ │ │ ├── paddlespeech.t2s.utils.rst
│ │ │ ├── paddlespeech.t2s.utils.scheduler.rst
│ │ │ ├── paddlespeech.text.exps.ernie_linear.avg_model.rst
│ │ │ ├── paddlespeech.text.exps.ernie_linear.punc_restore.rst
│ │ │ ├── paddlespeech.text.exps.ernie_linear.rst
│ │ │ ├── paddlespeech.text.exps.ernie_linear.test.rst
│ │ │ ├── paddlespeech.text.exps.ernie_linear.train.rst
│ │ │ ├── paddlespeech.text.exps.rst
│ │ │ ├── paddlespeech.text.models.ernie_crf.model.rst
│ │ │ ├── paddlespeech.text.models.ernie_crf.rst
│ │ │ ├── paddlespeech.text.models.ernie_linear.dataset.rst
│ │ │ ├── paddlespeech.text.models.ernie_linear.ernie_linear.rst
│ │ │ ├── paddlespeech.text.models.ernie_linear.ernie_linear_updater.rst
│ │ │ ├── paddlespeech.text.models.ernie_linear.rst
│ │ │ ├── paddlespeech.text.models.rst
│ │ │ ├── paddlespeech.text.rst
│ │ │ ├── paddlespeech.utils.dynamic_import.rst
│ │ │ ├── paddlespeech.utils.env.rst
│ │ │ ├── paddlespeech.utils.rst
│ │ │ ├── paddlespeech.vector.cluster.diarization.rst
│ │ │ ├── paddlespeech.vector.cluster.plda.rst
│ │ │ ├── paddlespeech.vector.cluster.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.audio_processor.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.config.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.dataset_processors.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.inference.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.preprocess.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.random_cycle.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.speaker_verification_dataset.rst
│ │ │ ├── paddlespeech.vector.exps.ge2e.train.rst
│ │ │ ├── paddlespeech.vector.exps.rst
│ │ │ ├── paddlespeech.vector.io.augment.rst
│ │ │ ├── paddlespeech.vector.io.batch.rst
│ │ │ ├── paddlespeech.vector.io.dataset.rst
│ │ │ ├── paddlespeech.vector.io.dataset_from_json.rst
│ │ │ ├── paddlespeech.vector.io.embedding_norm.rst
│ │ │ ├── paddlespeech.vector.io.rst
│ │ │ ├── paddlespeech.vector.io.signal_processing.rst
│ │ │ ├── paddlespeech.vector.models.ecapa_tdnn.rst
│ │ │ ├── paddlespeech.vector.models.lstm_speaker_encoder.rst
│ │ │ ├── paddlespeech.vector.models.rst
│ │ │ ├── paddlespeech.vector.modules.layer.rst
│ │ │ ├── paddlespeech.vector.modules.loss.rst
│ │ │ ├── paddlespeech.vector.modules.rst
│ │ │ ├── paddlespeech.vector.modules.sid_model.rst
│ │ │ ├── paddlespeech.vector.rst
│ │ │ ├── paddlespeech.vector.training.rst
│ │ │ ├── paddlespeech.vector.training.scheduler.rst
│ │ │ ├── paddlespeech.vector.training.seeding.rst
│ │ │ ├── paddlespeech.vector.utils.rst
│ │ │ ├── paddlespeech.vector.utils.time.rst
│ │ │ ├── paddlespeech.vector.utils.vector_utils.rst
│ │ │ └── paddlespeech.version.rst
│ │ ├── asr/
│ │ │ ├── PPASR.md
│ │ │ ├── PPASR_cn.md
│ │ │ ├── data_preparation.md
│ │ │ ├── feature_list.md
│ │ │ ├── models_introduction.md
│ │ │ ├── ngram_lm.md
│ │ │ └── quick_start.md
│ │ ├── audio/
│ │ │ ├── _static/
│ │ │ │ └── custom.css
│ │ │ ├── _templates/
│ │ │ │ ├── module.rst_t
│ │ │ │ ├── package.rst_t
│ │ │ │ └── toc.rst_t
│ │ │ ├── conf.py
│ │ │ └── index.rst
│ │ ├── audio_api/
│ │ │ ├── modules.rst
│ │ │ ├── paddleaudio.backends.common.rst
│ │ │ ├── paddleaudio.backends.no_backend.rst
│ │ │ ├── paddleaudio.backends.rst
│ │ │ ├── paddleaudio.backends.soundfile_backend.rst
│ │ │ ├── paddleaudio.backends.sox_io_backend.rst
│ │ │ ├── paddleaudio.backends.utils.rst
│ │ │ ├── paddleaudio.compliance.kaldi.rst
│ │ │ ├── paddleaudio.compliance.librosa.rst
│ │ │ ├── paddleaudio.compliance.rst
│ │ │ ├── paddleaudio.datasets.dataset.rst
│ │ │ ├── paddleaudio.datasets.esc50.rst
│ │ │ ├── paddleaudio.datasets.gtzan.rst
│ │ │ ├── paddleaudio.datasets.hey_snips.rst
│ │ │ ├── paddleaudio.datasets.rirs_noises.rst
│ │ │ ├── paddleaudio.datasets.rst
│ │ │ ├── paddleaudio.datasets.tess.rst
│ │ │ ├── paddleaudio.datasets.urban_sound.rst
│ │ │ ├── paddleaudio.datasets.voxceleb.rst
│ │ │ ├── paddleaudio.features.layers.rst
│ │ │ ├── paddleaudio.features.rst
│ │ │ ├── paddleaudio.functional.functional.rst
│ │ │ ├── paddleaudio.functional.rst
│ │ │ ├── paddleaudio.functional.window.rst
│ │ │ ├── paddleaudio.kaldi.kaldi.rst
│ │ │ ├── paddleaudio.kaldi.rst
│ │ │ ├── paddleaudio.metric.eer.rst
│ │ │ ├── paddleaudio.metric.rst
│ │ │ ├── paddleaudio.rst
│ │ │ ├── paddleaudio.sox_effects.rst
│ │ │ ├── paddleaudio.sox_effects.sox_effects.rst
│ │ │ ├── paddleaudio.utils.download.rst
│ │ │ ├── paddleaudio.utils.env.rst
│ │ │ ├── paddleaudio.utils.error.rst
│ │ │ ├── paddleaudio.utils.log.rst
│ │ │ ├── paddleaudio.utils.numeric.rst
│ │ │ ├── paddleaudio.utils.rst
│ │ │ ├── paddleaudio.utils.sox_utils.rst
│ │ │ ├── paddleaudio.utils.tensor_utils.rst
│ │ │ └── paddleaudio.utils.time.rst
│ │ ├── cls/
│ │ │ ├── custom_dataset.md
│ │ │ └── quick_start.md
│ │ ├── conf.py
│ │ ├── demo_video.rst
│ │ ├── dependencies.md
│ │ ├── index.rst
│ │ ├── install.md
│ │ ├── install_cn.md
│ │ ├── introduction.md
│ │ ├── reference.md
│ │ ├── released_model.md
│ │ ├── streaming_asr_demo_video.rst
│ │ ├── streaming_tts_demo_video.rst
│ │ ├── tts/
│ │ │ ├── PPTTS.md
│ │ │ ├── PPTTS_cn.md
│ │ │ ├── README.md
│ │ │ ├── advanced_usage.md
│ │ │ ├── demo.rst
│ │ │ ├── demo_2.rst
│ │ │ ├── gan_vocoder.md
│ │ │ ├── models_introduction.md
│ │ │ ├── quick_start.md
│ │ │ ├── quick_start_cn.md
│ │ │ ├── svs_music_score.md
│ │ │ ├── test_sentence.txt
│ │ │ ├── tts_datasets.md
│ │ │ ├── tts_papers.md
│ │ │ └── zh_text_frontend.md
│ │ ├── tts_demo_video.rst
│ │ └── vpr/
│ │ ├── PPVPR.md
│ │ └── PPVPR_cn.md
│ ├── topic/
│ │ ├── ctc/
│ │ │ ├── ctc_loss.ipynb
│ │ │ ├── ctc_loss_compare.ipynb
│ │ │ └── ctc_loss_speed_compare.ipynb
│ │ ├── frontend/
│ │ │ └── g2p.md
│ │ ├── gan_vocoder/
│ │ │ └── gan_vocoder.ipynb
│ │ └── package_release/
│ │ └── python_package_release.md
│ └── tutorial/
│ ├── .gitkeep
│ ├── asr/
│ │ ├── tutorial_deepspeech2.ipynb
│ │ └── tutorial_transformer.ipynb
│ ├── cls/
│ │ └── cls_tutorial.ipynb
│ ├── st/
│ │ └── st_tutorial.ipynb
│ └── tts/
│ └── tts_tutorial.ipynb
├── examples/
│ ├── aishell/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── asr0/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── conf/
│ │ │ │ ├── deepspeech2.yaml
│ │ │ │ ├── deepspeech2_online.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ └── tuning/
│ │ │ │ ├── chunk_decode.yaml
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── download_lm_ch.sh
│ │ │ │ ├── export.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_export.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── asr1/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── conf/
│ │ │ │ ├── augmentation.json
│ │ │ │ ├── chunk_conformer.yaml
│ │ │ │ ├── chunk_roformer.yaml
│ │ │ │ ├── chunk_roformer_bidecoder.yaml
│ │ │ │ ├── chunk_squeezeformer.yaml
│ │ │ │ ├── conformer.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ ├── squeezeformer.yaml
│ │ │ │ ├── transformer.yaml
│ │ │ │ └── tuning/
│ │ │ │ ├── chunk_decode.yaml
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── aishell_train_lms.sh
│ │ │ │ ├── align.sh
│ │ │ │ ├── data.sh
│ │ │ │ ├── export.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ ├── tlg.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── asr3/
│ │ ├── README.md
│ │ ├── RESULT.md
│ │ ├── cmd.sh
│ │ ├── conf/
│ │ │ ├── preprocess.yaml
│ │ │ ├── train_with_wav2vec.yaml
│ │ │ ├── tuning/
│ │ │ │ └── decode.yaml
│ │ │ ├── wav2vec2ASR.yaml
│ │ │ └── wav2vec2ASR_adadelta.yaml
│ │ ├── local/
│ │ │ ├── aishell_prepare.py
│ │ │ ├── data.sh
│ │ │ ├── test.sh
│ │ │ ├── test_wav.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── aishell3/
│ │ ├── README.md
│ │ ├── ernie_sat/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── tts3/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ ├── conformer.yaml
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── ort_predict.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ └── run.sh
│ │ ├── vc0/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── train.sh
│ │ │ │ └── voice_cloning.sh
│ │ │ └── run.sh
│ │ ├── vc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── voice_cloning.sh
│ │ │ └── run.sh
│ │ ├── vc2/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ └── voice_cloning.sh
│ │ │ └── run.sh
│ │ ├── vits/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── vits-vc/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── train.sh
│ │ │ │ └── voice_cloning.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ └── preprocess.sh
│ │ │ └── run.sh
│ │ └── voc5/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ └── run.sh
│ ├── aishell3_vctk/
│ │ ├── README.md
│ │ └── ernie_sat/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ ├── local/
│ │ │ ├── preprocess.sh
│ │ │ └── synthesize_e2e.sh
│ │ └── run.sh
│ ├── ami/
│ │ ├── README.md
│ │ └── sd0/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── ecapa_tdnn.yaml
│ │ ├── local/
│ │ │ ├── ami_prepare.py
│ │ │ ├── ami_splits.py
│ │ │ ├── compute_embdding.py
│ │ │ ├── dataio.py
│ │ │ ├── experiment.py
│ │ │ └── process.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── callcenter/
│ │ ├── README.md
│ │ └── asr1/
│ │ ├── .gitignore
│ │ ├── RESULTS.md
│ │ ├── conf/
│ │ │ ├── augmentation.json
│ │ │ ├── chunk_conformer.yaml
│ │ │ ├── conformer.yaml
│ │ │ ├── preprocess.yaml
│ │ │ └── tuning/
│ │ │ ├── chunk_decode.yaml
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── align.sh
│ │ │ ├── data.sh
│ │ │ ├── download_lm_ch.sh
│ │ │ ├── export.sh
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── canton/
│ │ └── tts3/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ ├── local/
│ │ │ ├── inference.sh
│ │ │ ├── ort_predict.sh
│ │ │ ├── preprocess.sh
│ │ │ └── synthesize_e2e.sh
│ │ └── run.sh
│ ├── csmsc/
│ │ ├── README.md
│ │ ├── jets/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── tts0/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── tts2/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── inference_mlu.sh
│ │ │ │ ├── inference_npu.sh
│ │ │ │ ├── inference_xpu.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── ort_predict.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ ├── synthesize_e2e_mlu.sh
│ │ │ │ ├── synthesize_e2e_npu.sh
│ │ │ │ ├── synthesize_e2e_xpu.sh
│ │ │ │ ├── synthesize_mlu.sh
│ │ │ │ ├── synthesize_npu.sh
│ │ │ │ ├── synthesize_xpu.sh
│ │ │ │ ├── train.sh
│ │ │ │ ├── train_mlu.sh
│ │ │ │ ├── train_npu.sh
│ │ │ │ └── train_xpu.sh
│ │ │ ├── path.sh
│ │ │ ├── run.sh
│ │ │ ├── run_mlu.sh
│ │ │ ├── run_npu.sh
│ │ │ └── run_xpu.sh
│ │ ├── tts3/
│ │ │ ├── README.md
│ │ │ ├── README_cn.md
│ │ │ ├── conf/
│ │ │ │ ├── cnndecoder.yaml
│ │ │ │ ├── conformer.yaml
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── PTQ_dynamic.sh
│ │ │ │ ├── PTQ_static.sh
│ │ │ │ ├── export2lite.sh
│ │ │ │ ├── inference.sh
│ │ │ │ ├── inference_streaming.sh
│ │ │ │ ├── inference_xpu.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── lite_predict_streaming.sh
│ │ │ │ ├── ort_predict.sh
│ │ │ │ ├── ort_predict_streaming.sh
│ │ │ │ ├── paddle2onnx.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── simple.lexicon
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ ├── synthesize_e2e_xpu.sh
│ │ │ │ ├── synthesize_streaming.sh
│ │ │ │ ├── synthesize_xpu.sh
│ │ │ │ ├── train.sh
│ │ │ │ └── train_xpu.sh
│ │ │ ├── path.sh
│ │ │ ├── run.sh
│ │ │ ├── run_cnndecoder.sh
│ │ │ └── run_xpu.sh
│ │ ├── tts3_rhy/
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ └── synthesize_e2e.sh
│ │ │ └── run.sh
│ │ ├── vits/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── PTQ_static.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc3/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ ├── default.yaml
│ │ │ │ └── finetune.yaml
│ │ │ ├── local/
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc4/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ └── synthesize.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc5/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ ├── default.yaml
│ │ │ │ ├── finetune.yaml
│ │ │ │ └── iSTFT.yaml
│ │ │ ├── finetune.sh
│ │ │ ├── iSTFTNet.md
│ │ │ ├── local/
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── voc6/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ ├── local/
│ │ │ ├── preprocess.sh
│ │ │ └── synthesize.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── esc50/
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ └── cls0/
│ │ ├── conf/
│ │ │ └── panns.yaml
│ │ ├── local/
│ │ │ ├── export.sh
│ │ │ ├── infer.sh
│ │ │ ├── static_model_infer.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── hey_snips/
│ │ ├── README.md
│ │ └── kws0/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── mdtc.yaml
│ │ ├── local/
│ │ │ ├── plot.sh
│ │ │ ├── score.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── iwslt2012/
│ │ └── punc0/
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── conf/
│ │ │ ├── default.yaml
│ │ │ ├── ernie-3.0-base.yaml
│ │ │ ├── ernie-3.0-medium.yaml
│ │ │ ├── ernie-3.0-mini.yaml
│ │ │ ├── ernie-3.0-nano-zh.yaml
│ │ │ └── ernie-tiny.yaml
│ │ ├── local/
│ │ │ ├── data.sh
│ │ │ ├── preprocess.py
│ │ │ ├── punc_restore.sh
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── librispeech/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── asr0/
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── conf/
│ │ │ │ ├── deepspeech2.yaml
│ │ │ │ ├── deepspeech2_online.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ └── tuning/
│ │ │ │ ├── chunk_decode.yaml
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── download_lm_en.sh
│ │ │ │ ├── export.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── asr1/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── cmd.sh
│ │ │ ├── conf/
│ │ │ │ ├── augmentation.json
│ │ │ │ ├── chunk_conformer.yaml
│ │ │ │ ├── chunk_transformer.yaml
│ │ │ │ ├── conformer.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ ├── transformer.yaml
│ │ │ │ └── tuning/
│ │ │ │ ├── chunk_decode.yaml
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── align.sh
│ │ │ │ ├── data.sh
│ │ │ │ ├── download_lm_en.sh
│ │ │ │ ├── export.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── asr2/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── cmd.sh
│ │ │ ├── conf/
│ │ │ │ ├── augmentation.json
│ │ │ │ ├── decode/
│ │ │ │ │ ├── decode.yaml
│ │ │ │ │ ├── decode_att.yaml
│ │ │ │ │ ├── decode_base.yaml
│ │ │ │ │ ├── decode_ctc.yaml
│ │ │ │ │ └── decode_wo_lm.yaml
│ │ │ │ ├── fbank.conf
│ │ │ │ ├── lm/
│ │ │ │ │ └── transformer.yaml
│ │ │ │ ├── pitch.conf
│ │ │ │ ├── preprocess.yaml
│ │ │ │ └── transformer.yaml
│ │ │ ├── local/
│ │ │ │ ├── align.sh
│ │ │ │ ├── cacu_perplexity.sh
│ │ │ │ ├── data.sh
│ │ │ │ ├── data_prep.sh
│ │ │ │ ├── download_lm_en.sh
│ │ │ │ ├── espnet_json_to_manifest.py
│ │ │ │ ├── export.sh
│ │ │ │ ├── recog.sh
│ │ │ │ ├── test.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── asr3/
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── cmd.sh
│ │ │ ├── conf/
│ │ │ │ ├── preprocess.yaml
│ │ │ │ ├── tuning/
│ │ │ │ │ └── decode.yaml
│ │ │ │ └── wav2vec2ASR.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── asr4/
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── cmd.sh
│ │ │ ├── conf/
│ │ │ │ ├── config.json
│ │ │ │ ├── hubertASR.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ ├── preprocessor_config.json
│ │ │ │ └── tuning/
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── test.sh
│ │ │ │ ├── test_wav.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── asr5/
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── avg.sh
│ │ ├── cmd.sh
│ │ ├── compute_wer.py
│ │ ├── conf/
│ │ │ ├── preprocess.yaml
│ │ │ ├── preprocessor_config.json
│ │ │ ├── tuning/
│ │ │ │ └── decode.yaml
│ │ │ └── wavlmASR.yaml
│ │ ├── local/
│ │ │ ├── data.sh
│ │ │ ├── test.sh
│ │ │ ├── test_wav.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── ljspeech/
│ │ ├── README.md
│ │ ├── tts0/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ └── run.sh
│ │ ├── tts1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── tts3/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── ort_predict.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ └── run.sh
│ │ ├── voc0/
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ └── preprocess.sh
│ │ │ └── run.sh
│ │ └── voc5/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ └── run.sh
│ ├── mustc/
│ │ └── st1/
│ │ ├── cmd.sh
│ │ ├── conf/
│ │ │ ├── fbank.conf
│ │ │ ├── pitch.conf
│ │ │ ├── transformer_de.yaml
│ │ │ ├── transformer_es.yaml
│ │ │ ├── transformer_fr.yaml
│ │ │ ├── transformer_it.yaml
│ │ │ ├── transformer_nl.yaml
│ │ │ ├── transformer_pt.yaml
│ │ │ ├── transformer_ro.yaml
│ │ │ └── transformer_ru.yaml
│ │ ├── local/
│ │ │ ├── augmentation.json
│ │ │ ├── data.sh
│ │ │ ├── data_prep.sh
│ │ │ ├── divide_lang.sh
│ │ │ ├── remove_punctuation.pl
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── opencpop/
│ │ ├── README.md
│ │ ├── svs1/
│ │ │ ├── README.md
│ │ │ ├── README_cn.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── pinyin_to_phone.txt
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ ├── synthesize_e2e.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── dygraph_to_static.sh
│ │ │ │ └── preprocess.sh
│ │ │ └── run.sh
│ │ └── voc5/
│ │ ├── conf/
│ │ │ ├── default.yaml
│ │ │ └── finetune.yaml
│ │ ├── finetune.sh
│ │ ├── local/
│ │ │ └── dygraph_to_static.sh
│ │ └── run.sh
│ ├── other/
│ │ ├── augmentation/
│ │ │ └── augmentation.json
│ │ ├── cc-cedict/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ └── parser.py
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── g2p/
│ │ │ ├── README.md
│ │ │ ├── compare_badcase.py
│ │ │ ├── get_g2p_data.py
│ │ │ ├── path.sh
│ │ │ ├── run.sh
│ │ │ └── test_g2p.py
│ │ ├── ge2e/
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── mfa/
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ ├── detect_oov.py
│ │ │ │ ├── generate_canton_lexicon_wavlabs.py
│ │ │ │ ├── generate_lexicon.py
│ │ │ │ ├── reorganize_aishell3.py
│ │ │ │ ├── reorganize_baker.py
│ │ │ │ ├── reorganize_ljspeech.py
│ │ │ │ └── reorganize_vctk.py
│ │ │ ├── run.sh
│ │ │ └── run_canton.sh
│ │ ├── ngram_lm/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ └── s0/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── data/
│ │ │ │ ├── README.md
│ │ │ │ ├── custom_confusion.txt
│ │ │ │ └── text_correct.txt
│ │ │ ├── local/
│ │ │ │ ├── build_zh_lm.sh
│ │ │ │ ├── download_lm_zh.sh
│ │ │ │ └── kenlm_score_test.py
│ │ │ ├── path.sh
│ │ │ ├── requirements.txt
│ │ │ └── run.sh
│ │ ├── punctuation_restoration/
│ │ │ └── README.md
│ │ ├── rhy/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── data/
│ │ │ │ └── rhy_token
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── pre_for_sp_aishell.py
│ │ │ │ ├── pre_for_sp_csmsc.py
│ │ │ │ ├── rhy_predict.sh
│ │ │ │ ├── test.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── spm/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── path.sh
│ │ │ ├── run.sh
│ │ │ └── text
│ │ ├── tn/
│ │ │ ├── README.md
│ │ │ ├── data/
│ │ │ │ └── textnorm_test_cases.txt
│ │ │ ├── get_textnorm_data.py
│ │ │ ├── path.sh
│ │ │ ├── run.sh
│ │ │ └── test_textnorm.py
│ │ └── tts_finetune/
│ │ └── tts3/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ ├── fastspeech2_layers.txt
│ │ │ └── finetune.yaml
│ │ ├── local/
│ │ │ ├── check_oov.py
│ │ │ ├── extract_feature.py
│ │ │ ├── finetune.py
│ │ │ ├── generate_duration.py
│ │ │ ├── get_mfa_result.py
│ │ │ └── prepare_env.py
│ │ ├── path.sh
│ │ ├── run.sh
│ │ ├── run_en.sh
│ │ └── run_mix.sh
│ ├── tal_cs/
│ │ └── asr1/
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── conf/
│ │ │ ├── chunk_conformer.yaml
│ │ │ ├── conformer.yaml
│ │ │ ├── preprocess.yaml
│ │ │ └── tuning/
│ │ │ ├── chunk_decode.yaml
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── data.sh
│ │ │ ├── test.sh
│ │ │ ├── test_wav.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── ted_en_zh/
│ │ ├── README.md
│ │ ├── st0/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── conf/
│ │ │ │ ├── preprocess.yaml
│ │ │ │ ├── transformer.yaml
│ │ │ │ ├── transformer_mtl_noam.yaml
│ │ │ │ └── tuning/
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── test.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── st1/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── cmd.sh
│ │ ├── conf/
│ │ │ ├── fbank.conf
│ │ │ ├── pitch.conf
│ │ │ ├── preprocess.yaml
│ │ │ ├── transformer.yaml
│ │ │ ├── transformer_mtl_noam.yaml
│ │ │ └── tuning/
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── convert_torch_to_paddle.py
│ │ │ ├── data.sh
│ │ │ ├── divide_lang.sh
│ │ │ ├── download_pretrain.sh
│ │ │ ├── remove_punctuation.pl
│ │ │ ├── ted_en_zh.py
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── tess/
│ │ ├── README.md
│ │ └── cls0/
│ │ ├── conf/
│ │ │ ├── panns_logmelspectrogram.yaml
│ │ │ ├── panns_melspectrogram.yaml
│ │ │ ├── panns_mfcc.yaml
│ │ │ └── panns_spectrogram.yaml
│ │ ├── local/
│ │ │ ├── train.py
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── thchs30/
│ │ ├── README.md
│ │ └── align0/
│ │ ├── README.md
│ │ ├── data/
│ │ │ └── dict/
│ │ │ └── syllable.lexicon
│ │ ├── local/
│ │ │ ├── data.sh
│ │ │ ├── gen_word2phone.py
│ │ │ └── reorganize_thchs30.py
│ │ ├── path.sh
│ │ └── run.sh
│ ├── timit/
│ │ ├── README.md
│ │ └── asr1/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── conf/
│ │ │ ├── augmentation.json
│ │ │ ├── dev_spk.list
│ │ │ ├── preprocess.yaml
│ │ │ ├── test_spk.list
│ │ │ ├── transformer.yaml
│ │ │ └── tuning/
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── align.sh
│ │ │ ├── data.sh
│ │ │ ├── export.sh
│ │ │ ├── test.sh
│ │ │ ├── timit_data_prep.sh
│ │ │ ├── timit_norm_trans.pl
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── tiny/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── asr0/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ ├── deepspeech2.yaml
│ │ │ │ ├── deepspeech2_online.yaml
│ │ │ │ ├── preprocess.yaml
│ │ │ │ └── tuning/
│ │ │ │ ├── chunk_decode.yaml
│ │ │ │ └── decode.yaml
│ │ │ ├── local/
│ │ │ │ ├── data.sh
│ │ │ │ ├── download_lm_en.sh
│ │ │ │ ├── export.sh
│ │ │ │ ├── test.sh
│ │ │ │ └── train.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── asr1/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── conf/
│ │ │ ├── augmentation.json
│ │ │ ├── chunk_confermer.yaml
│ │ │ ├── chunk_transformer.yaml
│ │ │ ├── conformer.yaml
│ │ │ ├── preprocess.yaml
│ │ │ ├── transformer.yaml
│ │ │ └── tuning/
│ │ │ ├── chunk_decode.yaml
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── align.sh
│ │ │ ├── data.sh
│ │ │ ├── export.sh
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── vctk/
│ │ ├── README.md
│ │ ├── ernie_sat/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ └── run.sh
│ │ ├── tts3/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── inference.sh
│ │ │ │ ├── lite_predict.sh
│ │ │ │ ├── ort_predict.sh
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── synthesize.sh
│ │ │ │ └── synthesize_e2e.sh
│ │ │ └── run.sh
│ │ ├── vc3/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ ├── preprocess.sh
│ │ │ │ ├── train.sh
│ │ │ │ └── voice_conversion.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── voc1/
│ │ │ ├── README.md
│ │ │ ├── conf/
│ │ │ │ └── default.yaml
│ │ │ ├── local/
│ │ │ │ └── preprocess.sh
│ │ │ └── run.sh
│ │ └── voc5/
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── default.yaml
│ │ └── run.sh
│ ├── voxceleb/
│ │ ├── README.md
│ │ └── sv0/
│ │ ├── README.md
│ │ ├── RESULT.md
│ │ ├── conf/
│ │ │ ├── ecapa_tdnn.yaml
│ │ │ └── ecapa_tdnn_small.yaml
│ │ ├── local/
│ │ │ ├── convert.sh
│ │ │ ├── data.sh
│ │ │ ├── data_prepare.py
│ │ │ ├── emb.sh
│ │ │ ├── make_rirs_noise_csv_dataset_from_json.py
│ │ │ ├── make_vox_csv_dataset_from_json.py
│ │ │ ├── make_voxceleb_kaldi_trial.py
│ │ │ ├── test.sh
│ │ │ └── train.sh
│ │ ├── path.sh
│ │ └── run.sh
│ ├── wenetspeech/
│ │ ├── README.md
│ │ ├── asr0/
│ │ │ └── RESULTS.md
│ │ └── asr1/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── RESULTS.md
│ │ ├── conf/
│ │ │ ├── chunk_conformer.yaml
│ │ │ ├── chunk_conformer_u2pp.yaml
│ │ │ ├── conformer.yaml
│ │ │ ├── preprocess.yaml
│ │ │ └── tuning/
│ │ │ ├── chunk_decode.yaml
│ │ │ └── decode.yaml
│ │ ├── local/
│ │ │ ├── data.sh
│ │ │ ├── export.sh
│ │ │ ├── extract_meta.py
│ │ │ ├── process_opus.py
│ │ │ ├── quant.sh
│ │ │ ├── test.sh
│ │ │ ├── test_wav.sh
│ │ │ ├── train.sh
│ │ │ └── wenetspeech_data_prep.sh
│ │ ├── path.sh
│ │ └── run.sh
│ └── zh_en_tts/
│ └── tts3/
│ ├── .gitignore
│ ├── README.md
│ ├── conf/
│ │ └── default.yaml
│ ├── local/
│ │ ├── inference.sh
│ │ ├── mfa_download.sh
│ │ ├── model_download.sh
│ │ ├── ort_predict.sh
│ │ ├── preprocess.sh
│ │ ├── synthesize.sh
│ │ └── synthesize_e2e.sh
│ └── run.sh
├── paddlespeech/
│ ├── __init__.py
│ ├── audio/
│ │ ├── .gitignore
│ │ ├── __init__.py
│ │ ├── backends/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ └── soundfile_backend.py
│ │ ├── compliance/
│ │ │ ├── __init__.py
│ │ │ ├── kaldi.py
│ │ │ └── librosa.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ ├── esc50.py
│ │ │ └── voxceleb.py
│ │ ├── functional/
│ │ │ ├── __init__.py
│ │ │ ├── functional.py
│ │ │ └── window.py
│ │ ├── streamdata/
│ │ │ ├── __init__.py
│ │ │ ├── autodecode.py
│ │ │ ├── cache.py
│ │ │ ├── compat.py
│ │ │ ├── extradatasets.py
│ │ │ ├── filters.py
│ │ │ ├── gopen.py
│ │ │ ├── handlers.py
│ │ │ ├── mix.py
│ │ │ ├── paddle_utils.py
│ │ │ ├── pipeline.py
│ │ │ ├── shardlists.py
│ │ │ ├── soundfile.py
│ │ │ ├── tariterators.py
│ │ │ ├── utils.py
│ │ │ └── writer.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ ├── text_featurizer.py
│ │ │ └── utility.py
│ │ ├── transform/
│ │ │ ├── __init__.py
│ │ │ ├── add_deltas.py
│ │ │ ├── channel_selector.py
│ │ │ ├── cmvn.py
│ │ │ ├── functional.py
│ │ │ ├── perturb.py
│ │ │ ├── spec_augment.py
│ │ │ ├── spectrogram.py
│ │ │ ├── transform_interface.py
│ │ │ ├── transformation.py
│ │ │ └── wpe.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── check_kwargs.py
│ │ ├── download.py
│ │ ├── dynamic_import.py
│ │ ├── error.py
│ │ ├── log.py
│ │ ├── numeric.py
│ │ ├── tensor_utils.py
│ │ └── time.py
│ ├── audiotools/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── _julius.py
│ │ │ ├── audio_signal.py
│ │ │ ├── display.py
│ │ │ ├── dsp.py
│ │ │ ├── effects.py
│ │ │ ├── ffmpeg.py
│ │ │ ├── loudness.py
│ │ │ └── util.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── datasets.py
│ │ │ ├── preprocess.py
│ │ │ └── transforms.py
│ │ ├── metrics/
│ │ │ ├── __init__.py
│ │ │ └── quality.py
│ │ ├── ml/
│ │ │ ├── __init__.py
│ │ │ ├── accelerator.py
│ │ │ ├── basemodel.py
│ │ │ └── decorators.py
│ │ └── post.py
│ ├── cli/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── __init__.py
│ │ ├── asr/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── base_commands.py
│ │ ├── cls/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── download.py
│ │ ├── entry.py
│ │ ├── executor.py
│ │ ├── kws/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── log.py
│ │ ├── ssl/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── st/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── tts/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ ├── utils.py
│ │ ├── vector/
│ │ │ ├── __init__.py
│ │ │ └── infer.py
│ │ └── whisper/
│ │ ├── __init__.py
│ │ └── infer.py
│ ├── cls/
│ │ ├── __init__.py
│ │ ├── exps/
│ │ │ ├── __init__.py
│ │ │ └── panns/
│ │ │ ├── __init__.py
│ │ │ ├── deploy/
│ │ │ │ ├── __init__.py
│ │ │ │ └── predict.py
│ │ │ ├── export_model.py
│ │ │ ├── predict.py
│ │ │ └── train.py
│ │ └── models/
│ │ ├── __init__.py
│ │ └── panns/
│ │ ├── __init__.py
│ │ ├── classifier.py
│ │ └── panns.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── aidatatang_200zh/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── aidatatang_200zh.py
│ │ ├── aishell/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── aishell.py
│ │ ├── download.py
│ │ └── s2t/
│ │ ├── __init__.py
│ │ ├── avg_model.py
│ │ ├── build_vocab.py
│ │ ├── compute_mean_std.py
│ │ ├── compute_wer.py
│ │ ├── format_data.py
│ │ └── format_rsl.py
│ ├── kws/
│ │ ├── __init__.py
│ │ ├── exps/
│ │ │ ├── __init__.py
│ │ │ └── mdtc/
│ │ │ ├── __init__.py
│ │ │ ├── collate.py
│ │ │ ├── compute_det.py
│ │ │ ├── plot_det_curve.py
│ │ │ ├── score.py
│ │ │ └── train.py
│ │ └── models/
│ │ ├── __init__.py
│ │ ├── loss.py
│ │ └── mdtc.py
│ ├── resource/
│ │ ├── __init__.py
│ │ ├── model_alias.py
│ │ ├── pretrained_models.py
│ │ └── resource.py
│ ├── s2t/
│ │ ├── __init__.py
│ │ ├── decoders/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── beam_search/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── batch_beam_search.py
│ │ │ │ └── beam_search.py
│ │ │ ├── ctcdecoder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── decoders_deprecated.py
│ │ │ │ ├── scorer_deprecated.py
│ │ │ │ ├── swig_wrapper.py
│ │ │ │ └── tests/
│ │ │ │ └── test_decoders.py
│ │ │ ├── recog.py
│ │ │ ├── recog_bin.py
│ │ │ ├── scorers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ctc.py
│ │ │ │ ├── ctc_prefix_score.py
│ │ │ │ ├── length_bonus.py
│ │ │ │ ├── ngram.py
│ │ │ │ └── scorer_interface.py
│ │ │ └── utils.py
│ │ ├── exps/
│ │ │ ├── __init__.py
│ │ │ ├── deepspeech2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── deploy/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── client.py
│ │ │ │ │ │ ├── record.py
│ │ │ │ │ │ ├── runtime.py
│ │ │ │ │ │ ├── send.py
│ │ │ │ │ │ └── server.py
│ │ │ │ │ ├── export.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ ├── test_export.py
│ │ │ │ │ ├── test_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ ├── hubert/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ ├── test_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ ├── lm/
│ │ │ │ └── transformer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── cacu_perplexity.py
│ │ │ │ └── lm_cacu_perplexity.py
│ │ │ ├── u2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── alignment.py
│ │ │ │ │ ├── export.py
│ │ │ │ │ ├── quant.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ ├── test_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ ├── model.py
│ │ │ │ └── trainer.py
│ │ │ ├── u2_kaldi/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── recog.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ ├── u2_st/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── export.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ ├── wav2vec2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ ├── test_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ ├── wavlm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bin/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ ├── test_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ └── model.py
│ │ │ └── whisper/
│ │ │ └── test_wav.py
│ │ ├── frontend/
│ │ │ ├── __init__.py
│ │ │ ├── audio.py
│ │ │ ├── augmentor/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── augmentation.py
│ │ │ │ ├── base.py
│ │ │ │ ├── impulse_response.py
│ │ │ │ ├── noise_perturb.py
│ │ │ │ ├── online_bayesian_normalization.py
│ │ │ │ ├── resample.py
│ │ │ │ ├── shift_perturb.py
│ │ │ │ ├── spec_augment.py
│ │ │ │ ├── speed_perturb.py
│ │ │ │ └── volume_perturb.py
│ │ │ ├── featurizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── audio_featurizer.py
│ │ │ │ ├── speech_featurizer.py
│ │ │ │ └── text_featurizer.py
│ │ │ ├── normalizer.py
│ │ │ ├── speech.py
│ │ │ └── utility.py
│ │ ├── io/
│ │ │ ├── __init__.py
│ │ │ ├── batchfy.py
│ │ │ ├── collator.py
│ │ │ ├── converter.py
│ │ │ ├── dataloader.py
│ │ │ ├── dataset.py
│ │ │ ├── reader.py
│ │ │ ├── sampler.py
│ │ │ ├── speechbrain/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── batch.py
│ │ │ │ ├── data_pipeline.py
│ │ │ │ ├── data_utils.py
│ │ │ │ ├── dataio.py
│ │ │ │ ├── dataloader.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── depgraph.py
│ │ │ │ ├── make_dataloader.py
│ │ │ │ ├── sampler.py
│ │ │ │ └── sb_pipeline.py
│ │ │ └── utility.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── asr_interface.py
│ │ │ ├── ds2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conv.py
│ │ │ │ └── deepspeech2.py
│ │ │ ├── hubert/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hubert_ASR.py
│ │ │ │ └── modules/
│ │ │ │ ├── __init__.py
│ │ │ │ └── hubert_model.py
│ │ │ ├── lm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset.py
│ │ │ │ └── transformer.py
│ │ │ ├── lm_interface.py
│ │ │ ├── st_interface.py
│ │ │ ├── u2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── u2.py
│ │ │ │ └── updater.py
│ │ │ ├── u2_st/
│ │ │ │ ├── __init__.py
│ │ │ │ └── u2_st.py
│ │ │ ├── wav2vec2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── VanillaNN.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── activations.py
│ │ │ │ │ ├── containers.py
│ │ │ │ │ ├── linear.py
│ │ │ │ │ ├── modeling_outputs.py
│ │ │ │ │ ├── modeling_wav2vec2.py
│ │ │ │ │ ├── normalization.py
│ │ │ │ │ └── wav2vec2_model.py
│ │ │ │ ├── processing/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── signal_processing.py
│ │ │ │ │ └── speech_augmentation.py
│ │ │ │ └── wav2vec2_ASR.py
│ │ │ ├── wavlm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── activations.py
│ │ │ │ │ ├── functional.py
│ │ │ │ │ └── modules.py
│ │ │ │ ├── wavlm_asr.py
│ │ │ │ └── wavlm_paddle.py
│ │ │ └── whisper/
│ │ │ ├── __init__.py
│ │ │ ├── tokenizer.py
│ │ │ ├── utils.py
│ │ │ ├── whisper.py
│ │ │ └── whisper_LICENSE
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── align.py
│ │ │ ├── attention.py
│ │ │ ├── cmvn.py
│ │ │ ├── conformer_convolution.py
│ │ │ ├── conv2d.py
│ │ │ ├── crf.py
│ │ │ ├── ctc.py
│ │ │ ├── decoder.py
│ │ │ ├── decoder_layer.py
│ │ │ ├── embedding.py
│ │ │ ├── encoder.py
│ │ │ ├── encoder_layer.py
│ │ │ ├── fbank.py
│ │ │ ├── initializer.py
│ │ │ ├── loss.py
│ │ │ ├── mask.py
│ │ │ ├── positionwise_feed_forward.py
│ │ │ ├── subsampling.py
│ │ │ └── time_reduction.py
│ │ ├── training/
│ │ │ ├── __init__.py
│ │ │ ├── cli.py
│ │ │ ├── extensions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── extension.py
│ │ │ │ ├── plot.py
│ │ │ │ ├── snapshot.py
│ │ │ │ └── visualizer.py
│ │ │ ├── optimizer/
│ │ │ │ ├── __init__.py
│ │ │ │ └── adadelta.py
│ │ │ ├── reporter.py
│ │ │ ├── scheduler.py
│ │ │ ├── timer.py
│ │ │ ├── trainer.py
│ │ │ ├── triggers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── compare_value_trigger.py
│ │ │ │ ├── interval_trigger.py
│ │ │ │ ├── limit_trigger.py
│ │ │ │ ├── time_trigger.py
│ │ │ │ └── utils.py
│ │ │ └── updaters/
│ │ │ ├── __init__.py
│ │ │ ├── standard_updater.py
│ │ │ ├── trainer.py
│ │ │ └── updater.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── asr_utils.py
│ │ ├── bleu_score.py
│ │ ├── check_kwargs.py
│ │ ├── checkpoint.py
│ │ ├── cli_readers.py
│ │ ├── cli_utils.py
│ │ ├── cli_writers.py
│ │ ├── ctc_utils.py
│ │ ├── dynamic_import.py
│ │ ├── dynamic_pip_install.py
│ │ ├── error_rate.py
│ │ ├── layer_tools.py
│ │ ├── log.py
│ │ ├── mp_tools.py
│ │ ├── profiler.py
│ │ ├── socket_server.py
│ │ ├── spec_augment.py
│ │ ├── tensor_utils.py
│ │ ├── text_grid.py
│ │ └── utility.py
│ ├── server/
│ │ ├── README.md
│ │ ├── README_cn.md
│ │ ├── __init__.py
│ │ ├── base_commands.py
│ │ ├── bin/
│ │ │ ├── __init__.py
│ │ │ ├── paddlespeech_client.py
│ │ │ └── paddlespeech_server.py
│ │ ├── conf/
│ │ │ ├── application.yaml
│ │ │ ├── tts_online_application.yaml
│ │ │ ├── vector_application.yaml
│ │ │ ├── ws_conformer_application.yaml
│ │ │ ├── ws_conformer_wenetspeech_application_faster.yaml
│ │ │ └── ws_ds2_application.yaml
│ │ ├── engine/
│ │ │ ├── __init__.py
│ │ │ ├── acs/
│ │ │ │ ├── __init__.py
│ │ │ │ └── python/
│ │ │ │ ├── __init__.py
│ │ │ │ └── acs_engine.py
│ │ │ ├── asr/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── online/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── ctc_endpoint.py
│ │ │ │ │ ├── ctc_search.py
│ │ │ │ │ ├── onnx/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── asr_engine.py
│ │ │ │ │ ├── paddleinference/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── asr_engine.py
│ │ │ │ │ └── python/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── asr_engine.py
│ │ │ │ ├── paddleinference/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── asr_engine.py
│ │ │ │ └── python/
│ │ │ │ ├── __init__.py
│ │ │ │ └── asr_engine.py
│ │ │ ├── base_engine.py
│ │ │ ├── cls/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── paddleinference/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── cls_engine.py
│ │ │ │ └── python/
│ │ │ │ ├── __init__.py
│ │ │ │ └── cls_engine.py
│ │ │ ├── engine_factory.py
│ │ │ ├── engine_pool.py
│ │ │ ├── engine_warmup.py
│ │ │ ├── text/
│ │ │ │ ├── __init__.py
│ │ │ │ └── python/
│ │ │ │ ├── __init__.py
│ │ │ │ └── text_engine.py
│ │ │ ├── tts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── online/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── onnx/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── tts_engine.py
│ │ │ │ │ └── python/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── tts_engine.py
│ │ │ │ ├── paddleinference/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── tts_engine.py
│ │ │ │ └── python/
│ │ │ │ ├── __init__.py
│ │ │ │ └── tts_engine.py
│ │ │ └── vector/
│ │ │ ├── __init__.py
│ │ │ └── python/
│ │ │ ├── __init__.py
│ │ │ └── vector_engine.py
│ │ ├── entry.py
│ │ ├── executor.py
│ │ ├── restful/
│ │ │ ├── __init__.py
│ │ │ ├── acs_api.py
│ │ │ ├── api.py
│ │ │ ├── asr_api.py
│ │ │ ├── cls_api.py
│ │ │ ├── request.py
│ │ │ ├── response.py
│ │ │ ├── text_api.py
│ │ │ ├── tts_api.py
│ │ │ └── vector_api.py
│ │ ├── tests/
│ │ │ ├── __init__.py
│ │ │ ├── asr/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── offline/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── http_client.py
│ │ │ │ └── online/
│ │ │ │ ├── README.md
│ │ │ │ ├── README_cn.md
│ │ │ │ └── microphone_client.py
│ │ │ ├── text/
│ │ │ │ └── http_client.py
│ │ │ └── tts/
│ │ │ ├── offline/
│ │ │ │ └── http_client.py
│ │ │ └── online/
│ │ │ ├── http_client.py
│ │ │ └── ws_client.py
│ │ ├── util.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── audio_handler.py
│ │ │ ├── audio_process.py
│ │ │ ├── buffer.py
│ │ │ ├── config.py
│ │ │ ├── errors.py
│ │ │ ├── exception.py
│ │ │ ├── onnx_infer.py
│ │ │ ├── paddle_predictor.py
│ │ │ ├── util.py
│ │ │ └── vad.py
│ │ └── ws/
│ │ ├── __init__.py
│ │ ├── api.py
│ │ ├── asr_api.py
│ │ └── tts_api.py
│ ├── t2s/
│ │ ├── __init__.py
│ │ ├── assets/
│ │ │ ├── __init__.py
│ │ │ ├── csmsc_test.txt
│ │ │ ├── sentences.txt
│ │ │ ├── sentences_canton.txt
│ │ │ ├── sentences_en.txt
│ │ │ ├── sentences_mix.txt
│ │ │ ├── sentences_sing.txt
│ │ │ └── sentences_ssml.txt
│ │ ├── audio/
│ │ │ ├── __init__.py
│ │ │ ├── audio.py
│ │ │ ├── codec.py
│ │ │ └── spec_normalizer.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── am_batch_fn.py
│ │ │ ├── batch.py
│ │ │ ├── data_table.py
│ │ │ ├── dataset.py
│ │ │ ├── get_feats.py
│ │ │ ├── ljspeech.py
│ │ │ ├── preprocess_utils.py
│ │ │ ├── sampler.py
│ │ │ └── vocoder_batch_fn.py
│ │ ├── exps/
│ │ │ ├── PTQ_dynamic.py
│ │ │ ├── PTQ_static.py
│ │ │ ├── __init__.py
│ │ │ ├── diffsinger/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gen_gta_mel.py
│ │ │ │ ├── get_minmax.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ └── train.py
│ │ │ ├── dygraph_to_static.py
│ │ │ ├── ernie_sat/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── align.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize.py
│ │ │ │ ├── synthesize_e2e.py
│ │ │ │ ├── train.py
│ │ │ │ └── utils.py
│ │ │ ├── fastspeech2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gen_gta_mel.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── train.py
│ │ │ │ └── vc2_infer.py
│ │ │ ├── gan_vocoder/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hifigan/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── train.py
│ │ │ │ ├── multi_band_melgan/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── train.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── parallelwave_gan/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── synthesize_from_wav.py
│ │ │ │ │ └── train.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── style_melgan/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── train.py
│ │ │ │ └── synthesize.py
│ │ │ ├── inference.py
│ │ │ ├── inference_streaming.py
│ │ │ ├── jets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── inference.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize.py
│ │ │ │ ├── synthesize_e2e.py
│ │ │ │ └── train.py
│ │ │ ├── lite_predict.py
│ │ │ ├── lite_predict_streaming.py
│ │ │ ├── lite_syn_utils.py
│ │ │ ├── ort_predict.py
│ │ │ ├── ort_predict_e2e.py
│ │ │ ├── ort_predict_streaming.py
│ │ │ ├── speedyspeech/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gen_gta_mel.py
│ │ │ │ ├── inference.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize_e2e.py
│ │ │ │ └── train.py
│ │ │ ├── starganv2_vc/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── train.py
│ │ │ │ └── vc.py
│ │ │ ├── stream_play_tts.py
│ │ │ ├── syn_utils.py
│ │ │ ├── synthesize.py
│ │ │ ├── synthesize_e2e.py
│ │ │ ├── synthesize_streaming.py
│ │ │ ├── tacotron2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── preprocess.py
│ │ │ │ └── train.py
│ │ │ ├── transformer_tts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize.py
│ │ │ │ ├── synthesize_e2e.py
│ │ │ │ └── train.py
│ │ │ ├── vits/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── inference.py
│ │ │ │ ├── lite_predict.py
│ │ │ │ ├── normalize.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize.py
│ │ │ │ ├── synthesize_e2e.py
│ │ │ │ ├── train.py
│ │ │ │ └── voice_cloning.py
│ │ │ ├── voice_cloning.py
│ │ │ ├── waveflow/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── ljspeech.py
│ │ │ │ ├── preprocess.py
│ │ │ │ ├── synthesize.py
│ │ │ │ └── train.py
│ │ │ └── wavernn/
│ │ │ ├── __init__.py
│ │ │ ├── synthesize.py
│ │ │ └── train.py
│ │ ├── frontend/
│ │ │ ├── __init__.py
│ │ │ ├── arpabet.py
│ │ │ ├── canton_frontend.py
│ │ │ ├── en_frontend.py
│ │ │ ├── g2pw/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── onnx_api.py
│ │ │ │ └── utils.py
│ │ │ ├── generate_lexicon.py
│ │ │ ├── mix_frontend.py
│ │ │ ├── normalizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── abbrrviation.py
│ │ │ │ ├── acronyms.py
│ │ │ │ ├── normalizer.py
│ │ │ │ ├── numbers.py
│ │ │ │ └── width.py
│ │ │ ├── phonectic.py
│ │ │ ├── polyphonic.py
│ │ │ ├── polyphonic.yaml
│ │ │ ├── punctuation.py
│ │ │ ├── rhy_prediction/
│ │ │ │ ├── __init__.py
│ │ │ │ └── rhy_predictor.py
│ │ │ ├── sing_frontend.py
│ │ │ ├── ssml/
│ │ │ │ ├── __init__.py
│ │ │ │ └── xml_processor.py
│ │ │ ├── tone_sandhi.py
│ │ │ ├── vocab.py
│ │ │ ├── zh_frontend.py
│ │ │ └── zh_normalization/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── char_convert.py
│ │ │ ├── chronology.py
│ │ │ ├── constants.py
│ │ │ ├── num.py
│ │ │ ├── phonecode.py
│ │ │ ├── quantifier.py
│ │ │ └── text_normlization.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── diffsinger/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── diffsinger.py
│ │ │ │ ├── diffsinger_updater.py
│ │ │ │ └── fastspeech2midi.py
│ │ │ ├── ernie_sat/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ernie_sat.py
│ │ │ │ └── ernie_sat_updater.py
│ │ │ ├── fastspeech2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fastspeech2.py
│ │ │ │ └── fastspeech2_updater.py
│ │ │ ├── hifigan/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── hifigan.py
│ │ │ │ └── hifigan_updater.py
│ │ │ ├── jets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alignments.py
│ │ │ │ ├── generator.py
│ │ │ │ ├── jets.py
│ │ │ │ ├── jets_updater.py
│ │ │ │ └── length_regulator.py
│ │ │ ├── melgan/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── melgan.py
│ │ │ │ ├── multi_band_melgan_updater.py
│ │ │ │ ├── style_melgan.py
│ │ │ │ └── style_melgan_updater.py
│ │ │ ├── parallel_wavegan/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── parallel_wavegan.py
│ │ │ │ └── parallel_wavegan_updater.py
│ │ │ ├── speedyspeech/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── speedyspeech.py
│ │ │ │ └── speedyspeech_updater.py
│ │ │ ├── starganv2_vc/
│ │ │ │ ├── AuxiliaryASR/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── config.yml
│ │ │ │ │ ├── layers.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── JDCNet/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── model.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── losses.py
│ │ │ │ ├── starganv2_vc.py
│ │ │ │ ├── starganv2_vc_updater.py
│ │ │ │ └── transforms.py
│ │ │ ├── tacotron2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── tacotron2.py
│ │ │ │ └── tacotron2_updater.py
│ │ │ ├── transformer_tts/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── transformer_tts.py
│ │ │ │ └── transformer_tts_updater.py
│ │ │ ├── vits/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── duration_predictor.py
│ │ │ │ ├── flow.py
│ │ │ │ ├── generator.py
│ │ │ │ ├── monotonic_align/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── core.pyx
│ │ │ │ │ └── setup.py
│ │ │ │ ├── posterior_encoder.py
│ │ │ │ ├── residual_coupling.py
│ │ │ │ ├── text_encoder.py
│ │ │ │ ├── transform.py
│ │ │ │ ├── vits.py
│ │ │ │ ├── vits_updater.py
│ │ │ │ └── wavenet/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── residual_block.py
│ │ │ │ └── wavenet.py
│ │ │ ├── waveflow.py
│ │ │ └── wavernn/
│ │ │ ├── __init__.py
│ │ │ ├── wavernn.py
│ │ │ └── wavernn_updater.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── adversarial_loss/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gradient_reversal.py
│ │ │ │ └── speaker_classifier.py
│ │ │ ├── causal_conv.py
│ │ │ ├── conformer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── convolution.py
│ │ │ │ └── encoder_layer.py
│ │ │ ├── conv.py
│ │ │ ├── diffnet.py
│ │ │ ├── diffusion.py
│ │ │ ├── fftconv1d.py
│ │ │ ├── geometry.py
│ │ │ ├── layer_norm.py
│ │ │ ├── losses.py
│ │ │ ├── masked_fill.py
│ │ │ ├── nets_utils.py
│ │ │ ├── normalizer.py
│ │ │ ├── positional_encoding.py
│ │ │ ├── pqmf.py
│ │ │ ├── predictor/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── duration_predictor.py
│ │ │ │ ├── length_regulator.py
│ │ │ │ └── variance_predictor.py
│ │ │ ├── residual_block.py
│ │ │ ├── residual_stack.py
│ │ │ ├── style_encoder.py
│ │ │ ├── tacotron2/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attentions.py
│ │ │ │ ├── decoder.py
│ │ │ │ └── encoder.py
│ │ │ ├── tade_res_block.py
│ │ │ ├── transformer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention.py
│ │ │ │ ├── decoder.py
│ │ │ │ ├── decoder_layer.py
│ │ │ │ ├── embedding.py
│ │ │ │ ├── encoder.py
│ │ │ │ ├── encoder_layer.py
│ │ │ │ ├── lightconv.py
│ │ │ │ ├── mask.py
│ │ │ │ ├── multi_layer_conv.py
│ │ │ │ ├── positionwise_feed_forward.py
│ │ │ │ ├── repeat.py
│ │ │ │ └── subsampling.py
│ │ │ ├── upsample.py
│ │ │ └── wavenet_denoiser.py
│ │ ├── training/
│ │ │ ├── __init__.py
│ │ │ ├── cli.py
│ │ │ ├── default_config.py
│ │ │ ├── experiment.py
│ │ │ ├── extension.py
│ │ │ ├── extensions/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── evaluator.py
│ │ │ │ ├── snapshot.py
│ │ │ │ └── visualizer.py
│ │ │ ├── optimizer.py
│ │ │ ├── reporter.py
│ │ │ ├── seeding.py
│ │ │ ├── trainer.py
│ │ │ ├── trigger.py
│ │ │ ├── triggers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── interval_trigger.py
│ │ │ │ ├── limit_trigger.py
│ │ │ │ └── time_trigger.py
│ │ │ ├── updater.py
│ │ │ └── updaters/
│ │ │ ├── __init__.py
│ │ │ └── standard_updater.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── checkpoint.py
│ │ ├── display.py
│ │ ├── error_rate.py
│ │ ├── h5_utils.py
│ │ ├── internals.py
│ │ ├── layer_tools.py
│ │ ├── mp_tools.py
│ │ ├── profiler.py
│ │ └── scheduler.py
│ ├── text/
│ │ ├── __init__.py
│ │ ├── exps/
│ │ │ ├── __init__.py
│ │ │ └── ernie_linear/
│ │ │ ├── __init__.py
│ │ │ ├── avg_model.py
│ │ │ ├── punc_restore.py
│ │ │ ├── test.py
│ │ │ └── train.py
│ │ └── models/
│ │ ├── __init__.py
│ │ ├── ernie_crf/
│ │ │ ├── __init__.py
│ │ │ └── model.py
│ │ └── ernie_linear/
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ ├── ernie_linear.py
│ │ └── ernie_linear_updater.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── argparse.py
│ │ ├── dynamic_import.py
│ │ ├── env.py
│ │ └── initialize.py
│ └── vector/
│ ├── __init__.py
│ ├── cluster/
│ │ ├── __init__.py
│ │ ├── diarization.py
│ │ └── plda.py
│ ├── exps/
│ │ ├── __init__.py
│ │ ├── ecapa_tdnn/
│ │ │ ├── extract_emb.py
│ │ │ ├── test.py
│ │ │ └── train.py
│ │ └── ge2e/
│ │ ├── __init__.py
│ │ ├── audio_processor.py
│ │ ├── config.py
│ │ ├── dataset_processors.py
│ │ ├── inference.py
│ │ ├── preprocess.py
│ │ ├── random_cycle.py
│ │ ├── speaker_verification_dataset.py
│ │ └── train.py
│ ├── io/
│ │ ├── __init__.py
│ │ ├── augment.py
│ │ ├── batch.py
│ │ ├── dataset.py
│ │ ├── dataset_from_json.py
│ │ ├── embedding_norm.py
│ │ └── signal_processing.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── ecapa_tdnn.py
│ │ └── lstm_speaker_encoder.py
│ ├── modules/
│ │ ├── __init__.py
│ │ ├── layer.py
│ │ ├── loss.py
│ │ └── sid_model.py
│ ├── training/
│ │ ├── __init__.py
│ │ ├── scheduler.py
│ │ └── seeding.py
│ └── utils/
│ ├── __init__.py
│ ├── time.py
│ └── vector_utils.py
├── runtime/
│ ├── .clang-format
│ ├── .gitignore
│ ├── CMakeLists.txt
│ ├── README.md
│ ├── build.sh
│ ├── build_android.sh
│ ├── build_ios.sh
│ ├── cmake/
│ │ ├── EnableCMP0048.cmake
│ │ ├── EnableCMP0077.cmake
│ │ ├── FindGFortranLibs.cmake
│ │ ├── absl.cmake
│ │ ├── boost.cmake
│ │ ├── eigen.cmake
│ │ ├── fastdeploy.cmake
│ │ ├── gflags.cmake
│ │ ├── glog.cmake
│ │ ├── gtest.cmake
│ │ ├── kenlm.cmake
│ │ ├── libsndfile.cmake
│ │ ├── openblas.cmake
│ │ ├── openfst.cmake
│ │ ├── paddleinference.cmake
│ │ ├── pybind.cmake
│ │ ├── summary.cmake
│ │ └── system.cmake
│ ├── docker/
│ │ └── .gitkeep
│ ├── engine/
│ │ ├── CMakeLists.txt
│ │ ├── asr/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── decoder/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── common.h
│ │ │ │ ├── ctc_beam_search_opt.h
│ │ │ │ ├── ctc_prefix_beam_search_decoder.cc
│ │ │ │ ├── ctc_prefix_beam_search_decoder.h
│ │ │ │ ├── ctc_prefix_beam_search_decoder_main.cc
│ │ │ │ ├── ctc_prefix_beam_search_score.h
│ │ │ │ ├── ctc_tlg_decoder.cc
│ │ │ │ ├── ctc_tlg_decoder.h
│ │ │ │ ├── ctc_tlg_decoder_main.cc
│ │ │ │ ├── decoder_itf.h
│ │ │ │ └── param.h
│ │ │ ├── nnet/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── decodable.cc
│ │ │ │ ├── decodable.h
│ │ │ │ ├── nnet_itf.h
│ │ │ │ ├── nnet_producer.cc
│ │ │ │ ├── nnet_producer.h
│ │ │ │ ├── u2_nnet.cc
│ │ │ │ ├── u2_nnet.h
│ │ │ │ ├── u2_nnet_main.cc
│ │ │ │ ├── u2_nnet_thread_main.cc
│ │ │ │ ├── u2_onnx_nnet.cc
│ │ │ │ └── u2_onnx_nnet.h
│ │ │ ├── recognizer/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── recognizer.cc
│ │ │ │ ├── recognizer.h
│ │ │ │ ├── recognizer_batch_main.cc
│ │ │ │ ├── recognizer_batch_main2.cc
│ │ │ │ ├── recognizer_controller.cc
│ │ │ │ ├── recognizer_controller.h
│ │ │ │ ├── recognizer_controller_impl.cc
│ │ │ │ ├── recognizer_controller_impl.h
│ │ │ │ ├── recognizer_instance.cc
│ │ │ │ ├── recognizer_instance.h
│ │ │ │ ├── recognizer_main.cc
│ │ │ │ └── recognizer_resource.h
│ │ │ └── server/
│ │ │ ├── CMakeLists.txt
│ │ │ └── websocket/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── websocket_client.cc
│ │ │ ├── websocket_client.h
│ │ │ ├── websocket_client_main.cc
│ │ │ ├── websocket_server.cc
│ │ │ ├── websocket_server.h
│ │ │ └── websocket_server_main.cc
│ │ ├── audio_classification/
│ │ │ ├── CMakeLists.txt
│ │ │ └── nnet/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── panns_interface.cc
│ │ │ ├── panns_interface.h
│ │ │ ├── panns_nnet.cc
│ │ │ ├── panns_nnet.h
│ │ │ └── panns_nnet_main.cc
│ │ ├── codelab/
│ │ │ ├── CMakeLists.txt
│ │ │ └── README.md
│ │ ├── common/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── base/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── basic_types.h
│ │ │ │ ├── common.h
│ │ │ │ ├── config.h
│ │ │ │ ├── flags.h.in
│ │ │ │ ├── glog_utils.cc
│ │ │ │ ├── glog_utils.h
│ │ │ │ ├── log.h.in
│ │ │ │ ├── log_impl.cc
│ │ │ │ ├── log_impl.h
│ │ │ │ ├── macros.h
│ │ │ │ ├── safe_queue.h
│ │ │ │ ├── safe_queue_inl.h
│ │ │ │ └── thread_pool.h
│ │ │ ├── frontend/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── assembler.cc
│ │ │ │ ├── assembler.h
│ │ │ │ ├── audio_cache.cc
│ │ │ │ ├── audio_cache.h
│ │ │ │ ├── cmvn.cc
│ │ │ │ ├── cmvn.h
│ │ │ │ ├── compute_fbank_main.cc
│ │ │ │ ├── compute_linear_spectrogram_main.cc
│ │ │ │ ├── data_cache.h
│ │ │ │ ├── db_norm.cc
│ │ │ │ ├── db_norm.h
│ │ │ │ ├── fbank.h
│ │ │ │ ├── feature-fbank.cc
│ │ │ │ ├── feature-fbank.h
│ │ │ │ ├── feature-functions.cc
│ │ │ │ ├── feature-functions.h
│ │ │ │ ├── feature-window.cc
│ │ │ │ ├── feature-window.h
│ │ │ │ ├── feature_cache.cc
│ │ │ │ ├── feature_cache.h
│ │ │ │ ├── feature_common.h
│ │ │ │ ├── feature_common_inl.h
│ │ │ │ ├── feature_pipeline.cc
│ │ │ │ ├── feature_pipeline.h
│ │ │ │ ├── fftsg.c
│ │ │ │ ├── frontend_itf.h
│ │ │ │ ├── linear_spectrogram.cc
│ │ │ │ ├── linear_spectrogram.h
│ │ │ │ ├── mel-computations.cc
│ │ │ │ ├── mel-computations.h
│ │ │ │ ├── normalizer.h
│ │ │ │ ├── rfft.cc
│ │ │ │ ├── rfft.h
│ │ │ │ ├── wave-reader.cc
│ │ │ │ └── wave-reader.h
│ │ │ ├── matrix/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── kaldi-matrix-inl.h
│ │ │ │ ├── kaldi-matrix.cc
│ │ │ │ ├── kaldi-matrix.h
│ │ │ │ ├── kaldi-vector-inl.h
│ │ │ │ ├── kaldi-vector.cc
│ │ │ │ ├── kaldi-vector.h
│ │ │ │ └── matrix-common.h
│ │ │ └── utils/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── audio_process.cc
│ │ │ ├── audio_process.h
│ │ │ ├── blank_process_test.cc
│ │ │ ├── file_utils.cc
│ │ │ ├── file_utils.h
│ │ │ ├── math.cc
│ │ │ ├── math.h
│ │ │ ├── picojson.h
│ │ │ ├── strings.cc
│ │ │ ├── strings.h
│ │ │ ├── strings_test.cc
│ │ │ ├── timer.cc
│ │ │ └── timer.h
│ │ ├── kaldi/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── base/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── io-funcs-inl.h
│ │ │ │ ├── io-funcs.cc
│ │ │ │ ├── io-funcs.h
│ │ │ │ ├── kaldi-common.h
│ │ │ │ ├── kaldi-error.cc
│ │ │ │ ├── kaldi-error.h
│ │ │ │ ├── kaldi-math.cc
│ │ │ │ ├── kaldi-math.h
│ │ │ │ ├── kaldi-types.h
│ │ │ │ ├── kaldi-utils.cc
│ │ │ │ ├── kaldi-utils.h
│ │ │ │ ├── timer.cc
│ │ │ │ ├── timer.h
│ │ │ │ └── version.h
│ │ │ ├── decoder/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── decodable-itf.h
│ │ │ │ ├── lattice-faster-decoder.cc
│ │ │ │ ├── lattice-faster-decoder.h
│ │ │ │ ├── lattice-faster-online-decoder.cc
│ │ │ │ └── lattice-faster-online-decoder.h
│ │ │ ├── fstbin/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── fstaddselfloops.cc
│ │ │ │ ├── fstdeterminizestar.cc
│ │ │ │ ├── fstisstochastic.cc
│ │ │ │ ├── fstminimizeencoded.cc
│ │ │ │ └── fsttablecompose.cc
│ │ │ ├── fstext/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── determinize-lattice-inl.h
│ │ │ │ ├── determinize-lattice.h
│ │ │ │ ├── determinize-star-inl.h
│ │ │ │ ├── determinize-star.h
│ │ │ │ ├── fstext-lib.h
│ │ │ │ ├── fstext-utils-inl.h
│ │ │ │ ├── fstext-utils.h
│ │ │ │ ├── kaldi-fst-io-inl.h
│ │ │ │ ├── kaldi-fst-io.cc
│ │ │ │ ├── kaldi-fst-io.h
│ │ │ │ ├── lattice-utils-inl.h
│ │ │ │ ├── lattice-utils.h
│ │ │ │ ├── lattice-weight.h
│ │ │ │ ├── pre-determinize-inl.h
│ │ │ │ ├── pre-determinize.h
│ │ │ │ ├── remove-eps-local-inl.h
│ │ │ │ ├── remove-eps-local.h
│ │ │ │ └── table-matcher.h
│ │ │ ├── lat/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── determinize-lattice-pruned.cc
│ │ │ │ ├── determinize-lattice-pruned.h
│ │ │ │ ├── kaldi-lattice.cc
│ │ │ │ ├── kaldi-lattice.h
│ │ │ │ ├── lattice-functions.cc
│ │ │ │ └── lattice-functions.h
│ │ │ ├── lm/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── arpa-file-parser.cc
│ │ │ │ ├── arpa-file-parser.h
│ │ │ │ ├── arpa-lm-compiler.cc
│ │ │ │ └── arpa-lm-compiler.h
│ │ │ ├── lmbin/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ └── arpa2fst.cc
│ │ │ └── util/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── basic-filebuf.h
│ │ │ ├── common-utils.h
│ │ │ ├── const-integer-set-inl.h
│ │ │ ├── const-integer-set.h
│ │ │ ├── edit-distance-inl.h
│ │ │ ├── edit-distance.h
│ │ │ ├── hash-list-inl.h
│ │ │ ├── hash-list.h
│ │ │ ├── kaldi-cygwin-io-inl.h
│ │ │ ├── kaldi-holder-inl.h
│ │ │ ├── kaldi-holder.cc
│ │ │ ├── kaldi-holder.h
│ │ │ ├── kaldi-io-inl.h
│ │ │ ├── kaldi-io.cc
│ │ │ ├── kaldi-io.h
│ │ │ ├── kaldi-pipebuf.h
│ │ │ ├── kaldi-semaphore.cc
│ │ │ ├── kaldi-semaphore.h
│ │ │ ├── kaldi-table-inl.h
│ │ │ ├── kaldi-table.cc
│ │ │ ├── kaldi-table.h
│ │ │ ├── kaldi-thread.cc
│ │ │ ├── kaldi-thread.h
│ │ │ ├── options-itf.h
│ │ │ ├── parse-options.cc
│ │ │ ├── parse-options.h
│ │ │ ├── simple-io-funcs.cc
│ │ │ ├── simple-io-funcs.h
│ │ │ ├── simple-options.cc
│ │ │ ├── simple-options.h
│ │ │ ├── stl-utils.h
│ │ │ ├── table-types.h
│ │ │ ├── text-utils.cc
│ │ │ └── text-utils.h
│ │ └── vad/
│ │ ├── CMakeLists.txt
│ │ ├── frontend/
│ │ │ └── wav.h
│ │ ├── interface/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── vad_interface.cc
│ │ │ ├── vad_interface.h
│ │ │ └── vad_interface_main.cc
│ │ └── nnet/
│ │ ├── CMakeLists.txt
│ │ ├── vad.cc
│ │ ├── vad.h
│ │ └── vad_nnet_main.cc
│ ├── examples/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── android/
│ │ │ └── VadJni/
│ │ │ ├── .gitignore
│ │ │ ├── app/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── build.gradle
│ │ │ │ ├── libs/
│ │ │ │ │ └── .gitkeep
│ │ │ │ ├── proguard-rules.pro
│ │ │ │ └── src/
│ │ │ │ ├── androidTest/
│ │ │ │ │ └── java/
│ │ │ │ │ └── com/
│ │ │ │ │ └── baidu/
│ │ │ │ │ └── paddlespeech/
│ │ │ │ │ └── vadjni/
│ │ │ │ │ └── ExampleInstrumentedTest.java
│ │ │ │ └── main/
│ │ │ │ ├── AndroidManifest.xml
│ │ │ │ ├── assets/
│ │ │ │ │ └── .gitkeep
│ │ │ │ ├── cpp/
│ │ │ │ │ ├── CMakeLists.txt
│ │ │ │ │ ├── native-lib.cpp
│ │ │ │ │ └── vad_interface.h
│ │ │ │ ├── java/
│ │ │ │ │ └── com/
│ │ │ │ │ └── baidu/
│ │ │ │ │ └── paddlespeech/
│ │ │ │ │ └── vadjni/
│ │ │ │ │ └── MainActivity.java
│ │ │ │ └── res/
│ │ │ │ ├── drawable/
│ │ │ │ │ └── ic_launcher_background.xml
│ │ │ │ ├── drawable-v24/
│ │ │ │ │ └── ic_launcher_foreground.xml
│ │ │ │ ├── layout/
│ │ │ │ │ └── activity_main.xml
│ │ │ │ ├── mipmap-anydpi-v26/
│ │ │ │ │ ├── ic_launcher.xml
│ │ │ │ │ └── ic_launcher_round.xml
│ │ │ │ ├── mipmap-anydpi-v33/
│ │ │ │ │ └── ic_launcher.xml
│ │ │ │ ├── values/
│ │ │ │ │ ├── colors.xml
│ │ │ │ │ ├── strings.xml
│ │ │ │ │ └── themes.xml
│ │ │ │ ├── values-night/
│ │ │ │ │ └── themes.xml
│ │ │ │ └── xml/
│ │ │ │ ├── backup_rules.xml
│ │ │ │ └── data_extraction_rules.xml
│ │ │ ├── build.gradle
│ │ │ ├── gradle/
│ │ │ │ └── wrapper/
│ │ │ │ ├── gradle-wrapper.jar
│ │ │ │ └── gradle-wrapper.properties
│ │ │ ├── gradle.properties
│ │ │ ├── gradlew
│ │ │ ├── gradlew.bat
│ │ │ └── settings.gradle
│ │ ├── audio_classification/
│ │ │ ├── README.md
│ │ │ ├── android_demo/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── app/
│ │ │ │ │ ├── .gitignore
│ │ │ │ │ ├── build.gradle
│ │ │ │ │ ├── proguard-rules.pro
│ │ │ │ │ └── src/
│ │ │ │ │ ├── androidTest/
│ │ │ │ │ │ └── java/
│ │ │ │ │ │ └── com/
│ │ │ │ │ │ └── example/
│ │ │ │ │ │ └── cls/
│ │ │ │ │ │ └── ExampleInstrumentedTest.kt
│ │ │ │ │ └── main/
│ │ │ │ │ ├── AndroidManifest.xml
│ │ │ │ │ ├── cpp/
│ │ │ │ │ │ ├── CMakeLists.txt
│ │ │ │ │ │ ├── includes/
│ │ │ │ │ │ │ └── panns_interface.h
│ │ │ │ │ │ └── native-lib.cpp
│ │ │ │ │ ├── java/
│ │ │ │ │ │ └── com/
│ │ │ │ │ │ └── example/
│ │ │ │ │ │ └── cls/
│ │ │ │ │ │ └── MainActivity.kt
│ │ │ │ │ └── res/
│ │ │ │ │ ├── drawable/
│ │ │ │ │ │ └── ic_launcher_background.xml
│ │ │ │ │ ├── drawable-v24/
│ │ │ │ │ │ └── ic_launcher_foreground.xml
│ │ │ │ │ ├── layout/
│ │ │ │ │ │ └── activity_main.xml
│ │ │ │ │ ├── mipmap-anydpi-v26/
│ │ │ │ │ │ ├── ic_launcher.xml
│ │ │ │ │ │ └── ic_launcher_round.xml
│ │ │ │ │ ├── values/
│ │ │ │ │ │ ├── colors.xml
│ │ │ │ │ │ ├── strings.xml
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── values-night/
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ └── xml/
│ │ │ │ │ ├── backup_rules.xml
│ │ │ │ │ └── data_extraction_rules.xml
│ │ │ │ ├── build.gradle
│ │ │ │ ├── gradle/
│ │ │ │ │ └── wrapper/
│ │ │ │ │ ├── gradle-wrapper.jar
│ │ │ │ │ └── gradle-wrapper.properties
│ │ │ │ ├── gradle.properties
│ │ │ │ ├── gradlew
│ │ │ │ ├── gradlew.bat
│ │ │ │ └── settings.gradle
│ │ │ ├── conf
│ │ │ ├── label_list
│ │ │ └── scp
│ │ ├── codelab/
│ │ │ ├── README.md
│ │ │ ├── decoder/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── README.md
│ │ │ │ ├── path.sh
│ │ │ │ ├── run.sh
│ │ │ │ └── valgrind.sh
│ │ │ ├── feat/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── README.md
│ │ │ │ ├── path.sh
│ │ │ │ ├── run.sh
│ │ │ │ └── valgrind.sh
│ │ │ ├── nnet/
│ │ │ │ ├── .gitignore
│ │ │ │ ├── README.md
│ │ │ │ ├── path.sh
│ │ │ │ ├── run.sh
│ │ │ │ └── valgrind.sh
│ │ │ └── u2/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ ├── decode.sh
│ │ │ │ ├── feat.sh
│ │ │ │ ├── nnet.sh
│ │ │ │ └── recognizer.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── custom_asr/
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ ├── compile_lexicon_token_fst.sh
│ │ │ │ ├── mk_slot_graph.sh
│ │ │ │ ├── mk_tlg_with_slot.sh
│ │ │ │ └── train_lm_with_slot.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── text_lm/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── local/
│ │ │ │ └── mmseg.py
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ ├── u2pp_ol/
│ │ │ ├── README.md
│ │ │ └── wenetspeech/
│ │ │ ├── .gitignore
│ │ │ ├── README.md
│ │ │ ├── RESULTS.md
│ │ │ ├── local/
│ │ │ │ ├── aishell_train_lms.sh
│ │ │ │ ├── decode.sh
│ │ │ │ ├── feat.sh
│ │ │ │ ├── nnet.sh
│ │ │ │ ├── recognizer.sh
│ │ │ │ ├── recognizer_fastdeploy.sh
│ │ │ │ ├── recognizer_quant.sh
│ │ │ │ ├── recognizer_wfst.sh
│ │ │ │ ├── recognizer_wfst_fastdeploy.sh
│ │ │ │ ├── run_build_tlg.sh
│ │ │ │ └── split_data.sh
│ │ │ ├── path.sh
│ │ │ └── run.sh
│ │ └── vad/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── conf/
│ │ │ └── vad.ini
│ │ ├── local/
│ │ │ ├── build.sh
│ │ │ ├── build_android.sh
│ │ │ ├── decode.sh
│ │ │ └── download.sh
│ │ ├── path.sh
│ │ ├── run.sh
│ │ └── vad-android-demo/
│ │ ├── .gradle/
│ │ │ ├── 6.1.1/
│ │ │ │ └── gc.properties
│ │ │ ├── buildOutputCleanup/
│ │ │ │ └── cache.properties
│ │ │ └── vcs-1/
│ │ │ └── gc.properties
│ │ ├── LICENSE.md
│ │ ├── README
│ │ ├── README.md
│ │ ├── build.gradle
│ │ ├── example/
│ │ │ ├── .gitignore
│ │ │ ├── build.gradle
│ │ │ ├── local.properties
│ │ │ ├── proguard-rules.pro
│ │ │ └── src/
│ │ │ ├── androidTest/
│ │ │ │ └── java/
│ │ │ │ └── com/
│ │ │ │ └── konovalov/
│ │ │ │ └── vad/
│ │ │ │ └── example/
│ │ │ │ └── ExampleInstrumentedTest.java
│ │ │ └── main/
│ │ │ ├── AndroidManifest.xml
│ │ │ ├── java/
│ │ │ │ └── com/
│ │ │ │ └── konovalov/
│ │ │ │ └── vad/
│ │ │ │ └── example/
│ │ │ │ ├── MainActivity.java
│ │ │ │ └── recorder/
│ │ │ │ ├── VoiceRecorder.java
│ │ │ │ └── VoiceRecorderConfig.java
│ │ │ └── res/
│ │ │ ├── drawable/
│ │ │ │ └── ic_launcher_background.xml
│ │ │ ├── drawable-v24/
│ │ │ │ └── ic_launcher_foreground.xml
│ │ │ ├── layout/
│ │ │ │ └── activity_main.xml
│ │ │ ├── mipmap-anydpi-v26/
│ │ │ │ ├── ic_launcher.xml
│ │ │ │ └── ic_launcher_round.xml
│ │ │ └── values/
│ │ │ ├── colors.xml
│ │ │ ├── strings.xml
│ │ │ └── styles.xml
│ │ ├── gradle/
│ │ │ └── wrapper/
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ │ ├── gradle.properties
│ │ ├── gradlew
│ │ ├── gradlew.bat
│ │ ├── local.properties
│ │ ├── settings.gradle
│ │ └── vad/
│ │ ├── .gitignore
│ │ ├── build.gradle
│ │ ├── consumer-rules.pro
│ │ ├── proguard-rules.pro
│ │ └── src/
│ │ ├── androidTest/
│ │ │ └── java/
│ │ │ └── com/
│ │ │ └── konovalov/
│ │ │ └── vad/
│ │ │ └── ExampleInstrumentedTest.java
│ │ └── main/
│ │ ├── AndroidManifest.xml
│ │ ├── cpp/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── includes/
│ │ │ │ └── vad_interface.h
│ │ │ └── native-lib.cpp
│ │ ├── java/
│ │ │ └── com/
│ │ │ └── konovalov/
│ │ │ └── vad/
│ │ │ ├── Vad.java
│ │ │ └── VadListener.java
│ │ └── res/
│ │ └── values/
│ │ └── strings.xml
│ ├── patch/
│ │ ├── CPPLINT.cfg
│ │ ├── README.md
│ │ └── openfst/
│ │ └── src/
│ │ ├── include/
│ │ │ └── fst/
│ │ │ ├── flags.h
│ │ │ └── log.h
│ │ └── lib/
│ │ └── flags.cc
│ └── tools/
│ ├── clang-format.sh
│ ├── setup_valgrind.sh
│ └── venv.sh
├── setup.cfg
├── setup.py
├── tests/
│ ├── benchmark/
│ │ ├── conformer/
│ │ │ ├── README.md
│ │ │ ├── prepare.sh
│ │ │ ├── run.sh
│ │ │ └── run_benchmark.sh
│ │ └── pwgan/
│ │ ├── README.md
│ │ ├── run_all.sh
│ │ └── run_benchmark.sh
│ ├── chains/
│ │ ├── ds2/
│ │ │ ├── README.md
│ │ │ ├── ds2_params_lite_train_infer.txt
│ │ │ ├── ds2_params_whole_train_infer.txt
│ │ │ ├── lite_train_infer.sh
│ │ │ ├── prepare.sh
│ │ │ ├── speedyspeech_params_lite.txt
│ │ │ ├── test.sh
│ │ │ └── whole_train_infer.sh
│ │ └── speedyspeech/
│ │ ├── README.md
│ │ ├── infer.sh
│ │ ├── lite_train_infer.sh
│ │ ├── prepare.sh
│ │ ├── speedyspeech_params_lite_multi_gpu.txt
│ │ ├── speedyspeech_params_lite_single_gpu.txt
│ │ ├── speedyspeech_params_whole_multi_gpu.txt
│ │ ├── speedyspeech_params_whole_single_gpu.txt
│ │ ├── test.sh
│ │ └── whole_train_infer.sh
│ ├── test_tipc/
│ │ ├── barrier.sh
│ │ ├── benchmark_train.sh
│ │ ├── common_func.sh
│ │ ├── configs/
│ │ │ ├── conformer/
│ │ │ │ └── train_infer_python.txt
│ │ │ ├── mdtc/
│ │ │ │ └── train_infer_python.txt
│ │ │ └── pwgan/
│ │ │ └── train_infer_python.txt
│ │ ├── conformer/
│ │ │ └── scripts/
│ │ │ └── aishell_tiny.py
│ │ ├── docs/
│ │ │ └── benchmark_train.md
│ │ ├── prepare.sh
│ │ └── test_train_inference_python.sh
│ └── unit/
│ ├── asr/
│ │ ├── deepspeech2_model_test.py
│ │ ├── deepspeech2_online_model_test.py
│ │ ├── deepspeech2_online_model_test.sh
│ │ ├── error_rate_test.py
│ │ ├── mask_test.py
│ │ ├── reverse_pad_list.py
│ │ └── u2_model_test.py
│ ├── audiotools/
│ │ ├── core/
│ │ │ ├── test_audio_signal.py
│ │ │ ├── test_bands.py
│ │ │ ├── test_display.py
│ │ │ ├── test_dsp.py
│ │ │ ├── test_effects.py
│ │ │ ├── test_fftconv.py
│ │ │ ├── test_grad.py
│ │ │ ├── test_highpass.py
│ │ │ ├── test_loudness.py
│ │ │ ├── test_lowpass.py
│ │ │ └── test_util.py
│ │ ├── data/
│ │ │ ├── test_datasets.py
│ │ │ ├── test_preprocess.py
│ │ │ └── test_transforms.py
│ │ ├── ml/
│ │ │ ├── test_decorators.py
│ │ │ └── test_model.py
│ │ ├── test_audiotools.sh
│ │ └── test_post.py
│ ├── ci.sh
│ ├── cli/
│ │ ├── aishell_test_prepare.py
│ │ ├── calc_RTF_CER_by_aishell.sh
│ │ ├── path.sh
│ │ └── test_cli.sh
│ ├── doc/
│ │ └── test_cli.md
│ ├── server/
│ │ ├── offline/
│ │ │ ├── change_yaml.py
│ │ │ ├── conf/
│ │ │ │ └── application.yaml
│ │ │ └── test_server_client.sh
│ │ └── online/
│ │ └── tts/
│ │ ├── check_server/
│ │ │ ├── change_yaml.py
│ │ │ ├── conf/
│ │ │ │ └── application.yaml
│ │ │ ├── test.sh
│ │ │ ├── test_all.sh
│ │ │ └── tts_online_application.yaml
│ │ └── test_server/
│ │ └── test_http_client.py
│ ├── tts/
│ │ ├── test_data_table.py
│ │ ├── test_enfrontend.py
│ │ ├── test_expansion.py
│ │ ├── test_fftconv1d.py
│ │ ├── test_losses.py
│ │ ├── test_mixfrontend.py
│ │ ├── test_optimizer.py
│ │ ├── test_pwg.py
│ │ ├── test_raise.py
│ │ ├── test_reporter.py
│ │ ├── test_snapshot.py
│ │ ├── test_ssml.py
│ │ ├── test_stft.py
│ │ └── test_to_static.py
│ └── vector/
│ ├── conftest.py
│ └── test_augment.py
├── third_party/
│ ├── README.md
│ ├── __init__.py
│ ├── ctc_decoders/
│ │ ├── .gitignore
│ │ ├── COPYING.APACHE2.0
│ │ ├── COPYING.LESSER.3
│ │ ├── LICENSE
│ │ ├── __init__.py
│ │ ├── ctc_beam_search_decoder.cpp
│ │ ├── ctc_beam_search_decoder.h
│ │ ├── ctc_greedy_decoder.cpp
│ │ ├── ctc_greedy_decoder.h
│ │ ├── decoder_utils.cpp
│ │ ├── decoder_utils.h
│ │ ├── decoders.i
│ │ ├── path_trie.cpp
│ │ ├── path_trie.h
│ │ ├── scorer.cpp
│ │ ├── scorer.h
│ │ ├── setup.py
│ │ └── setup.sh
│ ├── install.sh
│ ├── install_win_ctc.bat
│ └── python_kaldi_features/
│ ├── .gitignore
│ ├── LICENSE
│ ├── MANIFEST
│ ├── README.rst
│ ├── docs/
│ │ ├── Makefile
│ │ ├── make.bat
│ │ └── source/
│ │ ├── conf.py
│ │ └── index.rst
│ ├── example.py
│ ├── python_speech_features/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── base_orig.py
│ │ ├── sigproc.py
│ │ └── sigproc_orig.py
│ ├── requirements.txt
│ ├── setup.py
│ └── test/
│ └── test_sigproc.py
├── tools/
│ ├── Dockerfile
│ ├── Makefile
│ ├── extras/
│ │ ├── README.md
│ │ ├── install_autolog.sh
│ │ ├── install_gcc.sh
│ │ ├── install_kaldi.sh
│ │ ├── install_kenlm.sh
│ │ ├── install_liblbfgs.sh
│ │ ├── install_mfa_v1.sh
│ │ ├── install_mfa_v2.sh
│ │ ├── install_miniconda.sh
│ │ ├── install_mkl.sh
│ │ ├── install_ngram.sh
│ │ ├── install_openblas.sh
│ │ ├── install_openfst.sh
│ │ ├── install_pynini.sh
│ │ ├── install_sclite.sh
│ │ ├── install_soundfile.sh
│ │ ├── install_sox.sh
│ │ ├── install_srilm.sh
│ │ ├── install_venv.sh
│ │ └── srilm.patch
│ ├── get_contributors.ipynb
│ ├── pre_commit.sh
│ ├── release_note.py
│ ├── setup_anaconda.sh
│ └── watermark.py
└── utils/
├── DER.py
├── README.md
├── __init__.py
├── addjson.py
├── apply-cmvn.py
├── avg.sh
├── avg_model.py
├── build_kenlm_model_from_arpa.sh
├── build_vocab.py
├── caculate_rtf.py
├── compute-cmvn-stats.py
├── compute-wer.py
├── compute_mean_std.py
├── compute_statistics.py
├── copy-feats.py
├── data2json.sh
├── dump.sh
├── dump_manifest.py
├── duration_from_maniefst.sh
├── espnet_json_to_manifest.py
├── feat-to-shape.py
├── feat_to_shape.sh
├── filter.py
├── filter_scp.pl
├── format_data.py
├── format_rsl.py
├── format_triplet_data.py
├── fst/
│ ├── add_lex_disambig.pl
│ ├── compile_lexicon_token_fst.sh
│ ├── ctc_token_fst.py
│ ├── ctc_token_fst_corrected.py
│ ├── eps2disambig.pl
│ ├── make_lexicon_fst.pl
│ ├── make_tlg.sh
│ ├── prepare_dict.py
│ ├── remove_oovs.pl
│ ├── rnnt_token_fst.py
│ └── s2eps.pl
├── gen_duration_from_textgrid.py
├── generate_infer_yaml.py
├── json2trn.py
├── link_wav.py
├── log.sh
├── manifest_key_value.py
├── md-eval.pl
├── merge_scp2json.py
├── ngram_train.sh
├── pack_model.sh
├── parallel/
│ └── run.pl
├── parse_options.sh
├── pd_env_collect.sh
├── profile.sh
├── reduce_data_dir.sh
├── remove_longshortdata.py
├── remove_longshortdata.sh
├── score_sclite.sh
├── scp2json.py
├── show_results.sh
├── spk2utt_to_utt2spk.pl
├── split_data.sh
├── split_json.sh
├── split_scp.pl
├── spm_decode
├── spm_encode
├── spm_train
├── tarball.sh
├── text2token.py
├── text_to_lexicon.py
├── tokenizer.perl
├── train_arpa_with_kenlm.sh
├── update_json.sh
├── utility.sh
├── utt2spk_to_spk2utt.pl
└── zh_tn.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .clang-format
================================================
# This file is used by clang-format to autoformat paddle source code
#
# The clang-format is part of llvm toolchain.
# You need to install llvm and clang to format the source code.
#
# The basic usage is,
# clang-format -i -style=file PATH/TO/SOURCE/CODE
#
# The -style=file option implicitly uses the ".clang-format" file located in
# one of the parent directories.
# The -i option means the files are modified in place.
#
# The clang-format documentation is available at
# http://clang.llvm.org/docs/ClangFormat.html
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
Language: Cpp
BasedOnStyle: Google
IndentWidth: 4
TabWidth: 4
ContinuationIndentWidth: 4
MaxEmptyLinesToKeep: 2
AccessModifierOffset: -2 # The private/protected/public has no indent in class
Standard: Cpp11
AllowAllParametersOfDeclarationOnNextLine: true
BinPackParameters: false
BinPackArguments: false
...
================================================
FILE: .flake8
================================================
[flake8]
########## OPTIONS ##########
# Set the maximum length that any line (with some exceptions) may be.
max-line-length = 120
################### FILE PATTERNS ##########################
# Provide a comma-separated list of glob patterns to exclude from checks.
exclude =
# git folder
.git,
# python cache
__pycache__,
# third party
utils/compute-wer.py,
third_party/,
# Provide a comma-separated list of glob patterns to include for checks.
filename =
*.py
########## RULES ##########
# ERROR CODES
#
# E/W - PEP8 errors/warnings (pycodestyle)
# F - linting errors (pyflakes)
# C - McCabe complexity error (mccabe)
#
# W503 - line break before binary operator
# Specify a list of codes to ignore.
ignore =
W503
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125,E129
W291,W293,W605
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
per-file-ignores =
*/__init__.py: F401
# Specify the list of error codes you wish Flake8 to report.
select =
E,
W,
F,
C
================================================
FILE: .gitconfig
================================================
[alias]
st = status
ci = commit
br = branch
co = checkout
df = diff
l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
ll = log --stat
[merge]
tool = vimdiff
[core]
excludesfile = ~/.gitignore
editor = vim
[color]
branch = auto
diff = auto
status = auto
[color "branch"]
current = yellow reverse
local = yellow
remote = green
[color "diff"]
meta = yellow bold
frag = magenta bold
old = red bold
new = green bold
[color "status"]
added = yellow
changed = green
untracked = cyan
[push]
default = matching
[credential]
helper = store
[user]
name =
email =
================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Racial or political allusions
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at paddlespeech@baidu.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
================================================
FILE: .github/CONTRIBUTING.md
================================================
# 💡 paddlespeech 提交代码须知
### Discussed in https://github.com/PaddlePaddle/PaddleSpeech/discussions/1326
Originally posted by **yt605155624** January 12, 2022
1. 写完代码之后可以用我们的 pre-commit 检查一下代码格式,注意只改自己修改的代码的格式即可,其他的代码有可能也被改了格式,不要 add 就好
```
pip install pre-commit
pre-commit run --file 你修改的代码
```
2. 提交 commit 中增加必要信息跳过不必要的 CI
- 提交 asr 相关代码
```text
git commit -m "xxxxxx, test=asr"
```
- 提交 tts 相关代码
```text
git commit -m "xxxxxx, test=tts"
```
- 仅修改文档
```text
git commit -m "xxxxxx, test=doc"
```
注意:
1. 虽然跳过了 CI,但是还要先排队排到才能跳过,所以非自己方向看到 pending 不要着急 🤣
2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`,因为每个 commit 都会触发 CI
4. 删除 python 环境中已经安装好的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
================================================
FILE: .github/ISSUE_TEMPLATE/bug-report-s2t.md
================================================
---
name: "\U0001F41B S2T Bug Report"
about: Create a report to help us improve
title: "[S2T]XXXX"
labels: Bug, S2T
assignees: zh794390558
---
For support and discussions, please use our [GitHub Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions).
If you've found a bug then please create an issue with the following information:
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- OS: [e.g. Ubuntu]
- GCC/G++ Version [e.g. 8.3]
- Python Version [e.g. 3.7]
- PaddlePaddle Version [e.g. 2.0.0]
- Model Version [e.g. 2.0.0]
- GPU/DRIVER Information [e.g. Tesla V100-SXM2-32GB/440.64.00]
- CUDA/CUDNN Version [e.g. cuda-10.2]
- MKL Version
- TensorRT Version
**Additional context**
Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/bug-report-tts.md
================================================
---
name: "\U0001F41B TTS Bug Report"
about: Create a report to help us improve
title: "[TTS]XXXX"
labels: Bug, T2S
---
For support and discussions, please use our [GitHub Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions).
If you've found a bug then please create an issue with the following information:
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Environment (please complete the following information):**
- OS: [e.g. Ubuntu]
- GCC/G++ Version [e.g. 8.3]
- Python Version [e.g. 3.7]
- PaddlePaddle Version [e.g. 2.0.0]
- Model Version [e.g. 2.0.0]
- GPU/DRIVER Information [e.g. Tesla V100-SXM2-32GB/440.64.00]
- CUDA/CUDNN Version [e.g. cuda-10.2]
- MKL Version
- TensorRT Version
**Additional context**
Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.md
================================================
---
name: "\U0001F680 Feature Request"
about: As a user, I want to request a New Feature on the product.
title: ''
labels: feature request
assignees: D-DanielYang, iftaken
---
## Feature Request
**Is your feature request related to a problem? Please describe:**
**Describe the feature you'd like:**
**Describe alternatives you've considered:**
================================================
FILE: .github/ISSUE_TEMPLATE/others.md
================================================
---
name: "\U0001F9E9 Others"
about: Report any other non-support related issues.
title: ''
labels: ''
assignees: ''
---
## Others
================================================
FILE: .github/ISSUE_TEMPLATE/question.md
================================================
---
name: "\U0001F914 Ask a Question"
about: I want to ask a question.
title: ''
labels: Question
assignees: ''
---
## General Question
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
### PR types
### PR changes
### Describe
================================================
FILE: .github/stale.yml
================================================
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 45
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 30
# Issues with these labels will never be considered stale
exemptLabels:
- Roadmap
- Bug
- feature request
- Tips
# Label to use when marking an issue as stale
staleLabel: Stale
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
unmarkComment: false
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: >
This issue is closed. Please re-open if needed.
================================================
FILE: .gitignore
================================================
.DS_Store
*.pyc
.vscode
*.log
*.wav
*.pdmodel
*.pdiparams*
*.zip
*.tar
*.tar.gz
.ipynb_checkpoints
*.npz
*.done
*.whl
*.egg-info
build
*output/
.history
.idea
audio/dist/
audio/fc_patch/
docs/build/
docs/topic/ctc/warp-ctc/
tools/venv
tools/kenlm
tools/sox-14.4.2
tools/soxbindings
tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
tools/kaldi
tools/OpenBLAS/
tools/Miniconda3-latest-Linux-x86_64.sh
tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
tools/liblbfgs-1.10/
tools/srilm/
tools/env.sh
tools/openfst-1.8.1/
tools/libsndfile/
tools/python-soundfile/
tools/onnx
tools/onnxruntime
tools/Paddle2ONNX
tools/onnx-simplifier/
speechx/fc_patch/
third_party/ctc_decoders/paddlespeech_ctcdecoders.py
kernel_meta/
================================================
FILE: .mergify.yml
================================================
pull_request_rules:
- name: automatic merge for develop when CI passes and 1 reviews
conditions:
- "approved-reviews-by>=1"
- check-success=Travis CI - Pull Request
- base=develop
actions:
merge:
method: merge
- name: delete head branch after merged
conditions:
- merged
actions:
delete_head_branch: {}
- name: "add label=auto-merge for PR by mergify"
conditions:
- author=mergify[bot]
actions:
label:
add: ["auto-merge"]
- name: warn on conflicts
conditions:
- conflict
actions:
comment:
message: This pull request is now in conflict :(
label:
add: ["conflicts"]
- name: unlabel conflicts
conditions:
- -conflict
actions:
label:
remove: ["conflicts"]
- name: "auto add label=Dataset"
conditions:
- files~=^dataset/
actions:
label:
add: ["Dataset"]
- name: "auto add label=S2T"
conditions:
- files~=^paddlespeech/s2t/
actions:
label:
add: ["S2T"]
- name: "auto add label=T2S"
conditions:
- files~=^paddlespeech/t2s/
actions:
label:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- files~=^paddlespeech/audio/
actions:
label:
add: ["Audio"]
- name: "auto add label=Vector"
conditions:
- files~=^paddlespeech/vector/
actions:
label:
add: ["Vector"]
- name: "auto add label=Text"
conditions:
- files~=^paddlespeech/text/
actions:
label:
add: ["Text"]
- name: "auto add label=Example"
conditions:
- files~=^examples/
actions:
label:
add: ["Example"]
- name: "auto add label=CLI"
conditions:
- files~=^paddlespeech/cli
actions:
label:
add: ["CLI"]
- name: "auto add label=Server"
conditions:
- files~=^paddlespeech/server
actions:
label:
add: ["Server"]
- name: "auto add label=Demo"
conditions:
- files~=^demos/
actions:
label:
add: ["Demo"]
- name: "auto add label=README"
conditions:
- files~=(README.md|README_cn.md)
actions:
label:
add: ["README"]
- name: "auto add label=Documentation"
conditions:
- files~=^(docs/|CHANGELOG.md)
actions:
label:
add: ["Documentation"]
- name: "auto add label=CI"
conditions:
- files~=^(.circleci/|ci/|.github/|.travis.yml|.travis|env.sh)
actions:
label:
add: ["CI"]
- name: "auto add label=Installation"
conditions:
- files~=^(tools/|setup.py|setup.cfg|setup_audio.py)
actions:
label:
add: ["Installation"]
- name: "auto add label=Test"
conditions:
- files~=^(tests/)
actions:
label:
add: ["Test"]
- name: "auto add label=mergify"
conditions:
- files~=^.mergify.yml
actions:
label:
add: ["mergify"]
- name: "auto add label=Docker"
conditions:
- files~=^docker/
actions:
label:
add: ["Docker"]
- name: "auto add label=Deployment"
conditions:
- files~=^runtime/
actions:
label:
add: ["Deployment"]
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/pre-commit/mirrors-yapf.git
rev: v0.16.0
hooks:
- id: yapf
files: \.py$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*paddle)^.*$
- id: end-of-file-fixer
files: \.md$
#- id: trailing-whitespace
# files: \.md$
- id: requirements-txt-fixer
exclude: (?=third_party).*$
- id: check-yaml
- id: check-json
- id: pretty-format-json
args:
- --no-sort-keys
- --autofix
- id: check-merge-conflict
# - id: flake8
# args:
# - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
# - --builtins=G,request
# - --jobs=1
# exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
hooks:
- id: forbid-crlf
files: \.md$
- id: remove-crlf
files: \.md$
- id: forbid-tabs
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: clang-format
name: clang-format
description: Format files with ClangFormat
entry: bash .pre-commit-hooks/clang-format.hook -i
language: system
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- id: cpplint
name: cpplint
description: Static code analysis of C/C++ files
language: python
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=runtime/engine/kaldi|runtime/engine/common/matrix|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
- repo: https://github.com/asottile/reorder_python_imports
rev: v2.4.0
hooks:
- id: reorder-python-imports
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
================================================
FILE: .pre-commit-hooks/clang-format.hook
================================================
#!/usr/bin/env bash
# Wrapper invoked by the local pre-commit "clang-format" hook: it forwards every
# argument it receives (flags and file names) to clang-format.
set -e

readonly VERSION="3.9"

# Because of `set -e`, this query also aborts the hook early when clang-format
# cannot be executed.
version=$(clang-format -version)

# The strict version check below is currently commented out, so any installed
# clang-format version is accepted.
# if ! [[ $version == *"$VERSION"* ]]; then
#     echo "clang-format version check failed."
#     echo "a version contains '$VERSION' is needed, but get '$version'"
#     echo "you can install the right version, and make an soft-link to '\$PATH' env"
#     exit -1
# fi

# "$@" must be quoted: an unquoted $@ is word-split and glob-expanded, which
# breaks file names containing spaces or wildcard characters.
clang-format "$@"
================================================
FILE: .pre-commit-hooks/copyright-check.hook
================================================
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import datetime
import io
import os
import platform
import re
import subprocess
import sys
COPYRIGHT = '''
Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
# Module-level placeholder; generate_copyright() picks its own local comment
# prefix per language.
LANG_COMMENT_MARK = None

# Separator used to split the COPYRIGHT template and to terminate generated
# header lines. It is "\n" on every platform: Python normalizes the newlines
# inside string literals to "\n", and main() reads/writes files in text mode,
# which already translates "\n" to the platform convention. The previous
# Windows-only "\r\n" made COPYRIGHT.split(NEW_LINE_MARK)[1] raise IndexError.
NEW_LINE_MARK = '\n'

# Second element of the split template, i.e. the "Copyright (c) <year> ..."
# line that follows the template's leading newline.
COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]

# Four-digit year currently written in the template (raw string: "\d" is not a
# valid escape sequence in a normal string literal).
p = re.search(r'(\d{4})', COPYRIGHT_HEADER).group(0)

# Current year from the standard library rather than the external `date`
# command, which is not available as an executable on Windows.
date = str(datetime.date.today().year)

# Stamp the header with the current year.
COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
    """Render ``template`` as a block of single-line comments.

    Args:
        template: License text; it is split on the module-level
            ``NEW_LINE_MARK``. Its first two elements and its last element
            are skipped (the module-level ``COPYRIGHT_HEADER`` is emitted in
            place of the copyright line).
        lang: ``'Python'`` selects ``#`` comments; any other value selects
            ``//`` comments.

    Returns:
        The commented header, each line terminated by ``NEW_LINE_MARK``,
        followed by one extra ``"\\n"``.
    """
    comment_mark = '#' if lang == 'Python' else "//"
    template_lines = template.split(NEW_LINE_MARK)

    # The header line always comes first, carrying the up-to-date year.
    pieces = [comment_mark + " " + COPYRIGHT_HEADER + NEW_LINE_MARK]

    last_index = len(template_lines) - 1
    for index, text in enumerate(template_lines):
        if index in (0, 1, last_index):
            continue
        # Empty template lines become a bare comment mark (no trailing space).
        separator = " " if text else ""
        pieces.append(comment_mark + separator + text + NEW_LINE_MARK)

    return "".join(pieces) + "\n"
def lang_type(filename):
    """Map a file name to the comment style used for its copyright header.

    Args:
        filename: Name or path of the file; only its suffix is inspected.

    Returns:
        ``"Python"`` for ``.py`` files and ``"C"`` for every suffix listed in
        ``c_style_suffixes`` below.

    Raises:
        SystemExit: with status 0 when the suffix is not supported; a message
            naming the file is printed first.
    """
    if filename.endswith(".py"):
        return "Python"

    c_style_suffixes = (".h", ".c", ".hpp", ".cc", ".cpp", ".cu", ".cuh",
                        ".go", ".proto")
    if filename.endswith(c_style_suffixes):
        return "C"

    # The file name has to be interpolated into the message; passing it as a
    # second print() argument left a literal "%s" in the output.
    print("Unsupported filetype {}".format(filename))
    sys.exit(0)
# Matches a comment line that declares a source encoding ("# ... coding: <name>"
# or "# ... coding=<name>"); main() skips files whose first or second line
# matches.
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
def main(argv=None):
    """Insert a copyright header into every given file that lacks one.

    A file is left untouched when its first line already contains
    "COPYRIGHT (C)" (case-insensitive), when its first line starts with "#!",
    or when its first or second line matches ``PYTHON_ENCODE``.

    Args:
        argv: Argument list handed to argparse; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        1 if at least one file was rewritten, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        # The context manager closes the handle on every path, including the
        # early `continue`s (the handles were previously never closed).
        with io.open(filename, encoding="utf-8") as fd:
            first_line = fd.readline()
            second_line = fd.readline()
            if "COPYRIGHT (C)" in first_line.upper():
                continue
            if (first_line.startswith("#!")
                    or PYTHON_ENCODE.match(second_line) is not None
                    or PYTHON_ENCODE.match(first_line) is not None):
                continue
            # Rewind and read the complete file for the rewrite.
            fd.seek(0)
            original_contents = fd.read()

        language = lang_type(filename)
        new_contents = generate_copyright(COPYRIGHT, language) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        # Write back with the same encoding the file was read with, instead of
        # the locale default, so non-ASCII content survives the round trip.
        with io.open(filename, 'w', encoding="utf-8") as output_file:
            output_file.write(new_contents)
    return retv
# Script entry point: the process exit status is main()'s return value.
if __name__ == '__main__':
    exit(main())
================================================
FILE: .readthedocs.yml
================================================
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Build documentation with MkDocs
#mkdocs:
# configuration: mkdocs.yml
# Optionally build your docs in additional formats such as PDF
formats: []
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- method: setuptools
path: .
system_packages: true
================================================
FILE: .style.yapf
================================================
[style]
based_on_style = pep8
column_limit = 80
================================================
FILE: .travis.yml
================================================
language: cpp
cache: ccache
sudo: required
dist: Bionic
services:
- docker
os:
- linux
env:
- JOB=PRE_COMMIT
addons:
apt:
packages:
- git
- python3-pip
- python3-dev
before_install:
- python3 --version
- python3 -m pip --version
- pip3 --version
- sudo pip3 install -U virtualenv pre-commit pip
- docker pull paddlepaddle/paddle:latest
script:
- exit_code=0
- docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c
'cd /py_unittest && bash .travis/precommit.sh && source env.sh && bash .travis/unittest.sh' || exit_code=$(( exit_code | $? ))
exit $exit_code
notifications:
email:
on_success: change
on_failure: always
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include paddlespeech/t2s/exps/*.txt
include paddlespeech/t2s/frontend/*.yaml
================================================
FILE: README.md
================================================
([简体中文](./README_cn.md)|English)
------------------------------------------------------------------------------------
**PaddleSpeech** is an open-source toolkit on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with state-of-the-art and influential models.
**PaddleSpeech** won the [NAACL2022 Best Demo Award](https://2022.naacl.org/blog/best-demo-award/), please check out our paper on [Arxiv](https://arxiv.org/abs/2205.12007).
##### Speech Recognition
| Input Audio |
Recognition Result |

|
I knocked at the door on the ancient side of the building. |

|
我认为跑步最重要的就是给我带来了身体健康。 |
##### Speech Translation (English to Chinese)
| Input Audio |
Translations Result |

|
我 在 这栋 建筑 的 古老 门上 敲门。 |
##### Text-to-Speech
| Input Text |
Synthetic Audio |
| Life was like a box of chocolates, you never know what you're gonna get. |

|
| 早上好,今天是2020/10/29,最低温度是-3°C。 |

|
| 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 |

|
| 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 |

|
| 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 |

|
| 各个国家有各个国家嘅国歌 |

|
For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html).
##### Punctuation Restoration
| Input Text |
Output Text |
| 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 |
今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。 |
### Features
Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features:
- 📦 **Ease of Use**: low barriers to install, [CLI](#quick-start), [Server](#quick-start-server), and [Streaming Server](#quick-start-streaming-server) are available to quick-start your journey.
- 🏆 **Align to the State-of-the-Art**: we provide high-speed and ultra-lightweight models, and also cutting-edge technology.
- 🏆 **Streaming ASR and TTS System**: we provide production ready streaming asr and streaming tts system.
- 💯 **Rule-based Chinese frontend**: our frontend contains Text Normalization and Grapheme-to-Phoneme (G2P, including Polyphone and Tone Sandhi). Moreover, we use self-defined linguistic rules to adapt Chinese context.
- 📦 **Varieties of Functions that Vitalize both Industrial and Academia**:
- 🛎️ *Implementation of critical audio tasks*: this toolkit contains audio functions like Automatic Speech Recognition, Text-to-Speech Synthesis, Speaker Verification, KeyWord Spotting, Audio Classification, and Speech Translation, etc.
- 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details.
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
- 🎉 2025.09.01: Add [Whisper large v3 and turbo model](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/whisper).
- 🤗 2025.08.11: Add [code-switch online model and server demo](./examples/tal_cs/asr1/).
- 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech.
- 🎉 2023.05.18: Add [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), Squeezeformer training for ASR on Aishell.
- 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech.
- ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), with the upgrade of paddlepaddle==2.5, the problem of modifying 0-d tensor has been solved.
- 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
- 🔥 2023.03.14: Add SVS(Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized.
- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
- 🔥 2023.03.03: Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
- 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](./examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
- 🎉 2022.11.30: Add [TTS Android Demo](./demos/TTSAndroid).
- 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
of paddlepaddle](https://www.paddlepaddle.org.cn/models).
- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), Support ASR and Feature Extraction.
- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
- 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
- 👑 2022.10.11: Add [Wav2vec2ASR-en](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and [ERNIE-SAT](https://arxiv.org/abs/2211.03545) in [PaddleSpeech Web Demo](./demos/speech_web).
- ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder.
- ⚡ 2022.08.25: Release TTS [finetune](./examples/other/tts_finetune/tts3) example.
- 🔥 2022.08.22: Add [ERNIE-SAT](https://arxiv.org/abs/2211.03545) models: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat).
- 🔥 2022.08.15: Add [g2pW](https://github.com/GitYCC/g2pW) into TTS Chinese Text Frontend.
- 🔥 2022.08.09: Release [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
- ⚡ 2022.08.03: Add ONNXRuntime infer for TTS CLI.
- 🎉 2022.07.18: Release VITS: [VITS-csmsc](./examples/csmsc/vits)、[VITS-aishell3](./examples/aishell3/vits)、[VITS-VC](./examples/aishell3/vits-vc).
- 🎉 2022.06.22: All TTS models support ONNX format.
- 🍀 2022.06.17: Add [PaddleSpeech Web Demo](./demos/speech_web).
- 👑 2022.05.13: Release [PP-ASR](./docs/source/asr/PPASR.md)、[PP-TTS](./docs/source/tts/PPTTS.md)、[PP-VPR](docs/source/vpr/PPVPR.md).
- 👏🏻 2022.05.06: `PaddleSpeech Streaming Server` is available for `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp` and `Text-to-Speech`.
- 👏🏻 2022.05.06: `PaddleSpeech Server` is available for `Audio Classification`, `Automatic Speech Recognition` and `Text-to-Speech`, `Speaker Verification` and `Punctuation Restoration`.
- 👏🏻 2022.03.28: `PaddleSpeech CLI` is available for `Speaker Verification`.
- 👏🏻 2021.12.10: `PaddleSpeech CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`.
### Community
- Scan the QR code below with WeChat to join the official technical exchange group and get the bonus (more than 20GB of learning materials, such as papers, code and videos) as well as the live link of the lessons. We look forward to your participation.
## Installation
We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.8*.
### **Dependency Introduction**
+ gcc >= 4.8.5
+ paddlepaddle
+ python >= 3.8
+ OS support: Linux (recommended), Windows, Mac OSX
PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version.
```bash
pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
```
You can also specify the version of paddlepaddle or install the develop version.
```bash
# install 2.4.1 version. Note, 2.4.1 is just an example, please follow the minimum dependency of paddlepaddle for your selection
pip install paddlepaddle==2.4.1 -i https://mirror.baidu.com/pypi/simple
# install develop version
pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
```
There are two quick installation methods for PaddleSpeech, one is pip installation, and the other is source code compilation (recommended).
### pip install
```shell
pip install pytest-runner
pip install paddlespeech
```
### source code compilation
```shell
git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech
pip install pytest-runner
pip install .
# If you need to install in editable mode, you need to use --use-pep517. The command is as follows:
# pip install -e . --use-pep517
```
For more installation problems, such as conda environment, librosa dependencies, gcc problems, kaldi installation, etc., you can refer to this [installation document](./docs/source/install.md). If you encounter problems during installation, you can leave a message on [#2150](https://github.com/PaddlePaddle/PaddleSpeech/issues/2150) and find related problems there.
## Quick Start
Developers can try our models with [PaddleSpeech Command Line](./paddlespeech/cli/README.md) or Python. Change `--input` to test your own audio/text; 16k wav format audio is supported.
**You can also quickly experience it in AI Studio 👉🏻 [PaddleSpeech API Demo](https://aistudio.baidu.com/aistudio/projectdetail/4353348?sUid=2470186&shared=1&ts=1660876445786)**
Test audio sample download
```shell
wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav
```
### Automatic Speech Recognition
(Click to expand)Open Source Speech Recognition
**command line experience**
```shell
paddlespeech asr --lang zh --input zh.wav
```
**Python API experience**
```python
>>> from paddlespeech.cli.asr.infer import ASRExecutor
>>> asr = ASRExecutor()
>>> result = asr(audio_file="zh.wav")
>>> print(result)
我认为跑步最重要的就是给我带来了身体健康
```
### Text-to-Speech
Open Source Speech Synthesis
Output 24k sample rate wav format audio
**command line experience**
```shell
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
**Python API experience**
```python
>>> from paddlespeech.cli.tts.infer import TTSExecutor
>>> tts = TTSExecutor()
>>> tts(text="今天天气十分不错。", output="output.wav")
```
- You can experience it in [Huggingface Spaces](https://huggingface.co/spaces) [TTS Demo](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS)
### Audio Classification
An open-domain sound classification tool
Sound classification model based on 527 categories of AudioSet dataset
**command line experience**
```shell
paddlespeech cls --input zh.wav
```
**Python API experience**
```python
>>> from paddlespeech.cli.cls.infer import CLSExecutor
>>> cls = CLSExecutor()
>>> result = cls(audio_file="zh.wav")
>>> print(result)
Speech 0.9027186632156372
```
### Voiceprint Extraction
Industrial-grade voiceprint extraction tool
**command line experience**
```shell
paddlespeech vector --task spk --input zh.wav
```
**Python API experience**
```python
>>> from paddlespeech.cli.vector import VectorExecutor
>>> vec = VectorExecutor()
>>> result = vec(audio_file="zh.wav")
>>> print(result) # 187-dimensional vector
[ -0.19083306 9.474295 -14.122263 -2.0916545 0.04848729
4.9295826 1.4780062 0.3733844 10.695862 3.2697146
-4.48199 -0.6617882 -9.170393 -11.1568775 -1.2358263 ...]
```
### Punctuation Restoration
Quick recovery of text punctuation, works with ASR models
**command line experience**
```shell
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
**Python API experience**
```python
>>> from paddlespeech.cli.text.infer import TextExecutor
>>> text_punc = TextExecutor()
>>> result = text_punc(text="今天的天气真不错啊你下午有空吗我想约你一起去吃饭")
今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
```
### Speech Translation
End-to-end English to Chinese Speech Translation Tool
Uses pre-compiled kaldi-related tools, so it is only supported on Ubuntu systems
**command line experience**
```shell
paddlespeech st --input en.wav
```
**Python API experience**
```python
>>> from paddlespeech.cli.st.infer import STExecutor
>>> st = STExecutor()
>>> result = st(audio_file="en.wav")
['我 在 这栋 建筑 的 古老 门上 敲门 。']
```
## Quick Start Server
Developers can have a try of our speech server with [PaddleSpeech Server Command Line](./paddlespeech/server/README.md).
**You can try it quickly in AI Studio (recommend): [SpeechServer](https://aistudio.baidu.com/aistudio/projectdetail/4354592?sUid=2470186&shared=1&ts=1660877827034)**
**Start server**
```shell
paddlespeech_server start --config_file ./demos/speech_server/conf/application.yaml
```
**Access Speech Recognition Services**
```shell
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
**Access Text to Speech Services**
```shell
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
```
**Access Audio Classification Services**
```shell
paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
```
For more information about server command lines, please see: [speech server demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server)
## Quick Start Streaming Server
Developers can have a try of [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md) server.
**Start Streaming Speech Recognition Server**
```
paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml
```
**Access Streaming Speech Recognition Services**
```
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
**Start Streaming Text to Speech Server**
```
paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml
```
**Access Streaming Text to Speech Services**
```
paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
```
For more information please see: [streaming asr](./demos/streaming_asr_server/README.md) and [streaming tts](./demos/streaming_tts_server/README.md)
## Model List
PaddleSpeech supports a series of most popular models. They are summarized in [released models](./docs/source/released_model.md) and attached with available pretrained models.
**Speech-to-Text** contains *Acoustic Model*, *Language Model*, and *Speech Translation*, with the following details:
**Text-to-Speech** in PaddleSpeech mainly contains three modules: *Text Frontend*, *Acoustic Model* and *Vocoder*. Acoustic Model and Vocoder models are listed as follows:
**Audio Classification**
| Task |
Dataset |
Model Type |
Example |
| Audio Classification |
ESC-50 |
PANN |
pann-esc50
|
**Keyword Spotting**
| Task |
Dataset |
Model Type |
Example |
| Keyword Spotting |
hey-snips |
MDTC |
mdtc-hey-snips
|
**Speaker Verification**
**Speaker Diarization**
| Task |
Dataset |
Model Type |
Example |
| Speaker Diarization |
AMI |
ECAPA-TDNN + AHC / SC |
ecapa-tdnn-ami
|
**Punctuation Restoration**
| Task |
Dataset |
Model Type |
Example |
| Punctuation Restoration |
IWSLT2012_zh |
Ernie Linear |
iwslt2012-punc0
|
## Documents
Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas.
- [Installation](./docs/source/install.md)
- [Quick Start](#quickstart)
- [Some Demos](./demos/README.md)
- Tutorials
- [Automatic Speech Recognition](./docs/source/asr/quick_start.md)
- [Introduction](./docs/source/asr/models_introduction.md)
- [Data Preparation](./docs/source/asr/data_preparation.md)
- [Ngram LM](./docs/source/asr/ngram_lm.md)
- [Text-to-Speech](./docs/source/tts/quick_start.md)
- [Introduction](./docs/source/tts/models_introduction.md)
- [Advanced Usage](./docs/source/tts/advanced_usage.md)
- [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md)
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- Speaker Verification
- [Audio Searching](./demos/audio_searching/README.md)
- [Speaker Verification](./demos/speaker_verification/README.md)
- [Audio Classification](./demos/audio_tagging/README.md)
- [Speech Translation](./demos/speech_translation/README.md)
- [Speech Server](./demos/speech_server/README.md)
- [Released Models](./docs/source/released_model.md)
- [Speech-to-Text](#SpeechToText)
- [Text-to-Speech](#TextToSpeech)
- [Audio Classification](#AudioClassification)
- [Speaker Verification](#SpeakerVerification)
- [Speaker Diarization](#SpeakerDiarization)
- [Punctuation Restoration](#PunctuationRestoration)
- [Community](#Community)
- [Welcome to contribute](#contribution)
- [License](#License)
The Text-to-Speech module was originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and is now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components.
## ⭐ Examples
- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.**
- [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html)
- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.**
## Citation
To cite PaddleSpeech for research, please use the following format.
```text
@inproceedings{zhang2022paddlespeech,
title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit},
author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang},
booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations},
year = {2022},
publisher = {Association for Computational Linguistics},
}
@InProceedings{pmlr-v162-bai22d,
title = {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing},
author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pages = {1399--1411},
year = {2022},
volume = {162},
series = {Proceedings of Machine Learning Research},
month = {17--23 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v162/bai22d/bai22d.pdf},
url = {https://proceedings.mlr.press/v162/bai22d.html},
}
@inproceedings{zheng2021fused,
title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation},
author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang},
booktitle={International Conference on Machine Learning},
pages={12736--12746},
year={2021},
organization={PMLR}
}
```
## Contribute to PaddleSpeech
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we would highly appreciate it if you are willing to contribute to this project!
### Contributors
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
- Many thanks to [david-95](https://github.com/david-95) for fixing the multi-punctuation bug, contributing multiple programs and data, and adding [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving TTS Chinese Frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function.
- Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model.
- Many thanks to [kslz](https://github.com/kslz) for supplementary Chinese documents.
- Many thanks to [awmmmm](https://github.com/awmmmm) for contributing fastspeech2 aishell3 conformer pretrained model.
- Many thanks to [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) for developing a dubbing tool with GUI based on PaddleSpeech TTS model.
- Many thanks to [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) for developing a GUI tool based on PaddleSpeech TTS and code for making datasets from videos based on PaddleSpeech ASR.
- Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot, which is able to speak and listen thanks to PaddleSpeech.
- Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR.
- Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services.
- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the python3.9 prebuilt wheel for PaddleSpeech installation in Windows without Visual Studio.
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
- Many thanks to [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) for converting audio to text based on FastAPI and PaddleSpeech.
- Many thanks to [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) for QQ bot based on PaddleSpeech TTS.
## License
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Stargazers over time
[](https://starchart.cc/PaddlePaddle/PaddleSpeech)
================================================
FILE: README_cn.md
================================================
(简体中文|[English](./README.md))
------------------------------------------------------------------------------------
**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下:
**PaddleSpeech** 荣获 [NAACL2022 Best Demo Award](https://2022.naacl.org/blog/best-demo-award/), 请访问 [Arxiv](https://arxiv.org/abs/2205.12007) 论文。
### 效果展示
##### 语音识别
| 输入音频 |
识别结果 |

|
I knocked at the door on the ancient side of the building. |

|
我认为跑步最重要的就是给我带来了身体健康。 |
##### 语音翻译 (英译中)
| 输入音频 |
翻译结果 |

|
我 在 这栋 建筑 的 古老 门上 敲门。 |
##### 语音合成
| 输入文本 |
合成音频 |
| Life was like a box of chocolates, you never know what you're gonna get. |

|
| 早上好,今天是2020/10/29,最低温度是-3°C。 |

|
| 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 |

|
| 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 |

|
| 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 |

|
| 各个国家有各个国家嘅国歌 |

|
更多合成音频,可以参考 [PaddleSpeech 语音合成音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)。
##### 标点恢复
| 输入文本 |
输出文本 |
| 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 |
今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。 |
### 特性
本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括
- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。
- 🏆 **对标 SoTA**: 提供了高速、轻量级模型,且借鉴了最前沿的技术。
- 🏆 **流式 ASR 和 TTS 系统**:工业级的端到端流式识别、流式合成系统。
- 💯 **基于规则的中文前端**: 我们的前端包含文本正则化和字音转换(G2P)。此外,我们使用自定义语言规则来适应中文语境。
- **多种工业界以及学术界主流功能支持**:
- 🛎️ 典型音频任务: 本工具包提供了音频任务如音频分类、语音翻译、自动语音识别、文本转语音、语音合成、声纹识别、KWS等任务的实现。
- 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
### 近期更新
- 🎉 2025.09.01: 新增 [Whisper large v3 与 turbo 模型](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/whisper).
- 🤗 2025.08.11: 新增 [流式中英混合 tal_cs 识别模型](./examples/tal_cs/asr1/).
- 👑 2023.05.31: 新增 [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), 基于WavLM的英语识别微调,使用LibriSpeech数据集
- 🎉 2023.05.18: 新增 [Squeezeformer](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr1), 使用Squeezeformer进行训练,使用Aishell数据集
- 👑 2023.05.04: 新增 [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), 基于HuBERT的英语识别微调,使用LibriSpeech数据集
- ⚡ 2023.04.28: 修正 [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214), 配合PaddlePaddle2.5升级修改了0-d tensor的问题。
- 👑 2023.04.25: 新增 [U2 conformer 的 AMP 训练](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
- 👑 2023.04.06: 新增 [srt格式字幕生成功能](./demos/streaming_asr_server)。
- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。
- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。
- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
- 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
- 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
- 🎉 2022.12.02: 新增[端到端韵律预测全流程](./examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
- 🎉 2022.11.30: 新增 [TTS Android 部署示例](./demos/TTSAndroid)。
- 🤗 2022.11.28: PP-TTS 和 PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), 支持多种语言的识别与翻译。
- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](./demos/speech_ssl), 支持 ASR 和特征提取。
- 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。
- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](./speechx/examples/u2pp_ol/wenetspeech)。
- 👑 2022.11.01: [中英文混合 TTS](./examples/zh_en_tts/tts3) 新增 [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) 模块。
- 🔥 2022.10.26: TTS 新增[韵律预测](./examples/other/rhy)功能。
- 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
- 👑 2022.10.11: 新增 [Wav2vec2ASR-en](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。
- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 到 [PaddleSpeech 网页应用](./demos/speech_web)。
- ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。
- ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。
- 🔥 2022.08.22: 新增 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 模型: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat)。
- 🔥 2022.08.15: 将 [g2pW](https://github.com/GitYCC/g2pW) 引入 TTS 中文文本前端。
- 🔥 2022.08.09: 发布[中英文混合 TTS](./examples/zh_en_tts/tts3)。
- ⚡ 2022.08.03: TTS CLI 新增 ONNXRuntime 推理方式。
- 🎉 2022.07.18: 发布 VITS 模型: [VITS-csmsc](./examples/csmsc/vits)、[VITS-aishell3](./examples/aishell3/vits)、[VITS-VC](./examples/aishell3/vits-vc)。
- 🎉 2022.06.22: 所有 TTS 模型支持了 ONNX 格式。
- 🍀 2022.06.17: 新增 [PaddleSpeech 网页应用](./demos/speech_web)。
- 👑 2022.05.13: PaddleSpeech 发布 [PP-ASR](./docs/source/asr/PPASR_cn.md) 流式语音识别系统、[PP-TTS](./docs/source/tts/PPTTS_cn.md) 流式语音合成系统、[PP-VPR](docs/source/vpr/PPVPR_cn.md) 全链路声纹识别系统
- 👏🏻 2022.05.06: PaddleSpeech Streaming Server 上线!覆盖了语音识别(标点恢复、时间戳)和语音合成。
- 👏🏻 2022.05.06: PaddleSpeech Server 上线!覆盖了声音分类、语音识别、语音合成、声纹识别,标点恢复。
- 👏🏻 2022.03.28: PaddleSpeech CLI 覆盖声音分类、语音识别、语音翻译(英译中)、语音合成和声纹验证。
- 👏🏻 2021.12.10: PaddleSpeech CLI 支持语音分类, 语音识别, 语音翻译(英译中)和语音合成。
### 🔥 加入技术交流群获取入群福利
- 3 日直播课链接: 深度解读 【一句话语音合成】【小样本语音合成】【定制化语音识别】语音交互技术
- 20G 学习大礼包:视频课程、前沿论文与学习资料
微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
## 安装
我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。
### 相关依赖
+ gcc >= 4.8.5
+ paddlepaddle
+ python >= 3.8
+ linux(推荐), mac, windows
PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。
```shell
pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
```
你也可以安装指定版本的paddlepaddle,或者安装 develop 版本。
```bash
# 安装2.4.1版本. 注意:2.4.1只是一个示例,请按照对paddlepaddle的最小依赖进行选择。
pip install paddlepaddle==2.4.1 -i https://mirror.baidu.com/pypi/simple
# 安装 develop 版本
pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
```
PaddleSpeech 快速安装方式有两种,一种是 pip 安装,一种是源码编译(推荐)。
### pip 安装
```shell
pip install pytest-runner
pip install paddlespeech
```
### 源码编译
```shell
git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech
pip install pytest-runner
pip install .
# 如果需要在可编辑模式下安装,需要使用 --use-pep517,命令如下
# pip install -e . --use-pep517
```
更多关于安装问题,如 conda 环境,librosa 依赖的系统库,gcc 环境问题,kaldi 安装等,可以参考这篇[安装文档](docs/source/install_cn.md),如安装上遇到问题可以在 [#2150](https://github.com/PaddlePaddle/PaddleSpeech/issues/2150) 上留言以及查找相关问题
## 快速开始
安装完成后,开发者可以通过命令行或者 Python 快速开始,命令行模式下改变 `--input` 可以尝试用自己的音频或文本测试,支持 16k wav 格式音频。
你也可以在 `aistudio` 中快速体验 👉🏻[一键预测,快速上手 Speech 开发任务](https://aistudio.baidu.com/aistudio/projectdetail/4353348?sUid=2470186&shared=1&ts=1660878142250)。
测试音频示例下载
```shell
wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.cdn.bcebos.com/PaddleAudio/en.wav
```
### 语音识别
(点击可展开)开源中文语音识别
命令行一键体验
```shell
paddlespeech asr --lang zh --input zh.wav
```
Python API 一键预测
```python
>>> from paddlespeech.cli.asr.infer import ASRExecutor
>>> asr = ASRExecutor()
>>> result = asr(audio_file="zh.wav")
>>> print(result)
我认为跑步最重要的就是给我带来了身体健康
```
### 语音合成
开源中文语音合成
输出 24k 采样率wav格式音频
命令行一键体验
```shell
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
Python API 一键预测
```python
>>> from paddlespeech.cli.tts.infer import TTSExecutor
>>> tts = TTSExecutor()
>>> tts(text="今天天气十分不错。", output="output.wav")
```
- 语音合成的 web demo 已经集成进了 [Huggingface Spaces](https://huggingface.co/spaces). 请参考: [TTS Demo](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS)
### 声音分类
适配多场景的开放领域声音分类工具
基于 AudioSet 数据集 527 个类别的声音分类模型
命令行一键体验
```shell
paddlespeech cls --input zh.wav
```
Python API 一键预测
```python
>>> from paddlespeech.cli.cls.infer import CLSExecutor
>>> cls = CLSExecutor()
>>> result = cls(audio_file="zh.wav")
>>> print(result)
Speech 0.9027186632156372
```
### 声纹提取
工业级声纹提取工具
命令行一键体验
```shell
paddlespeech vector --task spk --input zh.wav
```
Python API 一键预测
```python
>>> from paddlespeech.cli.vector import VectorExecutor
>>> vec = VectorExecutor()
>>> result = vec(audio_file="zh.wav")
>>> print(result) # 187维向量
[ -0.19083306 9.474295 -14.122263 -2.0916545 0.04848729
4.9295826 1.4780062 0.3733844 10.695862 3.2697146
-4.48199 -0.6617882 -9.170393 -11.1568775 -1.2358263 ...]
```
### 标点恢复
一键恢复文本标点,可与ASR模型配合使用
命令行一键体验
```shell
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
Python API 一键预测
```python
>>> from paddlespeech.cli.text.infer import TextExecutor
>>> text_punc = TextExecutor()
>>> result = text_punc(text="今天的天气真不错啊你下午有空吗我想约你一起去吃饭")
今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
```
### 语音翻译
端到端英译中语音翻译工具
使用预编译的 kaldi 相关工具,只支持在 Ubuntu 系统中体验
命令行一键体验
```shell
paddlespeech st --input en.wav
```
Python API 一键预测
```python
>>> from paddlespeech.cli.st.infer import STExecutor
>>> st = STExecutor()
>>> result = st(audio_file="en.wav")
['我 在 这栋 建筑 的 古老 门上 敲门 。']
```
## 快速使用服务
安装完成后,开发者可以通过命令行一键启动语音识别,语音合成,音频分类等多种服务。
你可以在 AI Studio 中快速体验:[SpeechServer 一键部署](https://aistudio.baidu.com/aistudio/projectdetail/4354592?sUid=2470186&shared=1&ts=1660878208266)
**启动服务**
```shell
paddlespeech_server start --config_file ./demos/speech_server/conf/application.yaml
```
**访问语音识别服务**
```shell
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
**访问语音合成服务**
```shell
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
```
**访问音频分类服务**
```shell
paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
```
更多服务相关的命令行使用信息,请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server)
## 快速使用流式服务
开发者可以尝试 [流式 ASR](./demos/streaming_asr_server/README.md) 和 [流式 TTS](./demos/streaming_tts_server/README.md) 服务.
**启动流式 ASR 服务**
```
paddlespeech_server start --config_file ./demos/streaming_asr_server/conf/application.yaml
```
**访问流式 ASR 服务**
```
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
**启动流式 TTS 服务**
```
paddlespeech_server start --config_file ./demos/streaming_tts_server/conf/tts_online_application.yaml
```
**访问流式 TTS 服务**
```
paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --protocol http --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
```
更多信息参看: [流式 ASR](./demos/streaming_asr_server/README.md) 和 [流式 TTS](./demos/streaming_tts_server/README.md)
## 模型列表
PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。
PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识别语言模型和语音翻译, 详情如下:
PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声学模型和声码器。声学模型和声码器模型如下:
**声音分类**
**语音唤醒**
**声纹识别**
**说话人日志**
**标点恢复**
## 教程文档
对于 PaddleSpeech 的所关注的任务,以下指南有助于帮助开发者快速入门,了解语音相关核心思想。
- [下载安装](./docs/source/install_cn.md)
- [快速开始](#快速开始)
- Notebook基础教程
- [声音分类](./docs/tutorial/cls/cls_tutorial.ipynb)
- [语音识别](./docs/tutorial/asr/tutorial_transformer.ipynb)
- [语音翻译](./docs/tutorial/st/st_tutorial.ipynb)
- [声音合成](./docs/tutorial/tts/tts_tutorial.ipynb)
- [示例Demo](./demos/README.md)
- 进阶文档
- [语音识别自定义训练](./docs/source/asr/quick_start.md)
- [简介](./docs/source/asr/models_introduction.md)
- [数据准备](./docs/source/asr/data_preparation.md)
- [Ngram 语言模型](./docs/source/asr/ngram_lm.md)
- [语音合成自定义训练](./docs/source/tts/quick_start.md)
- [简介](./docs/source/tts/models_introduction.md)
- [进阶用法](./docs/source/tts/advanced_usage.md)
- [中文文本前端](./docs/source/tts/zh_text_frontend.md)
- [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- 声纹识别
- [声纹识别](./demos/speaker_verification/README_cn.md)
- [音频检索](./demos/audio_searching/README_cn.md)
- [声音分类](./demos/audio_tagging/README_cn.md)
- [语音翻译](./demos/speech_translation/README_cn.md)
- [服务化部署](./demos/speech_server/README_cn.md)
- [模型列表](#模型列表)
- [语音识别](#语音识别模型)
- [语音合成](#语音合成模型)
- [声音分类](#声音分类模型)
- [声纹识别](#声纹识别模型)
- [说话人日志](#说话人日志模型)
- [标点恢复](#标点恢复模型)
- [技术交流群](#技术交流群)
- [欢迎贡献](#欢迎贡献)
- [License](#License)
语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。
## ⭐ 应用案例
- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。**

- [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html)
- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。**
## 引用
要引用 PaddleSpeech 进行研究,请使用以下格式进行引用。
```text
@InProceedings{pmlr-v162-bai22d,
title = {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing},
author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pages = {1399--1411},
year = {2022},
volume = {162},
series = {Proceedings of Machine Learning Research},
month = {17--23 Jul},
publisher = {PMLR},
pdf = {https://proceedings.mlr.press/v162/bai22d/bai22d.pdf},
url = {https://proceedings.mlr.press/v162/bai22d.html},
}
@inproceedings{zhang2022paddlespeech,
title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit},
author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang},
booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations},
year = {2022},
publisher = {Association for Computational Linguistics},
}
@inproceedings{zheng2021fused,
title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation},
author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang},
booktitle={International Conference on Machine Learning},
pages={12736--12746},
year={2021},
organization={PMLR}
}
```
## 参与 PaddleSpeech 的开发
热烈欢迎您在 [Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) 中提交问题,并在 [Issues](https://github.com/PaddlePaddle/PaddleSpeech/issues) 中指出发现的 bug。此外,我们非常希望您参与到 PaddleSpeech 的开发中!
### 贡献者
## 致谢
- 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。
- 非常感谢 [david-95](https://github.com/david-95) 修复 TTS 句尾多标点符号出错的问题,贡献补充多条程序和数据。为 TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
- 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。
- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。
- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。
- 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。
- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
- 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。
- 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。
- 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。
- 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。
- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对 PaddleSpeech 在 Windows 下的安装提供了无需 Visual Studio,基于 python3.9 的预编译依赖安装包。
- 非常感谢 [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) 利用 FastAPI 实现 PaddleSpeech 语音转文字,文件上传、分割、转换进度显示、后台更新任务并以 csv 格式输出。
- 非常感谢 [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) 基于 PaddleSpeech TTS 的 QQ Bot 项目。
此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。
## License
PaddleSpeech 在 [Apache-2.0 许可](./LICENSE) 下提供。
## Stargazers over time
[![Stargazers over time](https://starchart.cc/PaddlePaddle/PaddleSpeech.svg)](https://starchart.cc/PaddlePaddle/PaddleSpeech)
================================================
FILE: audio/CMakeLists.txt
================================================
# Top-level build script for the paddleaudio native extension.
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)

# Use compiler ID "AppleClang" instead of "Clang" for XCode.
# Not setting this sometimes makes XCode C compiler gets detected as "Clang",
# even when the C++ one is detected as "AppleClang".
cmake_policy(SET CMP0010 NEW)
cmake_policy(SET CMP0025 NEW)

# Suppress warning flags in default MSVC configuration. It's not
# mandatory that we do this (and we don't if cmake is old), but it's
# nice when it's possible, and it's possible on our Windows configs.
if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
  cmake_policy(SET CMP0092 NEW)
endif()

project(paddleaudio)

# check and set CMAKE_CXX_STANDARD
string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
if(env_cxx_standard GREATER -1)
  # message() concatenates its arguments verbatim, so the first fragment must
  # end with a space to keep the two sentences apart.
  message(
    WARNING "C++ standard version definition detected in environment variable. "
            "paddleaudio requires -std=c++14. Please remove -std=c++ settings in your environment.")
endif()

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_C_STANDARD 11)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Options
option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_MAD "Enable libmad" ON)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)

# cmake: make the project's own modules (cmake/ and cmake/external/) findable
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external")

# fc_patch dir: downloaded third-party sources live in <source>/fc_patch
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(THIRD_PARTY_PATH ${fc_patch})

# PY_VERSION and PYTHON_INCLUDE_DIR are not set in this file; they are
# presumably supplied on the cmake command line -- TODO confirm.
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
include(cmake/pybind.cmake)
include_directories(${PYTHON_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/)

# packages
find_package(Python3 COMPONENTS Interpreter Development)

# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -O0 -Wall -g")
add_subdirectory(paddleaudio)

# Summary
include(cmake/summary.cmake)
onnx_print_configuration_summary()
================================================
FILE: audio/README.md
================================================
# PaddleAudio
安装方式: pip install paddleaudio
目前支持的平台:Linux, Mac, Windows
## Environment
## Build wheel
cmd: python setup.py bdist_wheel
Linux test build whl environment:
* os - Ubuntu 16.04.7 LTS
* gcc/g++ - 8.2.0
* cmake - 3.18.0 (need install)
MAC:test build whl environment:
* os
* gcc/g++ 12.2.0
* cpu Intel Xeon E5 x86_64
Windows:
the paddleaudio C++ extension lib (sox io, kaldi native fbank) is not supported
================================================
FILE: audio/cmake/FindGFortranLibs.cmake
================================================
#.rst:
# FindGFortranLibs
# --------
# https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
# https://enccs.github.io/cmake-workshop/cxx-fortran/
#
# Find gcc Fortran compiler & library paths
#
# The module defines the following variables:
#
# ::
#
#
# GFORTRANLIBS_FOUND - true if system has gfortran
# LIBGFORTRAN_LIBRARIES - path to libgfortran
# LIBQUADMATH_LIBRARIES - path to libquadmath
# GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
# GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
# LIBGOMP_LIBRARIES - path to libgomp
# LIBGOMP_INCLUDE_DIR - directory containing omp.h header
# GFORTRAN_VERSION_STRING - version of gfortran found
#
# Honour `find_package(GFortranLibs QUIET)`: CMake exposes the QUIET request
# as <PackageName>_FIND_QUIETLY, and this module's package name is
# GFortranLibs (the previous LIBIOMP_ prefix belonged to a different module).
set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})

if(NOT CMAKE_REQUIRED_QUIET)
  message(STATUS "Looking for gfortran related libraries...")
endif()

# The Fortran compiler must be enabled before CMAKE_Fortran_COMPILER* is read.
enable_language(Fortran)
# Probe a GNU Fortran compiler for its version, target triple, install prefixes
# and library directory, then locate libgfortran / libquadmath / libgomp.
if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
  # Basically, call "gfortran -v" to dump compiler info to the string
  # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
  message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
  # The output is captured through ERROR_VARIABLE (stderr).
  execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
    GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)

  # For debugging
  message(STATUS "'gfortran -v' returned:")
  message(STATUS "${GFORTRAN_VERBOSE_STR}")

  # Detect gfortran version
  string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
  string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
  message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
  unset(GFORTRAN_VER_STR)

  set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
  set(REPLACE_REGEX "([^\t\n ]+)")

  # Find architecture for compiler
  string(REGEX MATCH "Target: [^\t\n ]+"
    GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
  message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
  string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
    GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
  message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
  unset(GFORTRAN_ARCH_STR)

  # Find install prefix, if it exists; if not, use default
  string(REGEX MATCH "--prefix=[^\t\n ]+[\t\n ]+"
    GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_PREFIX_STR)
    message(STATUS "Detected default gfortran prefix")
    set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
  else()
    string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
      GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
  endif()
  message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
  unset(GFORTRAN_PREFIX_STR)

  # Find install exec-prefix, if it exists; if not, use default.
  # REGEX MATCH takes (regex, output variable, input); a stray replacement
  # argument here used to shift the output variable, so the match result was
  # never stored and the default branch was always taken.
  string(REGEX MATCH "--exec-prefix=[^\t\n ]+[\t\n ]+"
    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_EXEC_PREFIX_STR)
    message(STATUS "Detected default gfortran exec-prefix")
    set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
  else()
    string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
      GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
  endif()
  message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
  unset(GFORTRAN_EXEC_PREFIX_STR)

  # Find library directory and include directory, if library directory specified
  string(REGEX MATCH "--libdir=[^\t\n ]+"
    GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_LIB_DIR_STR)
    message(STATUS "Found --libdir flag -- not found")
    message(STATUS "Using default gfortran library & include directory paths")
    # The prefix match above keeps trailing whitespace; strip it before use.
    string(STRIP ${GFORTRAN_PREFIX_DIR} TMPLIBDIR)
    set(GFORTRAN_LIBRARIES_DIR "${TMPLIBDIR}/lib64")
    set(GFORTRAN_INCLUDE_DIR "${TMPLIBDIR}/include")
  else()
    message(STATUS "Found --libdir flag -- yes")
    string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
      GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
  endif()
  message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
  message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
  unset(GFORTRAN_LIB_DIR_STR)

  # There are lots of other build options for gcc & gfortran. For now, the
  # options implemented above should cover a lot of common use cases.

  # Clean up by deleting the output string from "gfortran -v"
  unset(GFORTRAN_VERBOSE_STR)

  # Find paths for libgfortran, libquadmath, libgomp
  # libgomp needed for OpenMP support without Clang
  find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
    HINTS ${GFORTRAN_LIBRARIES_DIR})
  find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
    HINTS ${GFORTRAN_LIBRARIES_DIR})
  find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
    HINTS ${GFORTRAN_LIBRARIES_DIR})

  # Find OpenMP headers
  find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
else()
  message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
endif()
# Final step of the find module: report the result through the standard
# find_package machinery, hide the cache entries, and echo what was found.
include(FindPackageHandleStandardArgs)
# Required: libgfortran, libquadmath, path for gfortran libraries
# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
find_package_handle_standard_args(GFortranLibs
REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
VERSION_VAR GFORTRAN_VERSION_STRING)
# NOTE(review): this tests the upper-case GFORTRANLIBS_FOUND spelling; confirm
# that the CMake versions in use still define it alongside GFortranLibs_FOUND.
if(GFORTRANLIBS_FOUND)
message(STATUS "Looking for gfortran libraries -- found")
message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
else()
message(STATUS "Looking for gfortran libraries -- not found")
endif()
# Keep the discovered paths out of the default (non-advanced) cache view.
mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
# FindGFortranLIBS.cmake ends here
# Unconditional dump of every result variable (printed even when the
# "Looking for..." message earlier in this module was suppressed).
message(STATUS LIBGFORTRAN_LIBRARIES= ${LIBGFORTRAN_LIBRARIES})
message(STATUS LIBQUADMATH_LIBRARIES= ${LIBQUADMATH_LIBRARIES})
message(STATUS LIBGOMP_LIBRARIES= ${LIBGOMP_LIBRARIES})
message(STATUS LIBGOMP_INCLUDE_DIR= ${LIBGOMP_INCLUDE_DIR})
message(STATUS GFORTRAN_LIBRARIES_DIR= ${GFORTRAN_LIBRARIES_DIR})
message(STATUS GFORTRAN_INCLUDE_DIR= ${GFORTRAN_INCLUDE_DIR})
================================================
FILE: audio/cmake/external/openblas.cmake
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Builds OpenBLAS as an external project and exposes it to the rest of the
# build. Non-Windows hosts build the release archive with `make`; Windows
# builds from git with CMake and clang-cl.
include(ExternalProject)

set(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas)
set(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
set(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git)
set(CBLAS_TAG v0.3.10)

if(NOT WIN32)
  set(CBLAS_LIBRARIES
      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
      CACHE FILEPATH "openblas library." FORCE)
  set(CBLAS_INC_DIR
      "${CBLAS_INSTALL_DIR}/include"
      CACHE PATH "openblas include directory." FORCE)
  set(OPENBLAS_CC
      "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
  if(APPLE)
    set(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
  endif()

  set(OPTIONAL_ARGS "")
  set(COMMON_ARGS "")
  if(APPLE)
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
      set(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
    endif()
    set(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1)
  endif()

  # NOTE(review): NPROC is not set in this file; if it is empty the build
  # command degenerates to a bare `make -j` -- confirm it is defined by the caller.
  ExternalProject_Add(
    OPENBLAS
    URL "https://paddleaudio.bj.bcebos.com/build/OpenBLAS-0.3.10.zip"
    GIT_SHALLOW YES
    DOWNLOAD_DIR ${CBLAS_PREFIX_DIR}
    SOURCE_DIR ${CBLAS_PREFIX_DIR}
    INSTALL_DIR ${CBLAS_INSTALL_DIR}
    BUILD_IN_SOURCE 1
    BUILD_COMMAND make -j${NPROC} ${COMMON_ARGS} ${OPTIONAL_ARGS}
    # Install into the same directory CBLAS_LIBRARIES / the imported target
    # below point at; an empty PREFIX would install relative to "/".
    INSTALL_COMMAND make install PREFIX=${CBLAS_INSTALL_DIR}
    UPDATE_COMMAND ""
    CONFIGURE_COMMAND ""
    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})

  ExternalProject_Get_Property(OPENBLAS INSTALL_DIR)
  set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR})

  # Imported static target that becomes valid once OPENBLAS has been built.
  add_library(openblas STATIC IMPORTED)
  add_dependencies(openblas OPENBLAS)
  set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran)
  set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a)

  link_directories(${OpenBLAS_INSTALL_PREFIX}/lib)
  include_directories(${OpenBLAS_INSTALL_PREFIX}/include)

  set(OPENBLAS_LIBRARIES
      ${OpenBLAS_INSTALL_PREFIX}/lib/libopenblas.a
  )

  # Interface wrapper carrying include path and link line for consumers.
  add_library(libopenblas INTERFACE)
  add_dependencies(libopenblas openblas)
  target_include_directories(libopenblas INTERFACE ${OpenBLAS_INSTALL_PREFIX}/include/openblas)
  target_link_libraries(libopenblas INTERFACE ${OPENBLAS_LIBRARIES})
else()
  set(CBLAS_LIBRARIES
      "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
      CACHE FILEPATH "openblas library." FORCE)
  set(CBLAS_INC_DIR
      "${CBLAS_INSTALL_DIR}/include/openblas"
      CACHE PATH "openblas include directory." FORCE)

  ExternalProject_Add(
    extern_openblas
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY ${CBLAS_REPOSITORY}
    GIT_TAG ${CBLAS_TAG}
    PREFIX ${CBLAS_PREFIX_DIR}
    INSTALL_DIR ${CBLAS_INSTALL_DIR}
    BUILD_IN_SOURCE 0
    UPDATE_COMMAND ""
    CMAKE_ARGS -DCMAKE_C_COMPILER=clang-cl
               -DCMAKE_CXX_COMPILER=clang-cl
               -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
               -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR}
               -DCMAKE_BUILD_TYPE=Release #${THIRD_PARTY_BUILD_TYPE}
               -DCMAKE_MT=mt
               -DUSE_THREAD=OFF
               -DBUILD_WITHOUT_LAPACK=NO
               -DCMAKE_Fortran_COMPILER=flang
               -DNOFORTRAN=0
               -DDYNAMIC_ARCH=ON
               #${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS
      -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR}
      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
      -DCMAKE_BUILD_TYPE:STRING=Release #${THIRD_PARTY_BUILD_TYPE}
    # ninja need to know where openblas.lib comes from
    BUILD_BYPRODUCTS ${CBLAS_LIBRARIES})

  set(OPENBLAS_SHARED_LIB
      ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX})

  add_library(openblas INTERFACE)
  add_dependencies(openblas extern_openblas)
  include_directories(${CBLAS_INC_DIR})
  link_libraries(${CBLAS_LIBRARIES})
endif()
================================================
FILE: audio/cmake/pybind.cmake
================================================
#the pybind11 is from:https://github.com/pybind/pybind11
# Copyright (c) 2016 Wenzel Jakob , All rights reserved.
# Fetch pybind11 v2.10.0 into FETCHCONTENT_BASE_DIR (download once, unpack
# once) and put its headers on the include path.
SET(PYBIND_ZIP "v2.10.0.zip")
SET(LOCAL_PYBIND_ZIP ${FETCHCONTENT_BASE_DIR}/${PYBIND_ZIP})
SET(PYBIND_SRC ${FETCHCONTENT_BASE_DIR}/pybind11)
SET(DOWNLOAD_URL "https://paddleaudio.bj.bcebos.com/build/v2.10.0.zip")
SET(PYBIND_TIMEOUT 600 CACHE STRING "Timeout in seconds when downloading pybind.")

IF(NOT EXISTS ${LOCAL_PYBIND_ZIP})
  FILE(DOWNLOAD ${DOWNLOAD_URL}
       ${LOCAL_PYBIND_ZIP}
       TIMEOUT ${PYBIND_TIMEOUT}
       STATUS ERR
       SHOW_PROGRESS
  )
  # STATUS yields a two-element list (numeric code; message). Compare the
  # code explicitly instead of relying on how the whole list parses as a number.
  LIST(GET ERR 0 PYBIND_DOWNLOAD_CODE)
  IF(PYBIND_DOWNLOAD_CODE EQUAL 0)
    MESSAGE(STATUS "download pybind success")
  ELSE()
    # Remove any partial archive; otherwise the EXISTS guard above would skip
    # the download on the next configure and the unpack step would keep failing.
    FILE(REMOVE ${LOCAL_PYBIND_ZIP})
    MESSAGE(FATAL_ERROR "download pybind fail")
  ENDIF()
ENDIF()

IF(NOT EXISTS ${PYBIND_SRC})
  EXECUTE_PROCESS(
    COMMAND ${CMAKE_COMMAND} -E tar xfz ${LOCAL_PYBIND_ZIP}
    WORKING_DIRECTORY ${FETCHCONTENT_BASE_DIR}
    RESULT_VARIABLE tar_result
  )
  # Check the extraction result before touching its output, and compare
  # numerically (a regex match on "0" would also accept codes such as 10).
  IF(tar_result EQUAL 0)
    FILE(RENAME ${FETCHCONTENT_BASE_DIR}/pybind11-2.10.0 ${PYBIND_SRC})
    MESSAGE(STATUS "unzip pybind success")
  ELSE()
    MESSAGE(FATAL_ERROR "unzip pybind fail")
  ENDIF()
ENDIF()

include_directories(${PYBIND_SRC}/include)
================================================
FILE: audio/cmake/summary.cmake
================================================
# SPDX-License-Identifier: Apache-2.0
# Prints accumulated ONNX configuration summary
# Prints a block of STATUS lines describing the current configuration.
# Takes no arguments and only reads variables; any variable that is not
# defined at call time is printed as an empty value.
function (onnx_print_configuration_summary)
message(STATUS "")
message(STATUS "******** Summary ********")
message(STATUS " CMake version : ${CMAKE_VERSION}")
message(STATUS " CMake command : ${CMAKE_COMMAND}")
message(STATUS " System : ${CMAKE_SYSTEM_NAME}")
message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}")
message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}")
message(STATUS " Build type : ${CMAKE_BUILD_TYPE}")
# Compile definitions are read from the project's top-level source directory.
get_directory_property(tmp DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS)
message(STATUS " Compile definitions : ${tmp}")
message(STATUS " CMAKE_PREFIX_PATH : ${CMAKE_PREFIX_PATH}")
message(STATUS " CMAKE_INSTALL_PREFIX : ${CMAKE_INSTALL_PREFIX}")
message(STATUS " CMAKE_MODULE_PATH : ${CMAKE_MODULE_PATH}")
message(STATUS "")
# NOTE(review): the ONNX*/Protobuf* variables below are not set by any of the
# paddleaudio CMake files visible alongside this one; they presumably come
# from the project this summary was adapted from -- confirm they are wanted.
message(STATUS " ONNX version : ${ONNX_VERSION}")
message(STATUS " ONNX NAMESPACE : ${ONNX_NAMESPACE}")
message(STATUS " ONNX_USE_LITE_PROTO : ${ONNX_USE_LITE_PROTO}")
message(STATUS " USE_PROTOBUF_SHARED_LIBS : ${ONNX_USE_PROTOBUF_SHARED_LIBS}")
message(STATUS " Protobuf_USE_STATIC_LIBS : ${Protobuf_USE_STATIC_LIBS}")
message(STATUS " ONNX_DISABLE_EXCEPTIONS : ${ONNX_DISABLE_EXCEPTIONS}")
message(STATUS " ONNX_WERROR : ${ONNX_WERROR}")
message(STATUS " ONNX_BUILD_TESTS : ${ONNX_BUILD_TESTS}")
message(STATUS " ONNX_BUILD_BENCHMARKS : ${ONNX_BUILD_BENCHMARKS}")
message(STATUS " ONNXIFI_DUMMY_BACKEND : ${ONNXIFI_DUMMY_BACKEND}")
message(STATUS " ONNXIFI_ENABLE_EXT : ${ONNXIFI_ENABLE_EXT}")
message(STATUS "")
message(STATUS " Protobuf compiler : ${PROTOBUF_PROTOC_EXECUTABLE}")
message(STATUS " Protobuf includes : ${PROTOBUF_INCLUDE_DIRS}")
message(STATUS " Protobuf libraries : ${PROTOBUF_LIBRARIES}")
message(STATUS " BUILD_ONNX_PYTHON : ${BUILD_ONNX_PYTHON}")
# NOTE(review): audio/CMakeLists.txt calls find_package(Python3 ...), which
# defines Python3_* result variables; the Python_* names read here would need
# find_package(Python ...) -- verify which one the build actually provides.
message(STATUS " Python version : ${Python_VERSION}")
message(STATUS " Python executable : ${Python_EXECUTABLE}")
message(STATUS " Python includes : ${Python_INCLUDE_DIR}")
message(STATUS " Python libraries : ${Python_LIBRARY}")
message(STATUS " PYBIND11 : ${pybind11_FOUND}")
message(STATUS " Pybind11 version : ${pybind11_VERSION}")
message(STATUS " Pybind11 include : ${pybind11_INCLUDE_DIR}")
message(STATUS " Pybind11 includes : ${pybind11_INCLUDE_DIRS}")
message(STATUS " Pybind11 libraries : ${pybind11_LIBRARIES}")
endfunction()
================================================
FILE: audio/paddleaudio/CMakeLists.txt
================================================
# third_party is added before src, so whatever it defines is already visible
# when src is processed (presumably src depends on those targets -- TODO confirm).
add_subdirectory(third_party)
add_subdirectory(src)
================================================
FILE: audio/paddleaudio/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import _extension
from . import backends
from . import compliance
from . import datasets
from . import features
from . import functional
from . import metric
from . import sox_effects
from . import utils
================================================
FILE: audio/paddleaudio/_extension.py
================================================
import contextlib
import ctypes
import os
import sys
import types
import warnings
from pathlib import Path
from ._internal import module_utils as _mod_utils # noqa: F401
# Query `hasattr` only once.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(
    sys, 'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    Used while opening a shared library that carries custom operators. On
    interpreters without ``sys.getdlopenflags``/``sys.setdlopenflags`` the
    guard does nothing.

    The previous flags are restored even when the guarded block raises (for
    example when ``ctypes.CDLL`` fails), so a failed load cannot leave
    ``RTLD_GLOBAL`` switched on for later imports.
    """
    if not _SET_GLOBAL_FLAGS:
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        sys.setdlopenflags(old_flags)
def resolve_library_path(path: str) -> str:
    """Return the canonical form of ``path`` as computed by ``os.path.realpath``."""
    canonical = os.path.realpath(path)
    return canonical
class _Ops(types.ModuleType):
#__file__ = '_ops.py'
def __init__(self):
super(_Ops, self).__init__('paddleaudio.ops')
self.loaded_libraries = set()
def load_library(self, path):
"""
Loads a shared library from the given path into the current process.
This allows dynamically loading custom operators. For this,
you should compile your operator and
the static registration code into a shared library object, and then
call ``paddleaudio.ops.load_library('path/to/libcustom.so')`` to load the
shared object.
After the library is loaded, it is added to the
``paddleaudio.ops.loaded_libraries`` attribute, a set that may be inspected
for the paths of all libraries loaded using this function.
Args:
path (str): A path to a shared library to load.
"""
path = resolve_library_path(path)
with dl_open_guard():
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
# Import the shared library into the process, thus running its
# static (global) initialization code in order to register custom
# operators with the JIT.
ctypes.CDLL(path)
self.loaded_libraries.add(path)
_LIB_DIR = Path(__file__).parent / "lib"
def _get_lib_path(lib: str):
suffix = "pyd" if os.name == "nt" else "so"
path = _LIB_DIR / f"{lib}.{suffix}"
return path
def _load_lib(lib: str) -> bool:
    """Load the extension library ``lib`` from the package ``lib`` directory.

    Note:
        When ``paddleaudio`` is deployed in ``pex`` format the library file is
        not in its standard location. In that case ``libpaddleaudio`` is
        expected to be reachable through the dynamic loader's own search path,
        so that importing ``_paddleaudio`` lets the loader find it. This is
        why a missing file produces a warning and ``False`` rather than an
        error.

    Returns:
        bool:
            ``True`` if the library file exists and was loaded.
            ``False`` if the library file does not exist.

    Raises:
        Exception:
            If the file exists but loading fails, the exception raised by the
            underlying ``ctypes.CDLL`` call (typically ``OSError`` for a
            missing dynamic dependency) propagates unchanged. That failure is
            not recoverable here: the user has to install the dependency.
    """
    lib_file = _get_lib_path(lib)
    if lib_file.exists():
        ops.load_library(lib_file)
        return True
    warnings.warn("lib path is not exists:" + str(lib_file))
    return False
# Set to True once `_init_ffmpeg` has run to completion, so later calls return
# immediately.
_FFMPEG_INITIALIZED = False


def _init_ffmpeg():
    """Initialize the optional FFmpeg integration (at most once).

    Raises:
        RuntimeError: if the extension reports that FFmpeg is not available.
        ImportError: if loading the FFmpeg helper library raises ``OSError``.

    NOTE(review): as written this function cannot succeed:
      * the name ``paddleaudio`` is used but no ``import paddleaudio`` is
        visible among this module's imports, so the first use would raise
        ``NameError``;
      * ``_paddlleaudio`` / ``libpaddlleaudio_ffmpeg`` /
        ``paddllespeech.audio`` are spelled with a doubled "l", unlike the
        ``_paddleaudio`` / ``libpaddleaudio`` names used by
        ``_init_extension``.
    The only visible caller swallows every exception, so the failure is
    silent. Confirm the intended module names before relying on this path.
    """
    global _FFMPEG_INITIALIZED
    if _FFMPEG_INITIALIZED:
        return
    if not paddleaudio._paddlleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddlleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddlleaudio."
        )
    try:
        _load_lib("libpaddlleaudio_ffmpeg")
    except OSError as err:
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err
    import paddllespeech.audio._paddlleaudio_ffmpeg  # noqa
    paddleaudio._paddlleaudio.ffmpeg_init()
    # Lower the FFmpeg log level to 8 when it is currently higher than that.
    if paddleaudio._paddlleaudio.ffmpeg_get_log_level() > 8:
        paddleaudio._paddlleaudio.ffmpeg_set_log_level(8)
    _FFMPEG_INITIALIZED = True
def _init_extension():
    """Load the native ``_paddleaudio`` extension if it is present.

    Emits a warning and returns early when the extension module cannot be
    found or cannot be imported. FFmpeg initialization is attempted last and
    any failure there is ignored.
    """
    unavailable_msg = "paddleaudio C++ extension is not available. sox_io, sox_effect, kaldi raw feature is not supported!!!"

    if not _mod_utils.is_module_available("paddleaudio._paddleaudio"):
        warnings.warn(unavailable_msg)
        return

    _load_lib("libpaddleaudio")

    # Importing the module initializes the methods registered via PyBind11.
    # This has to happen after the base library is loaded.
    try:
        from paddleaudio import _paddleaudio  # noqa
    except Exception:
        warnings.warn(unavailable_msg)
        return

    # This runs as part of `import paddleaudio`, so an FFmpeg initialization
    # failure is deliberately ignored here. If the integration is not set up
    # properly, a detailed error is raised when client code tries to use the
    # dedicated feature.
    with contextlib.suppress(Exception):
        _init_ffmpeg()


# Module-level singleton used by `_load_lib`; it must exist before
# `_init_extension()` runs.
ops = _Ops()
_init_extension()
================================================
FILE: audio/paddleaudio/_internal/__init__.py
================================================
================================================
FILE: audio/paddleaudio/_internal/module_utils.py
================================================
import importlib.util
import platform
import warnings
from functools import wraps
from typing import Optional
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py with modification.
def is_module_available(*modules: str) -> bool:
    r"""Return whether every named module can be found, without importing it.

    This is generally safer than a try-catch block around ``import X``: it
    avoids third party libraries breaking assumptions of some of our tests,
    e.g., setting multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).

    Dotted names are supported. ``importlib.util.find_spec`` imports the
    parent package of a dotted name and raises ``ModuleNotFoundError`` when
    that parent is missing; this is reported as "not available" (``False``)
    instead of propagating.

    Args:
        *modules (str): Module names to look up. With no names the result is
            ``True``.

    Returns:
        bool: ``True`` only if a spec was found for every name.
    """
    for name in modules:
        try:
            if importlib.util.find_spec(name) is None:
                return False
        except ModuleNotFoundError:
            # Parent package of a dotted name does not exist.
            return False
    return True
def requires_module(*modules: str):
    """Build a decorator that fails with a clear message when modules are missing.

    Availability is checked once, when this function is called. If every
    module is available the returned decorator hands the function back
    untouched. Otherwise the decorated function is replaced by a stub that
    raises ``RuntimeError`` naming the missing module(s), instead of a
    ``NameError: name 'module' is not defined`` surfacing at a random place.
    """
    unavailable = [name for name in modules if not is_module_available(name)]

    if not unavailable:
        # Nothing is missing, so there is no need to wrap anything.
        def passthrough(func):
            return func

        return passthrough

    if len(unavailable) == 1:
        req = f"module: {unavailable[0]}"
    else:
        req = f"modules: {unavailable}"

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires {req}")

        return wrapped

    return decorator
def deprecated(direction: str, version: Optional[str]=None):
    """Build a decorator that warns on every call of a deprecated callable.

    The wrapped callable still runs and its result is returned; a warning is
    issued first, attributed to the caller (``stacklevel=2``).

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed.
            ``None`` is rendered as "future".
    """

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            removal = "future" if version is None else version
            qualified = f"{func.__module__}.{func.__name__}"
            message = (f"{qualified} has been deprecated "
                       f"and will be removed from {removal} release. "
                       f"{direction}")
            warnings.warn(message, stacklevel=2)
            return func(*args, **kwargs)

        return wrapped

    return decorator
def is_kaldi_available():
    """Return ``True`` if ``paddleaudio._paddleaudio`` can be imported.

    Any exception raised by the import is treated as "not available".
    """
    try:
        from paddleaudio import _paddleaudio  # noqa: F401
    except Exception:
        return False
    else:
        return True
def requires_kaldi():
    """Build a decorator that guards functions needing the native extension.

    Availability is checked once, when this function is called. When
    ``is_kaldi_available()`` is true the decorator returns the function
    unchanged; otherwise it substitutes a stub that raises ``RuntimeError``
    on every call.
    """
    if is_kaldi_available():
        def passthrough(func):
            return func

        return passthrough

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires libpaddleaudio build with kaldi")

        return wrapped

    return decorator
def _check_soundfile_importable():
    """Return ``True`` only if ``soundfile`` is both discoverable and importable.

    A module that is found but fails to import triggers a warning and counts
    as unavailable; a module that is not found at all yields ``False``
    without a warning.
    """
    if is_module_available("soundfile"):
        try:
            import soundfile  # noqa: F401
        except Exception:
            warnings.warn(
                "Failed to import soundfile. 'soundfile' backend is not available.")
        else:
            return True
    return False
# Evaluated once when this module is imported; later calls reuse the result.
_is_soundfile_importable = _check_soundfile_importable()


def is_soundfile_available():
    """Return the import-time result of ``_check_soundfile_importable()``."""
    return _is_soundfile_importable
def requires_soundfile():
    """Build a decorator that guards functions needing ``soundfile``.

    When ``is_soundfile_available()`` is true the decorator is the identity;
    otherwise the decorated function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if is_soundfile_available():
        def passthrough(func):
            return func

        return passthrough

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires soundfile")

        return wrapped

    return decorator
def is_sox_available():
    """Return ``True`` if ``paddleaudio._paddleaudio`` can be imported.

    Any exception raised by the import is treated as "not available". This
    performs the same import probe as ``is_kaldi_available``.
    """
    available = True
    try:
        from paddleaudio import _paddleaudio  # noqa: F401
    except Exception:
        available = False
    return available
def requires_sox():
    """Build a decorator that guards functions needing the native extension.

    Availability is checked once, when this function is called. When
    ``is_sox_available()`` is true the decorator returns the function
    unchanged; otherwise it substitutes a stub that raises ``RuntimeError``
    on every call.
    """
    if is_sox_available():
        def passthrough(func):
            return func

        return passthrough

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires libpaddleaudio build with sox")

        return wrapped

    return decorator
================================================
FILE: audio/paddleaudio/backends/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import utils
from .soundfile_backend import depth_convert
from .soundfile_backend import normalize
from .soundfile_backend import resample
from .soundfile_backend import soundfile_load
from .soundfile_backend import soundfile_save
from .soundfile_backend import to_mono
from .utils import get_audio_backend
from .utils import list_audio_backends
from .utils import set_audio_backend
# Runs at package import time, after the names above have been imported.
# Presumably selects the default audio backend -- verify in utils.py.
utils._init_audio_backend()
================================================
FILE: audio/paddleaudio/backends/common.py
================================================
# Token from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
class AudioInfo:
    """Metadata returned by the ``info`` function of an audio backend.

    This class is used by :ref:`"sox_io" backend` and
    :ref:`"soundfile" backend with the new interface`.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
        or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:

            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        # Report the actual class name; the previous text used "AudioMetaData",
        # a leftover from the torchaudio class this was adapted from.
        return (f"AudioInfo("
                f"sample_rate={self.sample_rate}, "
                f"num_frames={self.num_frames}, "
                f"num_channels={self.num_channels}, "
                f"bits_per_sample={self.bits_per_sample}, "
                f"encoding={self.encoding}"
                f")")
================================================
FILE: audio/paddleaudio/backends/no_backend.py
================================================
from pathlib import Path
from typing import Callable
from typing import Optional
from typing import Tuple
from typing import Union
from paddle import Tensor
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
        normalization: Union[bool, float, Callable]=True,
        channels_first: bool=True,
        num_frames: int=0,
        offset: int=0,
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Placeholder ``load`` used when no audio I/O backend is available.

    All arguments are ignored.

    Raises:
        RuntimeError: Always.
    """
    raise RuntimeError("No audio I/O backend is available.")
def save(filepath: str,
         src: Tensor,
         sample_rate: int,
         precision: int=16,
         channels_first: bool=True) -> None:
    """Placeholder ``save`` used when no audio I/O backend is available.

    All arguments are ignored.

    Raises:
        RuntimeError: Always.
    """
    raise RuntimeError("No audio I/O backend is available.")
def info(filepath: str, format: Optional[str]=None) -> None:
    """Placeholder ``info`` used when no audio I/O backend is available.

    Args:
        filepath (str): Ignored.
        format (str or None, optional): Ignored. Accepted so that a call
            written for the other backends' ``info(filepath, format)``
            reaches the ``RuntimeError`` below instead of failing with a
            ``TypeError`` about an unexpected argument.

    Raises:
        RuntimeError: Always.
    """
    raise RuntimeError("No audio I/O backend is available.")
================================================
FILE: audio/paddleaudio/backends/soundfile_backend.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import resampy
import soundfile
from scipy.io import wavfile
from ..utils import depth_convert
from ..utils import ParameterError
from .common import AudioInfo
# Public names exported by this module.
__all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
]
# Normalization modes handled by `normalize` (the identifier's spelling is
# kept as-is because `normalize` refers to it by this name).
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Channel merge strategies accepted by `to_mono`.
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Resampling filters accepted by `resample`.
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant used by `normalize` to avoid dividing by zero.
EPS = 1e-8
def resample(y: np.ndarray,
             src_sr: int,
             target_sr: int,
             mode: str='kaiser_fast') -> np.ndarray:
    """Audio resampling.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use. Must be one of
            ``RESAMPLE_MODES``. Defaults to 'kaiser_fast'.

    Raises:
        ParameterError: If `y` is not a numpy array or `mode` is not supported.

    Returns:
        np.ndarray: `y` resampled to `target_sr`
    """
    if mode == 'kaiser_best':
        # Adjacent literals are used instead of a backslash continuation so
        # that source indentation does not end up inside the message.
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
            'This function is pretty slow, '
            'we recommend the mode kaiser_fast in large scale audio training')

    if not isinstance(y, np.ndarray):
        # The f-prefix was missing, so the offending type was never shown.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Convert stereo audio to mono.

    Only the first two channels (``y[0]`` and ``y[1]``) take part in the
    merge.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D. A 2D array is
            indexed by channel on its first axis.
        merge_type (str, optional): Merge type to generate mono waveform,
            one of ``MERGE_TYPES``. Defaults to 'average'.

    Raises:
        ParameterError: If `merge_type` is unknown, `y` has more than two
            dimensions, or averaging is requested for an unsupported dtype.

    Returns:
        np.ndarray: `y` with mono channel.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y
    if y.shape[0] == 1:
        # A 2D array holding a single channel: as in the 1D case there is
        # nothing to merge, and indexing channel 1 would raise IndexError.
        return y[0]

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # need to do averaging according to dtype
    if np.issubdtype(y.dtype, np.floating):
        return (y[0] + y[1]) * 0.5

    # Signed integers are summed in the next wider type so that the
    # intermediate sum cannot overflow.
    wider = {
        'int8': 'int16',
        'int16': 'int32',
        'int32': 'int64',
    }.get(y.dtype.name)
    if wider is None:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')

    pair = y[:2].astype(wider)
    y_out = (pair[0] + pair[1]) // 2
    limits = np.iinfo(y.dtype)
    return np.clip(y_out, limits.min, limits.max).astype(y.dtype)
def soundfile_load_(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Read a waveform through the ``soundfile`` package.

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Start position; it is multiplied by
            the file's sample rate to obtain the frame to seek to. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Length to read; it is multiplied by
            the file's sample rate to obtain a frame count. ``None`` reads
            until the end. Defaults to None.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray (transposed relative to
        what ``soundfile`` returns) and its samplerate.
    """
    with soundfile.SoundFile(file) as audio_file:
        native_sr = audio_file.samplerate

        if offset:
            audio_file.seek(int(offset * native_sr))

        # -1 asks soundfile for every remaining frame.
        num_frames = -1 if duration is None else int(duration * native_sr)

        waveform = audio_file.read(
            frames=num_frames, dtype=dtype, always_2d=False).T

    return waveform, audio_file.samplerate
def normalize(y: np.ndarray, norm_type: str='linear',
              mul_factor: float=1.0) -> np.ndarray:
    """Rescale a waveform and apply an extra multiplier.

    ``'linear'`` divides by the peak absolute value (plus ``EPS``);
    ``'gaussian'`` subtracts the mean and divides by the standard deviation
    (floored at ``EPS``). Both results are multiplied by `mul_factor`.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Raises:
        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.

    Returns:
        np.ndarray: `y` after normalization.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        std = max(np.std(y), EPS)
        return mul_factor * (y - mean) / std

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk.

    This function saves audio to disk using scipy.io.wavfile, with an
    additional step to convert the input waveform to int16 when its dtype is
    neither int16 nor int8.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save; a ``str`` or any
            path-like object such as ``pathlib.Path``.

    Raises:
        ParameterError: If the path does not end with '.wav' or `sr` is not
            positive.
    """
    # Path-like objects have no `endswith`; normalise to a plain string first.
    file = os.fspath(file)
    if not file.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, received sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(file, sr, y_out)
def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file from disk and post-process it.

    The steps, in order, are: read, optional mono merge, optional resampling,
    then either normalization or (for integer dtypes) depth conversion.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Raises:
        ParameterError: If no samples were read from `file`.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
    """
    waveform, rate = soundfile_load_(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
        waveform.ndim == 2 and len(waveform[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    needs_resampling = sr is not None and sr != rate
    if needs_resampling:
        waveform = resample(waveform, rate, sr, mode=resample_mode)
        rate = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # still need to do normalization, before depth conversion
        waveform = normalize(waveform, 'linear', 1.0)
        waveform = depth_convert(waveform, dtype)

    return waveform, rate
#The code below is taken from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py, with some modifications.
def _get_subtype_for_wav(dtype: paddle.dtype,
                         encoding: str,
                         bits_per_sample: int):
    """Choose the soundfile subtype name for writing a WAV file.

    Args:
        dtype (paddle.dtype): Tensor dtype; consulted only when neither
            `encoding` nor `bits_per_sample` is given.
        encoding (str): Requested encoding, or a falsy value for none.
        bits_per_sample (int): Requested bit depth, or a falsy value for none.

    Raises:
        ValueError: If the combination is not supported for WAV.

    Returns:
        str: The subtype name.
    """
    if not encoding:
        if bits_per_sample:
            if bits_per_sample == 8:
                return "PCM_U8"
            return f"PCM_{bits_per_sample}"
        # Nothing requested explicitly: derive the subtype from the dtype.
        by_dtype = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }
        subtype = by_dtype.get(dtype)
        if not subtype:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return subtype

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"

    if encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")

    # Encodings that exist for WAV in an 8-bit variant only.
    eight_bit_only = (
        ("PCM_U", "PCM_U8",
         "wav only supports 8-bit unsigned PCM encoding."),
        ("ULAW", "ULAW", "wav only supports 8-bit mu-law encoding."),
        ("ALAW", "ALAW", "wav only supports 8-bit a-law encoding."), )
    for name, subtype, message in eight_bit_only:
        if encoding == name:
            if bits_per_sample in (None, 8):
                return subtype
            raise ValueError(message)

    raise ValueError(f"wav does not support {encoding}.")
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
if encoding in (None, "PCM_S"):
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
if encoding in ("PCM_U", "PCM_F"):
raise ValueError(f"sph does not support {encoding} encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("sph only supports 8-bit for mu-law encoding.")
if encoding == "ALAW":
return "ALAW"
raise ValueError(f"sph does not support {encoding}.")
def _get_subtype(dtype: paddle.dtype,
                 format: str,
                 encoding: str,
                 bits_per_sample: int):
    """Map a container format plus encoding options to a soundfile subtype.

    Args:
        dtype (paddle.dtype): Tensor dtype; forwarded to the WAV helper.
        format (str): Lower-case format / extension, e.g. ``"wav"``.
        encoding (str): Requested encoding, or a falsy value for none.
        bits_per_sample (int): Requested bit depth, or a falsy value for none.

    Raises:
        ValueError: If the format is unknown or the options are not valid
            for that format.

    Returns:
        str: The subtype name.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)

    if format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"

    if format in ("ogg", "vorbis"):
        if encoding or bits_per_sample:
            raise ValueError(
                "ogg/vorbis does not support encoding/bits_per_sample.")
        return "VORBIS"

    if format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)

    if format in ("nis", "nist"):
        return "PCM_16"

    raise ValueError(f"Unsupported format: {format}")
def save(
        filepath: str,
        src: paddle.Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (str or pathlib.Path): Path to audio file.
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reason with "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are:

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If `src` is not 2D, `bits_per_sample` is invalid, or the
            format/encoding combination is unsupported.
        RuntimeError: If `filepath` is a file object and `format` is ``None``.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")

    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored.")

    # A file object carries no extension, so the caller must name the format.
    writes_to_fileobj = hasattr(filepath, "write")
    if writes_to_fileobj and format is None:
        raise RuntimeError("`format` is required when saving to file object.")
    if writes_to_fileobj:
        extension = format.lower()
    else:
        extension = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this.")

    subtype = _get_subtype(src.dtype, extension, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    if format is None and extension in ["nis", "nist", "sph"]:
        format = "NIST"

    if channels_first:
        src = src.t()

    soundfile.write(
        file=filepath,
        data=src,
        samplerate=sample_rate,
        subtype=subtype,
        format=format)
# soundfile subtype -> dtype string that `load` requests when reading a WAV
# file with ``normalize=False``. Subtypes missing here make `load` raise.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
    ``float32`` dtype and the shape of `[channel, time]`.
    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.

    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
    by providing ``normalize=False``, this function can return integer Tensor, where the samples
    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.

    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
    ``flac`` and ``mp3``.
    For these formats, this function always returns ``float32`` Tensor with values normalized to
    ``[-1.0, 1.0]``.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before start reading data.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads all the remaining samples,
            starting from ``frame_offset``.
            This function may return the less number of frames if there is not enough
            frames in the given file.
        normalize (bool, optional):
            When ``True``, this function always return ``float32``, and sample values are
            normalized to ``[-1.0, 1.0]``.
            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type.
            This argument has no effect for formats other than integer WAV type.
        channels_first (bool, optional):
            When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Raises:
        ValueError: If ``normalize=False`` and the WAV subtype has no entry in
            ``_SUBTYPE2DTYPE``.

    Returns:
        (paddle.Tensor, int): Resulting Tensor and sample rate.
            If the input file has integer wav format and normalization is off, then it has
            integer type, else ``float32`` type. If ``channels_first=True``, it has
            `[channel, time]` else `[time, channel]`.
    """
    with soundfile.SoundFile(filepath, "r") as audio_file:
        # Native integer dtypes are kept only for un-normalized WAV input.
        if normalize or audio_file.format != "WAV":
            read_dtype = "float32"
        else:
            if audio_file.subtype not in _SUBTYPE2DTYPE:
                raise ValueError(f"Unsupported subtype: {audio_file.subtype}")
            read_dtype = _SUBTYPE2DTYPE[audio_file.subtype]

        frame_count = audio_file._prepare_read(frame_offset, None, num_frames)
        samples = audio_file.read(frame_count, read_dtype, always_2d=True)
        sample_rate = audio_file.samplerate

    waveform = paddle.to_tensor(samples)
    if channels_first:
        waveform = paddle.transpose(waveform, perm=[1, 0])
    return waveform, sample_rate
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristic, and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired by
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610":
    0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}
def _get_bit_depth(subtype):
    """Look up the bits per sample for a soundfile subtype.

    Args:
        subtype: Subtype name as reported by soundfile.

    Returns:
        int: The entry from ``_SUBTYPE_TO_BITS_PER_SAMPLE``, or 0 (after
        emitting a warning) when the subtype is not in that table.
    """
    known = subtype in _SUBTYPE_TO_BITS_PER_SAMPLE
    if not known:
        warnings.warn(
            f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
            "attribute will be set to 0. If you are seeing this warning, please "
            "report by opening an issue on github (after checking for existing/closed ones). "
            "You may otherwise ignore this warning.")
    return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
# soundfile subtype -> encoding name reported in ``AudioInfo.encoding``.
# Subtypes missing here are reported as "UNKNOWN" by `_get_encoding`.
_SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
def _get_encoding(format: str, subtype: str):
    """Derive the encoding name from a soundfile format and subtype.

    Args:
        format (str): Container format as reported by soundfile.
        subtype (str): Subtype as reported by soundfile.

    Returns:
        str: ``"FLAC"`` for FLAC files regardless of subtype; otherwise the
        ``_SUBTYPE_TO_ENCODING`` entry, or ``"UNKNOWN"`` if there is none.
    """
    if format != "FLAC":
        return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
    return "FLAC"
def info(filepath: str, format: Optional[str]=None) -> AudioInfo:
    """Get signal information of an audio file.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioInfo: meta data of the given audio.
    """
    sinfo = soundfile.info(filepath)
    bit_depth = _get_bit_depth(sinfo.subtype)
    encoding = _get_encoding(sinfo.format, sinfo.subtype)
    return AudioInfo(
        sinfo.samplerate,
        sinfo.frames,
        sinfo.channels,
        bits_per_sample=bit_depth,
        encoding=encoding, )
================================================
FILE: audio/paddleaudio/backends/sox_io_backend.py
================================================
import os
from typing import Optional
from typing import Tuple
import paddle
import paddleaudio
from paddle import Tensor
from paddleaudio._internal import module_utils as _mod_utils
from .common import AudioInfo
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
    """Fallback for ``info`` on a path: always raises ``RuntimeError``.

    ``format`` is accepted for signature compatibility and ignored.
    """
    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
    """Fallback for ``info`` on a file object: always raises ``RuntimeError``.

    ``format`` is accepted for signature compatibility and ignored.
    """
    raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
# Note: need to comply TorchScript syntax -- need annotation and no f-string
def _fail_load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Fallback for ``load`` on a path: always raises ``RuntimeError``.

    Every argument except ``filepath`` is ignored.
    """
    raise RuntimeError("Failed to load audio from {}".format(filepath))
def _fail_load_fileobj(fileobj, *args, **kwargs):
    """Fallback for ``load`` on a file object: always raises ``RuntimeError``.

    Extra positional and keyword arguments are ignored.
    """
    raise RuntimeError(f"Failed to load audio from {fileobj}")
# Handlers used by `load` / `info` when the sox extension returns ``None``.
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
# `load` looks this handler up as `_fallback_load_fileobj`; only a misspelled
# name used to be defined, so that path ended in a NameError.
_fallback_load_fileobj = _fail_load_fileobj
# Misspelled alias kept so any code referring to the old name keeps working.
_fallback_load_filebj = _fallback_load_fileobj
@_mod_utils.requires_sox()
def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Load audio through the sox extension.

    Objects exposing a ``read`` attribute are treated as file objects;
    anything else is converted with ``os.fspath`` and treated as a path.
    All remaining arguments are forwarded to the extension unchanged.

    Returns:
        Tuple[Tensor, int]: The first element of the extension's result
        converted to a tensor, and its second element. When the extension
        returns ``None`` the matching fallback handler is called instead.
    """
    if not hasattr(filepath, "read"):
        filepath = os.fspath(filepath)
        result = paddleaudio._paddleaudio.sox_io_load_audio_file(
            filepath, frame_offset, num_frames, normalize, channels_first,
            format)
        if result is None:
            return _fallback_load(filepath, frame_offset, num_frames,
                                  normalize, channels_first, format)
        return (paddle.to_tensor(result[0]), result[1])

    result = paddleaudio._paddleaudio.load_audio_fileobj(
        filepath, frame_offset, num_frames, normalize, channels_first, format)
    if result is None:
        return _fallback_load_fileobj(filepath, frame_offset, num_frames,
                                      normalize, channels_first, format)
    return (paddle.to_tensor(result[0]), result[1])
@_mod_utils.requires_sox()
def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Save a tensor as audio through the sox extension.

    The tensor is converted with ``src.numpy()`` first. Objects exposing a
    ``write`` attribute are treated as file objects; anything else is
    converted with ``os.fspath`` and treated as a path. All remaining
    arguments are forwarded to the extension unchanged.
    """
    samples = src.numpy()
    if not hasattr(filepath, "write"):
        filepath = os.fspath(filepath)
        paddleaudio._paddleaudio.sox_io_save_audio_file(
            filepath, samples, sample_rate, channels_first, compression,
            format, encoding, bits_per_sample)
    else:
        paddleaudio._paddleaudio.save_audio_fileobj(
            filepath, samples, sample_rate, channels_first, compression,
            format, encoding, bits_per_sample)
@_mod_utils.requires_sox()
def info(
        filepath: str,
        format: Optional[str]=None, ) -> AudioInfo:
    """Fetch audio metadata through the sox extension.

    Objects exposing a ``read`` attribute are treated as file objects;
    anything else is converted with ``os.fspath`` and treated as a path.

    Returns:
        AudioInfo: Built by unpacking the extension's result. When the
        extension returns ``None`` the matching fallback handler is called
        instead.
    """
    if not hasattr(filepath, "read"):
        filepath = os.fspath(filepath)
        fields = paddleaudio._paddleaudio.get_info_file(filepath, format)
        if fields is None:
            return _fallback_info(filepath, format)
        return AudioInfo(*fields)

    fields = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
    if fields is None:
        return _fallback_info_fileobj(filepath, format)
    return AudioInfo(*fields)
================================================
FILE: audio/paddleaudio/backends/utils.py
================================================
"""Defines utilities for switching audio backends"""
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
import warnings
from typing import List
from typing import Optional
import paddleaudio
from paddleaudio._internal import module_utils as _mod_utils
from . import no_backend
from . import soundfile_backend
from . import sox_io_backend
__all__ = [
"list_audio_backends",
"get_audio_backend",
"set_audio_backend",
]
def list_audio_backends() -> List[str]:
    """List available backends

    Returns:
        List[str]: The list of available backends, ``"soundfile"`` first and
        ``"sox_io"`` second when both are present.
    """
    probes = (
        ("soundfile", lambda: _mod_utils.is_module_available("soundfile")),
        ("sox_io", _mod_utils.is_sox_available), )
    return [name for name, is_available in probes if is_available()]
def set_audio_backend(backend: Optional[str]):
    """Set the backend for I/O operation

    The ``save``, ``load`` and ``info`` attributes of the ``paddleaudio``
    package are rebound to the functions of the selected backend module.

    Args:
        backend (str or None): Name of the backend.
            One of ``"sox_io"`` or ``"soundfile"`` based on availability
            of the system. If ``None`` is provided the current backend is unassigned.

    Raises:
        RuntimeError: If `backend` is not ``None`` and not currently available.
        NotImplementedError: If `backend` is available but has no module
            associated with it here.
    """
    if backend is not None:
        available = list_audio_backends()
        if backend not in available:
            raise RuntimeError(f'Backend "{backend}" is not one of '
                               f"available backends: {available}.")

    if backend is None:
        module = no_backend
    else:
        module = None
        for name, candidate in (("sox_io", sox_io_backend),
                                ("soundfile", soundfile_backend)):
            if backend == name:
                module = candidate
                break
        if module is None:
            raise NotImplementedError(f'Unexpected backend "{backend}"')

    for func_name in ("save", "load", "info"):
        setattr(paddleaudio, func_name, getattr(module, func_name))
def _init_audio_backend():
    """Select the default backend: ``soundfile`` first, then ``sox_io``.

    When neither is available a warning is emitted and the backend is
    unassigned via ``set_audio_backend(None)``.
    """
    available = list_audio_backends()
    for preferred in ("soundfile", "sox_io"):
        if preferred in available:
            set_audio_backend(preferred)
            return
    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend

    The backend is identified by comparing ``paddleaudio.load`` with the
    ``load`` function of each known backend module.

    Raises:
        ValueError: If ``paddleaudio.load`` matches none of them.

    Returns:
        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.
    """
    known = (
        (None, no_backend),
        ("sox_io", sox_io_backend),
        ("soundfile", soundfile_backend), )
    for name, module in known:
        if paddleaudio.load == module.load:
            return name
    raise ValueError("Unknown backend.")
================================================
FILE: audio/paddleaudio/compliance/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import kaldi
from . import librosa
================================================
FILE: audio/paddleaudio/compliance/kaldi.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from torchaudio(https://github.com/pytorch/audio)
import math
from typing import Tuple
import paddle
from paddle import Tensor
from ..functional import create_dct
from ..functional.window import get_window
__all__ = [
'spectrogram',
'fbank',
'mfcc',
]
# Window type names; the values match the strings that
# `_feature_window_function` compares its `window_type` argument against.
HANNING = 'hann'
HAMMING = 'hamming'
POVEY = 'povey'
RECTANGULAR = 'rect'
BLACKMAN = 'blackman'
def _get_epsilon(dtype):
    """Return the constant ``1e-07`` as a tensor of the given dtype."""
    return paddle.to_tensor(1e-07, dtype=dtype)
def _next_power_of_2(x: int) -> int:
return 1 if x == 0 else 2**(x - 1).bit_length()
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Cut a 1D waveform into overlapping frames.

    Args:
        waveform (Tensor): 1D input signal.
        window_size (int): Frame length in samples.
        window_shift (int): Hop between consecutive frames in samples.
        snip_edges (bool): If True, only frames that fit entirely inside the
            signal are produced. If False, the signal is extended with
            reversed copies of itself before framing.

    Returns:
        Tensor: The transposed result of ``paddle.signal.frame`` truncated to
        the computed number of frames, or an empty ``(0, 0)`` tensor when
        `snip_edges` is True and the signal is shorter than one window.
    """
    assert waveform.dim() == 1
    num_samples = waveform.shape[0]

    if snip_edges:
        if num_samples < window_size:
            return paddle.empty((0, 0), dtype=waveform.dtype)
        num_frames = 1 + (num_samples - window_size) // window_shift
    else:
        mirrored = paddle.flip(waveform, [0])
        num_frames = (num_samples + (window_shift // 2)) // window_shift
        pad = window_size // 2 - window_shift // 2
        if pad > 0:
            # Left side: last `pad` samples of the reversed signal;
            # right side: the whole reversed signal.
            pieces = (mirrored[-pad:], waveform, mirrored)
        else:
            pieces = (waveform[-pad:], mirrored)
        waveform = paddle.concat(pieces, axis=0)

    frames = paddle.signal.frame(waveform, window_size, window_shift)
    return frames[:, :num_frames].T
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Create the analysis window applied to every frame.

    Args:
        window_type (str): One of "hann", "hamming", "povey", "rect" or
            "blackman".
        window_size (int): Length of the window in samples.
        blackman_coeff (float): Coefficient used only by the Blackman window.
        dtype: Data type of the returned tensor (forwarded to the tensor
            constructors).

    Returns:
        Tensor: Window of shape `(window_size,)`.

    Raises:
        Exception: If `window_type` is not one of the supported names.
    """
    if window_type == "rect":
        return paddle.ones([window_size], dtype=dtype)

    if window_type == "hamming":
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)

    if window_type == "hann" or window_type == "povey":
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        # The povey window is a hann window raised to the power 0.85.
        return hann.pow(0.85) if window_type == "povey" else hann

    if window_type == "blackman":
        a = 2 * math.pi / (window_size - 1)
        positions = paddle.arange(window_size, dtype=dtype)
        window = (blackman_coeff - 0.5 * paddle.cos(a * positions) +
                  (0.5 - blackman_coeff) * paddle.cos(2 * a * positions))
        return window.astype(dtype)

    raise Exception('Invalid window type ' + window_type)
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Compute the log energy of each frame.

    Args:
        strided_input (Tensor): Frames with shape `(m, window_size)`.
        epsilon (Tensor): Lower bound applied to the energy before the log.
        energy_floor (float): Floor on the (linear) energy; 0.0 disables it.

    Returns:
        Tensor: Log energies with shape `(m,)`.
    """
    frame_energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(frame_energy, epsilon).log()
    if energy_floor != 0.0:
        log_floor = paddle.to_tensor(
            math.log(energy_floor), dtype=strided_input.dtype)
        log_energy = paddle.maximum(log_energy, log_floor)
    return log_energy
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing parameters in samples.

    Args:
        waveform (Tensor): Input of shape `(C, T)`.
        channel (int): Channel index; negative values are clamped to 0.
        sr (int): Sample rate in Hz.
        frame_shift (float): Frame shift in milliseconds.
        frame_length (float): Frame length in milliseconds.
        round_to_power_of_two (bool): Whether the padded window size is the
            next power of two of the window size.
        preemphasis_coefficient (float): Only validated here (must be in
            [0, 1]).

    Returns:
        Tuple[Tensor, int, int, int]: The selected 1-D waveform, the window
        shift, the window size and the padded window size (all in samples).
    """
    channel = max(channel, 0)
    n_channels = waveform.shape[0]
    assert channel < n_channels, (
        'Invalid channel {} for size {}'.format(channel, n_channels))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    n_samples = len(waveform)
    assert 2 <= window_size <= n_samples, (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         n_samples))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, (
        'the padded `window_size` must be divisible by two.'
        ' use `round_to_power_of_two` or change `frame_length`')
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame the waveform and apply the per-frame pre-processing.

    The steps run in this order: framing, optional dithering, optional DC
    removal, optional pre-emphasis, windowing and zero-padding to
    `padded_window_size`. The log energy is taken either before pre-emphasis
    and windowing (`raw_energy=True`) or after padding (`raw_energy=False`).

    Returns:
        Tuple[Tensor, Tensor]: The processed frames with shape
        `(m, padded_window_size)` and the per-frame log energy with shape
        `(m,)`.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    frames = _get_strided(waveform, window_size, window_shift, snip_edges)

    if dither != 0.0:
        # Clamp the uniform samples away from zero so the log is finite.
        uniform = paddle.maximum(epsilon,
                                 paddle.rand(frames.shape, dtype=dtype))
        gauss = paddle.sqrt(-2 * uniform.log()) * paddle.cos(
            2 * math.pi * uniform)
        frames = frames + gauss * dither

    if remove_dc_offset:
        frame_means = paddle.mean(frames, axis=1).unsqueeze(1)  # (m, 1)
        frames = frames - frame_means

    if raw_energy:
        # Energy of the frames before pre-emphasis and windowing.
        signal_log_energy = _get_log_energy(frames, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Replicate the first sample on the left so that column j of
        # `shifted` holds sample j - 1 of each frame.
        shifted = paddle.nn.functional.pad(
            frames.unsqueeze(0), (1, 0), data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        frames = frames - preemphasis_coefficient * shifted[:, :-1]

    window = _feature_window_function(window_type, window_size,
                                      blackman_coeff,
                                      dtype).unsqueeze(0)  # (1, window_size)
    frames = frames * window  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        extra = padded_window_size - window_size
        frames = paddle.nn.functional.pad(
            frames.unsqueeze(0), (0, extra),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        signal_log_energy = _get_log_energy(frames, epsilon,
                                            energy_floor)  # size (m)

    return frames, signal_log_energy
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Optionally remove the mean over axis 0 from every column.

    Args:
        tensor (Tensor): 2-D feature tensor.
        subtract_mean (bool): When False the input is returned unchanged.

    Returns:
        Tensor: The (possibly mean-normalized) tensor.
    """
    if not subtract_mean:
        return tensor
    column_means = paddle.mean(tensor, axis=0).unsqueeze(0)
    return tensor - column_means
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str="povey") -> Tensor:
    """Compute a Kaldi-style log power spectrogram of a waveform.

    Args:
        waveform (Tensor): Input waveform with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient of the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of `waveform` to use; negative values select channel 0.
            Defaults to -1.
        dither (float, optional): Dithering constant; 0.0 disables dithering. Defaults to 0.0.
        energy_floor (float, optional): Floor applied to the frame energy. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): If True, the energy is computed before pre-emphasis and
            windowing. Defaults to True.
        remove_dc_offset (bool, optional): If True, subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad every frame to the next power of
            two before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, only frames that fit completely inside the waveform
            are kept. Otherwise the waveform is extended by mirroring before framing.
            Defaults to True.
        subtract_mean (bool, optional): If True, subtract the per-column mean of the output.
            Defaults to False.
        window_type (str, optional): Window applied to each frame. Defaults to "povey".

    Returns:
        Tensor: Spectrogram with shape `(m, padded_window_size // 2 + 1)`, where the number of
        frames `m` depends on `frame_length` and `frame_shift`. Column 0 holds the frame
        log energy.
    """
    epsilon = _get_epsilon(waveform.dtype)

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
        preemphasis_coefficient)

    frames, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # complex spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames)

    # log power, clamped from below before the log
    log_power = paddle.maximum(spectrum.abs().pow(2.), epsilon).log()

    # the first bin is replaced by the frame log energy
    log_power[:, 0] = signal_log_energy

    return _subtract_column_mean(log_power, subtract_mean)
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    """Element-wise mel -> Hz conversion: `700 * (exp(mel / 1127) - 1)`."""
    growth = (mel_freq / 1127.0).exp()
    return 700.0 * (growth - 1.0)
def _mel_scale_scalar(freq: float) -> float:
return 1127.0 * math.log(1.0 + freq / 700.0)
def _mel_scale(freq: Tensor) -> Tensor:
    """Element-wise Hz -> mel conversion: `1127 * ln(1 + f / 700)`."""
    ratio = 1.0 + freq / 700.0
    return 1127.0 * ratio.log()
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply a piecewise-linear VTLN warp to a tensor of frequencies (Hz).

    Frequencies outside `[low_freq, high_freq]` are returned unchanged. Inside
    that range the mapping consists of three linear pieces with break points
    `l` and `h`: the middle piece scales by `1 / vtln_warp_factor`, and the
    outer pieces connect it to `low_freq` and `high_freq` respectively.

    Args:
        vtln_low_cutoff (float): Lower inflection frequency; must exceed `low_freq`.
        vtln_high_cutoff (float): Upper inflection frequency; must be below `high_freq`.
        low_freq (float): Lower edge of the warped range.
        high_freq (float): Upper edge of the warped range.
        vtln_warp_factor (float): VTLN warp factor.
        freq (Tensor): Frequencies to warp.

    Returns:
        Tensor: Warped frequencies, same shape as `freq`.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
    # Break points of the piecewise-linear function on the input axis.
    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    # Values of the middle piece at the two break points.
    Fl = scale * l
    Fh = scale * h
    assert l > low_freq and h < high_freq
    # Slopes of the outer pieces, chosen so the function is continuous at l and h.
    scale_left = (Fl - low_freq) / (l - low_freq)
    scale_right = (high_freq - Fh) / (high_freq - h)
    res = paddle.empty_like(freq)
    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
    before_l = paddle.less_than(freq, paddle.to_tensor(l))
    before_h = paddle.less_than(freq, paddle.to_tensor(h))
    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
    # The masks overlap, so the order matters: each later assignment
    # overwrites the earlier ones on the elements it selects.
    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
    res[before_h] = scale * freq[before_h]
    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
    # Frequencies outside [low_freq, high_freq] are passed through unchanged.
    res[outside_low_high_freq] = freq[outside_low_high_freq]
    return res
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """Apply the VTLN warp to frequencies given on the mel scale.

    The input is converted to Hz, warped with `_vtln_warp_freq` and converted
    back to mel.

    Returns:
        Tensor: Warped mel frequencies, same shape as `mel_freq`.
    """
    hz = _inverse_mel_scale(mel_freq)
    warped_hz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
                                high_freq, vtln_warp_factor, hz)
    return _mel_scale(warped_hz)
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build the triangular mel filter bank.

    Args:
        num_bins (int): Number of mel filters; at least 3.
        window_length_padded (int): FFT size; must be even.
        sample_freq (float): Sample rate in Hz.
        low_freq (float): Lower edge of the filter bank in Hz.
        high_freq (float): Upper edge in Hz; values <= 0 are offsets from the
            Nyquist frequency.
        vtln_low (float): Lower VTLN inflection point in Hz.
        vtln_high (float): Upper VTLN inflection point in Hz; negative values
            are offsets from the Nyquist frequency.
        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping.

    Returns:
        Tuple[Tensor, Tensor]: The float32 filter weights with shape
        `(num_bins, window_length_padded // 2)` and the filter center
        frequencies in Hz.
    """
    # Kaldi only rejects `num_bins < 3`, which is also what the message says.
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # The length is even (asserted above), so integer division is exact and
    # keeps the bin count an int.
    num_fft_bins = window_length_padded // 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # Spacing of the triangle corner points on the mel axis.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    bin_index = paddle.arange(num_bins, dtype=paddle.float32).unsqueeze(1)
    left_mel = mel_low_freq + bin_index * mel_freq_delta  # (num_bins, 1)
    center_mel = left_mel + mel_freq_delta
    right_mel = center_mel + mel_freq_delta

    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)
    # mel value of every FFT bin: (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(
        num_fft_bins, dtype=paddle.float32)).unsqueeze(0)

    # rising and falling edge of every triangle: (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Without warping the triangle is min(up, down) clipped at zero.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping, fill the two edges explicitly from their masks.
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str="povey") -> Tensor:
    """Compute and return filter banks from a waveform. The output is identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of `waveform` to use; negative values select channel 0.
            Defaults to -1.
        dither (float, optional): Dithering constant; 0.0 disables dithering. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency; values <= 0 are offsets from the
            Nyquist frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put the energy column last instead of first when
            `use_energy` is True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before preemphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, only frames that fit completely inside the waveform are kept.
            Otherwise the waveform is extended by mirroring before framing. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output. Defaults to False.
        use_energy (bool, optional): Add a column with the frame log energy to the output. Defaults to False.
        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey".

    Returns:
        Tensor: A filter banks tensor with shape `(m, n_mels)`, or `(m, n_mels + 1)` when
        `use_energy` is True.
    """
    dtype = waveform.dtype

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
        preemphasis_coefficient)

    strided_input, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # magnitude spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(strided_input).abs()
    if use_power:
        spectrum = spectrum.pow(2.)

    # filter bank: (n_mels, padded_window_size // 2)
    mel_banks, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                  high_freq, vtln_low, vtln_high, vtln_warp)
    # The filter bank is always built in float32. Cast it to the waveform
    # dtype when they differ, so that e.g. float64 input does not fail.
    if mel_banks.dtype != dtype:
        mel_banks = mel_banks.astype(dtype)

    # Add a zero column for the Nyquist bin:
    # (n_mels, padded_window_size // 2 + 1)
    mel_banks = paddle.nn.functional.pad(
        mel_banks.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # (m, n_mels)
    mel_energies = paddle.mm(spectrum, mel_banks.T)
    if use_log_fbank:
        # Clamp before the log so empty filters do not yield -inf.
        mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()

    if use_energy:
        signal_log_energy = signal_log_energy.unsqueeze(1)
        if htk_compat:
            mel_energies = paddle.concat(
                (mel_energies, signal_log_energy), axis=1)
        else:
            mel_energies = paddle.concat(
                (signal_log_energy, mel_energies), axis=1)

    # (m, n_mels + use_energy)
    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
    return mel_energies
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Return the DCT matrix that maps mel energies to cepstra.

    An orthonormal `(n_mels, n_mels)` DCT matrix is created, its first column
    is overwritten with `sqrt(1 / n_mels)`, and the first `n_mfcc` columns are
    kept.

    Returns:
        Tensor: Matrix with shape `(n_mels, n_mfcc)`.
    """
    full_dct = create_dct(n_mels, n_mels, 'ortho')
    full_dct[:, 0] = math.sqrt(1 / float(n_mels))
    return full_dct[:, :n_mfcc]
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Return the cepstral liftering weights `1 + Q/2 * sin(pi * i / Q)`.

    Here `Q` is `cepstral_lifter` and `i` runs over `0 .. n_mfcc - 1`.
    """
    index = paddle.arange(n_mfcc)
    phase = math.pi * index / cepstral_lifter
    return 1.0 + 0.5 * cepstral_lifter * paddle.sin(phase)
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str="povey") -> Tensor:
    """Compute Kaldi-style mel frequency cepstral coefficients of a waveform.

    The coefficients are obtained by applying a DCT (and optional liftering)
    to the log power mel filter bank energies returned by `fbank`.

    Args:
        waveform (Tensor): Input waveform with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient of the Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Liftering constant; 0.0 disables liftering.
            Defaults to 22.0.
        channel (int, optional): Channel of `waveform` to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor applied to the frame energy. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): Upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True, the energy / C0 column is placed last. Defaults to False.
        low_freq (float, optional): Lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra; must not exceed `n_mels`. Defaults to 13.
        n_mels (int, optional): Number of mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): If True, the energy is computed before pre-emphasis and
            windowing. Defaults to True.
        remove_dc_offset (bool, optional): If True, subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad every frame to the next power of
            two before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, only frames that fit completely inside the waveform
            are kept. Defaults to True.
        subtract_mean (bool, optional): If True, subtract the per-column mean of the output.
            Defaults to False.
        use_energy (bool, optional): If True, the first cepstral coefficient is replaced by the
            frame log energy. Defaults to False.
        vtln_high (float, optional): High inflection point of the VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Window applied to each frame. Defaults to "povey".

    Returns:
        Tensor: MFCC tensor with shape `(m, n_mfcc)`.
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    dtype = waveform.dtype

    # Log power mel energies, (m, n_mels + use_energy). Mean subtraction is
    # deferred to the end of this function.
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)

    if use_energy:
        # fbank put the energy in the last column (htk_compat) or the first.
        energy_column = n_mels if htk_compat else 0
        signal_log_energy = feature[:, energy_column]  # (m)
        first_mel = int(not htk_compat)
        feature = feature[:, first_mel:(n_mels + first_mel)]

    # (n_mels, n_mfcc)
    dct = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)

    # (m, n_mfcc)
    feature = feature.matmul(dct)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter.astype(dtype=dtype)

    if use_energy:
        # C0 is replaced by the frame log energy.
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move the first column (energy or C0) to the end.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            energy *= math.sqrt(2)
        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)
================================================
FILE: audio/paddleaudio/compliance/librosa.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import warnings
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import scipy
from numpy.lib.stride_tricks import as_strided
from scipy import signal
from ..backends import depth_convert
from ..utils import ParameterError
# Names exported by `from <this module> import *`, grouped by purpose.
__all__ = [
    # dsp
    'stft',
    'mfcc',
    'hz_to_mel',
    'mel_to_hz',
    'mel_frequencies',
    'power_to_db',
    'compute_fbank_matrix',
    'melspectrogram',
    'spectrogram',
    'mu_encode',
    'mu_decode',
    # augmentation
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
]
def _pad_center(data: np.ndarray, size: int, axis: int=-1,
**kwargs) -> np.ndarray:
"""Pad an array to a target length along a target axis.
This differs from `np.pad` by centering the data prior to padding,
analogous to `str.center`
"""
kwargs.setdefault("mode", "constant")
n = data.shape[axis]
lpad = int((size - n) // 2)
lengths = [(0, 0)] * data.ndim
lengths[axis] = (lpad, int(size - n - lpad))
if lpad < 0:
raise ParameterError(("Target size ({size:d}) must be "
"at least input size ({n:d})"))
return np.pad(data, lengths, **kwargs)
def _split_frames(x: np.ndarray,
frame_length: int,
hop_length: int,
axis: int=-1) -> np.ndarray:
"""Slice a data array into (overlapping) frames.
This function is aligned with librosa.frame
"""
if not isinstance(x, np.ndarray):
raise ParameterError(
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
if x.shape[axis] < frame_length:
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
f" for frame_length={frame_length:d}")
if hop_length < 1:
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.asfortranarray(x)
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.ascontiguousarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * new_stride]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * new_stride] + list(strides)
else:
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
return as_strided(x, shape=shape, strides=strides)
def _check_audio(y, mono=True) -> bool:
"""Determine whether a variable contains valid audio data.
The audio y must be a np.ndarray, ether 1-channel or two channel
"""
if not isinstance(y, np.ndarray):
raise ParameterError("Audio data must be of type numpy.ndarray")
if y.ndim > 2:
raise ParameterError(
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
if mono and y.ndim == 2:
raise ParameterError(
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
if not np.issubdtype(y.dtype, np.floating):
raise ParameterError("Audio data must be floating-point")
if not np.isfinite(y).all():
raise ParameterError("Audio buffer is not finite everywhere")
return True
def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert frequencies in Hz to the mel scale.

    Without `htk`, the scale is linear below 1000 Hz and logarithmic above.

    Args:
        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
        htk (bool, optional): Use the HTK formula instead. Defaults to False.

    Returns:
        np.ndarray: Frequency in mels.
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    # Parameters of the linear region
    f_min = 0.0
    f_sp = 200.0 / 3

    # Parameters of the log region
    min_log_hz = 1000.0  # start of the log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same point in mels
    logstep = np.log(6.4) / 27.0  # step size in the log region

    # Start with the linear mapping everywhere ...
    mels = (hz - f_min) / f_sp

    # ... then overwrite the values that fall into the log region.
    if hz.ndim:
        # array input: use a boolean mask
        in_log_region = hz >= min_log_hz
        mels[in_log_region] = min_log_mel + \
            np.log(hz[in_log_region] / min_log_hz) / logstep
    elif hz >= min_log_hz:
        # scalar input: check directly
        mels = min_log_mel + np.log(hz / min_log_hz) / logstep

    return mels
def mel_to_hz(mels: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert mel bin numbers to frequencies.

    This is the inverse of `hz_to_mel`: without `htk`, the scale is linear
    below 15 mels (1000 Hz) and logarithmic above.

    Args:
        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequencies in Hz.
    """
    mel_array = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)

    # Fill in the linear scale
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel_array

    # And now the nonlinear scale
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    if mel_array.ndim:
        # If we have vector data, vectorize
        log_t = mel_array >= min_log_mel
        freqs[log_t] = min_log_hz * \
            np.exp(logstep * (mel_array[log_t] - min_log_mel))
    elif mel_array >= min_log_mel:
        # If we have scalar data, check directly
        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))

    return freqs
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> np.ndarray:
    """Compute frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Place the points uniformly between the mel values of the two limits,
    # then map them back to Hz.
    mel_lo = hz_to_mel(fmin, htk=htk)
    mel_hi = hz_to_mel(fmax, htk=htk)
    mel_points = np.linspace(mel_lo, mel_hi, n_mels)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
    """Compute the center frequency of every real-FFT bin.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.

    Returns:
        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`,
        running from 0 to `sr / 2` inclusive.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32) -> np.ndarray:
    """Build the matrix that maps FFT bins onto triangular mel filters.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (Optional[float], optional): Maximum frequency in Hz; `sr / 2`
            when None. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (str, optional): Type of normalization; only "slaney" is
            accepted. Defaults to "slaney".
        dtype (type, optional): Data type. Defaults to np.float32.

    Returns:
        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.

    Raises:
        ParameterError: If `norm` is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Frequency of every FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # Corner frequencies of the triangles: n_mels + 2 points that are
    # uniformly spaced on the mel scale.
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    band_widths = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    for band in range(n_mels):
        # rising and falling edge of this triangle over all FFT bins
        rising = -ramps[band] / band_widths[band]
        falling = ramps[band + 2] / band_widths[band + 1]

        # the triangle is the smaller of the two edges, clipped at zero
        weights[band] = np.maximum(0, np.minimum(rising, falling))

    # Slaney-style mel is scaled to be approx constant energy per channel.
    # `norm` is guaranteed to be "slaney" at this point.
    enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
    weights *= enorm[:, np.newaxis]

    # A filter whose lower corner is above 0 Hz must have a non-zero weight.
    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
        # This means we have an empty channel somewhere
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
def stft(x: np.ndarray,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> np.ndarray:
    r"""Short-time Fourier transform (STFT).

    Args:
        x (np.ndarray): Input waveform in one dimension.
        n_fft (int, optional): FFT size. Defaults to 2048.
        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows;
            `win_length // 4` when None. Defaults to None.
        win_length (Optional[int], optional): The size of window; `n_fft` when None. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".

    Returns:
        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.

    Raises:
        ParameterError: If `center` is False and `n_fft` exceeds the length of `x`.
    """
    _check_audio(x)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = signal.get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = _pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            # The signal is shorter than one FFT frame: n_fft is too large.
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)

    elif n_fft > x.shape[-1]:
        # Without padding, not even a single frame fits into the signal.
        raise ParameterError(
            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")

    fft = np.fft  # use numpy fft as default

    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform the frames block by block to bound the temporary memory.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * x_frames[:, bl_s:bl_t], axis=0)

    return stft_matrix
def power_to_db(spect: np.ndarray,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The scaling `10 * log10(spect / ref)` is evaluated as the difference of two
    logarithms whose arguments are both floored at `amin`, so the logarithm of
    zero is never taken.

    Args:
        spect (np.ndarray): STFT power spectrogram of an input waveform. Complex input is accepted with a warning and replaced by its magnitude.
        ref (float, optional): The reference value, or a callable that computes it from the spectrogram. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold, must be strictly positive. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak; `None` disables the threshold. Defaults to 80.0.

    Returns:
        np.ndarray: Power spectrogram in db scale.

    Raises:
        ParameterError: If `amin` is not strictly positive or `top_db` is negative.
    """
    spect = np.asarray(spect)
    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    is_complex = np.issubdtype(spect.dtype, np.complexfloating)
    if is_complex:
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.")
    magnitude = np.abs(spect) if is_complex else spect

    # `ref` may be a callable that derives the reference power from the data.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    # Subtract the reference level in place.
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec
    if top_db < 0:
        raise ParameterError("top_db must be non-negative")
    # Clip everything that lies more than `top_db` below the peak.
    floor_db = log_spec.max() - top_db
    return np.maximum(log_spec, floor_db)
def mfcc(x: np.ndarray,
         sr: int=16000,
         spect: Optional[np.ndarray]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> np.ndarray:
    """Mel-frequency cepstral coefficients (MFCCs)

    Args:
        x (np.ndarray): Input waveform in one dimension. Only used when `spect` is None.
        sr (int, optional): Sample rate. Defaults to 16000.
        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. When None it is computed from `x` with `melspectrogram(x, sr=sr, **kwargs)`. Defaults to None.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
        norm (str, optional): Type of normalization. Defaults to "ortho".
        lifter (int, optional): Cepstral filtering. When positive, the n-th coefficient (n starting at 1) is scaled by `1 + (lifter / 2) * sin(pi * n / lifter)`; 0 disables liftering. Defaults to 0.

    Returns:
        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.

    Raises:
        ParameterError: If `lifter` is negative.
    """
    # Validate before doing any expensive spectral computation.
    if lifter < 0:
        raise ParameterError(
            f"MFCC lifter={lifter} must be a non-negative number")

    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
    if lifter == 0:
        return M

    # Sinusoidal liftering window. The `1 + (lifter / 2) * ...` offset keeps the
    # weights positive; a bare sine would zero out coefficients whose index is a
    # multiple of `lifter`. The window length follows the rows actually present
    # so the product broadcasts even if fewer than `n_mfcc` rows exist.
    order = np.arange(1, 1 + M.shape[0], dtype=M.dtype)
    factor = 1 + (lifter / 2) * np.sin(np.pi * order / lifter)
    return M * factor[:, np.newaxis]
def melspectrogram(x: np.ndarray,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
n_mels: int=64,
fmin: float=50.0,
fmax: Optional[float]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0,
to_db: bool=True,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> np.ndarray:
"""Compute mel-spectrogram.
Args:
x (np.ndarray): Input waveform in one dimension.
sr (int, optional): Sample rate. Defaults to 16000.
window_size (int, optional): Size of FFT and window length. Defaults to 512.
hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
n_mels (int, optional): Number of mel bins. Defaults to 64.
fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
to_db (bool, optional): Enable db scale. Defaults to True.
ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): Minimum threshold. Defaults to 1e-10.
top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
Returns:
np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
"""
_check_audio(x, mono=True)
if len(x) <= 0:
raise ParameterError('The input waveform is empty')
if fmax is None:
fmax = sr // 2
if fmin < 0 or fmin >= fmax:
raise ParameterError('fmin and fmax must satisfy 0 np.ndarray:
"""Compute spectrogram.
Args:
x (np.ndarray): Input waveform in one dimension.
sr (int, optional): Sample rate. Defaults to 16000.
window_size (int, optional): Size of FFT and window length. Defaults to 512.
hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
Returns:
np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
"""
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
return np.abs(s)**power
def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result is rounded to integer-valued levels in range `[0,mu]`. Otherwise, the resulting waveform is in range `[-1,1]`.

    Args:
        x (np.ndarray): The input waveform to encode, expected in range `[-1,1]`.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.

    Returns:
        np.ndarray: The mu-law encoded waveform.
    """
    # Honour the caller's `mu`; it used to be overwritten with 255 here.
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        # Map [-1, 1] onto the levels 0..mu, rounding to the nearest level.
        y = np.floor((y + 1) / 2 * mu + 0.5)
    return y
def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu]` when quantized is True and `[-1,1]` otherwise.

    This inverts the companding `sign(x) * log1p(mu * |x|) / log1p(mu)` (and,
    when `quantized` is True, the mapping of `[-1,1]` onto the levels `0..mu`),
    so the same `mu` must be used for encoding and decoding.

    Args:
        y (np.ndarray): The encoded waveform.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.

    Returns:
        np.ndarray: The mu-law decoded waveform.

    Raises:
        ParameterError: If `mu` is smaller than 1.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    # Use `mu` itself: decoding with `mu - 1` does not invert the companding
    # curve and divides by zero for mu == 1.
    if quantized:  # undo the quantization: levels 0..mu -> [-1, 1]
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
def _randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentation
"""
return int(np.random.randint(0, high=high))
def depth_augment(y: np.ndarray,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
    """Audio depth augmentation.

    Simulates quantization distortion: the waveform is converted to a randomly
    chosen bit depth and then converted back to its original dtype.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16'].
        probs (List[float], optional): Probabilities to depth conversion, one per entry of `choices`. Defaults to [0.5, 0.5].

    Returns:
        np.ndarray: The augmented waveform.
    """
    n_choices, n_probs = len(choices), len(probs)
    assert n_probs == n_choices, \
        'number of choices {} must be equal to size of probs {}'.format(
            n_choices, n_probs)

    target_depth = np.random.choice(choices, p=probs)
    original_dtype = y.dtype
    # Round trip through the lower depth, then restore the source dtype.
    degraded = depth_convert(y, target_depth)
    return depth_convert(degraded, original_dtype)
def adaptive_spect_augment(spect: np.ndarray,
                           tempo_axis: int=0,
                           level: float=0.1) -> np.ndarray:
    """Do adaptive spectrogram augmentation.

    Both the number of masks and their width scale with `level`, which ranges
    from 0 to 1; 0 means no augmentation. Masked bands are set to zero and the
    input array is modified in place.

    Args:
        spect (np.ndarray): Input spectrogram.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        level (float, optional): The level factor of masking. Defaults to 0.1.

    Returns:
        np.ndarray: The augmented spectrogram (the same array object as `spect`).
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    time_is_rows = tempo_axis == 0
    n_rows, n_cols = spect.shape
    nt, nf = (n_rows, n_cols) if time_is_rows else (n_cols, n_rows)

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    # Time masks are drawn first, then frequency masks.
    for _ in range(num_time_mask):
        begin = _randint(nt - time_mask_width)
        band = slice(begin, begin + time_mask_width)
        if time_is_rows:
            spect[band, :] = 0
        else:
            spect[:, band] = 0

    for _ in range(num_freq_mask):
        begin = _randint(nf - freq_mask_width)
        band = slice(begin, begin + freq_mask_width)
        if time_is_rows:
            spect[:, band] = 0
        else:
            spect[band, :] = 0

    return spect
def spect_augment(spect: np.ndarray,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> np.ndarray:
    """Do spectrogram augmentation in both time and freq axis.

    A random number of bands of one random width per axis is set to zero. The
    input array is modified in place. Sampled widths are clamped to the size of
    the spectrogram, so short inputs are masked with narrower bands instead of
    failing.

    Args:
        spect (np.ndarray): Input spectrogram.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        max_time_mask (int, optional): Exclusive upper bound of the number of time masks. Defaults to 3.
        max_freq_mask (int, optional): Exclusive upper bound of the number of frequency masks. Defaults to 3.
        max_time_mask_width (int, optional): Exclusive upper bound of the width of time masks. Defaults to 30.
        max_freq_mask_width (int, optional): Exclusive upper bound of the width of frequency masks. Defaults to 20.

    Returns:
        np.ndarray: The augmented spectrogram (the same array object as `spect`).
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    # The four draws keep their original order so seeded runs are unchanged
    # whenever the sampled widths already fit the spectrogram.
    num_time_mask = _randint(max_time_mask)
    num_freq_mask = _randint(max_freq_mask)
    time_mask_width = _randint(max_time_mask_width)
    freq_mask_width = _randint(max_freq_mask_width)

    # A width >= the axis length would make `_randint(n - width)` fail, so
    # clamp it to leave at least one valid start position.
    time_mask_width = max(0, min(time_mask_width, nt - 1))
    freq_mask_width = max(0, min(freq_mask_width, nf - 1))
    # Nothing to mask along an empty axis.
    if nt == 0:
        num_time_mask = 0
    if nf == 0:
        num_freq_mask = 0

    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = _randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = _randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = _randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = _randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0

    return spect
def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
    """Random cropping on a input waveform.

    Args:
        y (np.ndarray): Input waveform array in 1D.
        crop_len (int): Length of waveform to crop. Must not exceed `len(y)`.

    Returns:
        np.ndarray: The cropped waveform.

    Raises:
        ParameterError: If `y` is not one-dimensional.
    """
    if y.ndim != 1:
        # This used to be a bare string expression, so the check never fired.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    # Valid start positions are 0..n - crop_len inclusive; the `+ 1` also makes
    # crop_len == n (return the whole waveform) a legal request.
    idx = _randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
def random_crop2d(s: np.ndarray, crop_len: int,
                  tempo_axis: int=0) -> np.ndarray:
    """Random cropping on a spectrogram.

    Args:
        s (np.ndarray): Input spectrogram in 2D.
        crop_len (int): Length of spectrogram to crop along `tempo_axis`. Must not exceed the size of that axis.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.

    Returns:
        np.ndarray: The cropped spectrogram.

    Raises:
        ParameterError: If `tempo_axis` is not smaller than `s.ndim`.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start positions are 0..n - crop_len inclusive; the `+ 1` also makes
    # crop_len == n (keep the whole axis) a legal request.
    idx = _randint(high=n - crop_len + 1)
    # Take everything on the other axes and the window [idx, idx + crop_len)
    # on the tempo axis.
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
================================================
FILE: audio/paddleaudio/datasets/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .esc50 import ESC50
from .gtzan import GTZAN
from .hey_snips import HeySnips
from .rirs_noises import OpenRIRNoise
from .tess import TESS
from .urban_sound import UrbanSound8K
from .voxceleb import VoxCeleb
================================================
FILE: audio/paddleaudio/datasets/dataset.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..compliance.kaldi import fbank as kaldi_fbank
from ..compliance.kaldi import mfcc as kaldi_mfcc
from ..compliance.librosa import melspectrogram
from ..compliance.librosa import mfcc
# Maps a `feat_type` name to the function used to extract that feature from a
# loaded waveform. 'raw' maps to None: callers in this package then use the
# waveform itself as the feature.
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
'kaldi_fbank': kaldi_fbank,
'kaldi_mfcc': kaldi_mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Holds parallel lists of audio file paths and integer labels, and extracts
    the configured feature type from each file on access.
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 sample_rate: int=None,
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
            sample_rate (:obj:`int`, `optional`, defaults to `None`):
                If given, it is passed to the audio loader as `sr`; otherwise the loader is
                called without a target sample rate.
            **kwargs: Extra keyword arguments forwarded to the feature function.

        Raises:
            RuntimeError: If `feat_type` is not a key of `feat_funcs`.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs.keys():
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.sample_rate = sample_rate
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        """Collect files and labels; must be implemented by subclasses."""
        raise NotImplementedError

    def _convert_to_record(self, idx):
        """Load sample `idx` and return a dict with its 'feat' and 'label'."""
        file, label = self.files[idx], self.labels[idx]

        if self.sample_rate is None:
            waveform, sample_rate = load_audio(file)
        else:
            waveform, sample_rate = load_audio(file, sr=self.sample_rate)

        feat_func = feat_funcs[self.feat_type]

        record = {}
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
            # Fall back to the rate reported by the loader when no explicit
            # rate was configured; passing `sr=None` gave the kaldi feature
            # functions no usable sample rate.
            feat_sr = sample_rate if self.sample_rate is None else self.sample_rate
            record['feat'] = feat_func(
                waveform=waveform, sr=feat_sr, **self.feat_config)
        else:
            record['feat'] = feat_func(
                waveform, sample_rate,
                **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        """Return the sample at `idx`.

        For kaldi features the result is `(key, feat, label)`; otherwise it is
        `(feat transposed as np.ndarray, label as np.int64 array)`.
        """
        record = self._convert_to_record(idx)
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            # `keys` is only set by subclasses that carry utterance ids. Use
            # the file path as the key otherwise, instead of raising
            # AttributeError.
            keys = getattr(self, 'keys', None)
            key = keys[idx] if keys is not None else self.files[idx]
            return key, record['feat'], record['label']
        else:
            return np.array(record['feat']).transpose(), np.array(
                record['label'], dtype=np.int64)

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.files)
================================================
FILE: audio/paddleaudio/datasets/esc50.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
    """
    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
    suitable for benchmarking methods of environmental sound classification. The dataset
    consists of 5-second-long recordings organized into 50 semantical classes (with
    40 examples per class)

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    # Archive(s) fetched when the data is not found under DATA_HOME.
    archives = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    label_list = [
        # Animals
        'Dog',
        'Rooster',
        'Pig',
        'Cow',
        'Frog',
        'Cat',
        'Hen',
        'Insects (flying)',
        'Sheep',
        'Crow',
        # Natural soundscapes & water sounds
        'Rain',
        'Sea waves',
        'Crackling fire',
        'Crickets',
        'Chirping birds',
        'Water drops',
        'Wind',
        'Pouring water',
        'Toilet flush',
        'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby',
        'Sneezing',
        'Clapping',
        'Breathing',
        'Coughing',
        'Footsteps',
        'Laughing',
        'Brushing teeth',
        'Snoring',
        'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock',
        'Mouse click',
        'Keyboard typing',
        'Door, wood creaks',
        'Can opening',
        'Washing machine',
        'Vacuum cleaner',
        'Clock alarm',
        'Clock tick',
        'Glass breaking',
        # Exterior/urban noises
        'Helicopter',
        'Chainsaw',
        'Siren',
        'Car horn',
        'Engine',
        'Train',
        'Church bells',
        'Airplane',
        'Fireworks',
        'Hand saw',
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specify the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the metadata csv (skipping its header line) into META_INFO tuples."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            rows = rf.readlines()[1:]
        return [self.meta_info(*row.strip().split(',')) for row in rows]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Select files and labels for `mode`, downloading the archive if needed.

        With mode 'train' every fold except `split` is used; any other mode
        selects only fold `split`.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not os.path.isdir(audio_dir) or not os.path.isfile(meta_file):
            download_and_decompress(self.archives, DATA_HOME)

        want_train = mode == 'train'
        files = []
        labels = []
        for sample in self._get_meta_info():
            in_dev_fold = int(sample.fold) == split
            # train keeps the non-dev folds, dev keeps the dev fold.
            if want_train != in_dev_fold:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, sample.filename))
                labels.append(int(sample.target))

        return files, labels
================================================
FILE: audio/paddleaudio/datasets/gtzan.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
    """
    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
    in machine listening research for music genre recognition (MGR).

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    # Archive(s) fetched when the data is not found under DATA_HOME.
    archives = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specify the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the tab-separated `input.mf` listing into META_INFO tuples."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines():
                ret.append(self.meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Select files and labels for `mode`, downloading the archive if needed.

        Samples are shuffled with `seed` and cut into consecutive folds numbered
        from 1. With mode 'train' every fold except `split` is used; any other
        mode selects only fold `split`.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archives, DATA_HOME)

        meta_info = self._get_meta_info()
        # Shuffle with a private generator: the same seed yields the same
        # permutation as `random.seed(seed); random.shuffle(...)`, so train and
        # dev stay consistent, but the process-wide `random` state is no longer
        # reset as a side effect of building the dataset.
        random.Random(seed).shuffle(meta_info)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        want_train = mode == 'train'
        for idx, sample in enumerate(meta_info):
            file_path, label = sample
            filename = os.path.basename(file_path)
            target = self.label_list.index(label)
            fold = idx // n_samples_per_fold + 1

            # train keeps the non-dev folds, dev keeps the dev fold.
            if want_train != (int(fold) == split):
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

        return files, labels
================================================
FILE: audio/paddleaudio/datasets/hey_snips.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os
from typing import List
from typing import Tuple
from .dataset import AudioClassificationDataset
__all__ = ['HeySnips']
class HeySnips(AudioClassificationDataset):
    """Keyword dataset described by `<mode>.json` metadata files in `data_dir`.

    Every metadata entry with a positive duration becomes one sample. Hotword
    entries get label 0 and all other entries get label -1.
    """

    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str='train',
                 feat_type: str='kaldi_fbank',
                 sample_rate: int=16000,
                 **kwargs):
        """
        Args:
            data_dir (:obj:`os.PathLike`):
                Directory holding the `<mode>.json` metadata and the audio files it refers to.
            mode (:obj:`str`, `optional`, defaults to `train`):
                Name of the metadata file (without the `.json` suffix) to load.
            feat_type (:obj:`str`, `optional`, defaults to `kaldi_fbank`):
                It identifies the feature type that user wants to extract of an audio file.
            sample_rate (:obj:`int`, `optional`, defaults to 16000):
                Sample rate handed to the base dataset.
        """
        self.data_dir = data_dir
        # `_get_data` also fills `self.keys` and `self.durations`.
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
        """Read `<mode>.json` and build one META_INFO per entry with positive duration."""
        meta_file = os.path.join(self.data_dir, '{}.json'.format(mode))
        entries = []
        with open(meta_file, 'r') as f:
            for item in json.load(f):
                if item['duration'] > 0:
                    entries.append(
                        self.meta_info(
                            key=item['id'],
                            label=0 if item['is_hotword'] == 1 else -1,
                            duration=item['duration'],
                            wav=os.path.join(self.data_dir,
                                             item['audio_file_path'])))
        return entries

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return (files, labels) and record per-sample keys and durations on self."""
        samples = self._get_meta_info(mode)

        self.keys = [sample.key for sample in samples]
        self.durations = [float(sample.duration) for sample in samples]
        files = [sample.wav for sample in samples]
        labels = [int(sample.label) for sample in samples]

        return files, labels
================================================
FILE: audio/paddleaudio/datasets/rirs_noises.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import os
import random
from typing import List
from paddle.io import Dataset
from tqdm import tqdm
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..backends.soundfile_backend import soundfile_save as save_wav
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['OpenRIRNoise']
class OpenRIRNoise(Dataset):
archives = [
{
'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
'md5': 'e6f48e257286e05de56413b4779d8ffb',
},
]
sample_rate = 16000
meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
base_path = os.path.join(DATA_HOME, 'open_rir_noise')
wav_path = os.path.join(base_path, 'RIRS_NOISES')
csv_path = os.path.join(base_path, 'csv')
subsets = ['rir', 'noise']
def __init__(self,
subset: str='rir',
feat_type: str='raw',
target_dir=None,
random_chunk: bool=True,
chunk_duration: float=3.0,
seed: int=0,
**kwargs):
assert subset in self.subsets, \
'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self.random_chunk = random_chunk
self.chunk_duration = chunk_duration
OpenRIRNoise.csv_path = os.path.join(
target_dir, "open_rir_noise",
"csv") if target_dir else self.csv_path
self._data = self._get_data()
super(OpenRIRNoise, self).__init__()
# Set up a seed to reproduce training or predicting result.
# random.seed(seed)
def _get_data(self):
# Download audio files.
print(f"rirs noises base path: {self.base_path}")
if not os.path.isdir(self.base_path):
download_and_decompress(
self.archives, self.base_path, decompress=True)
else:
print(
f"{self.base_path} already exists, we will not download and decompress again"
)
# Data preparation.
print(f"prepare the csv to {self.csv_path}")
if not os.path.isdir(self.csv_path):
os.makedirs(self.csv_path)
self.prepare_data()
data = []
with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
for line in rf.readlines()[1:]:
audio_id, duration, wav = line.strip().split(',')
data.append(self.meta_info(audio_id, float(duration), wav))
random.shuffle(data)
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(record['wav'])
assert self.feat_type in feat_funcs.keys(), \
f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sr=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat})
return record
@staticmethod
def _get_chunks(seg_dur, audio_id, audio_duration):
num_chunks = int(audio_duration / seg_dur) # all in milliseconds
chunk_lst = [
audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
for i in range(num_chunks)
]
return chunk_lst
def _get_audio_info(self, wav_file: str,
split_chunks: bool) -> List[List[str]]:
waveform, sr = load_audio(wav_file)
audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
audio_duration = waveform.shape[0] / sr
ret = []
if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds.
uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
audio_duration)
for idx, chunk in enumerate(uniq_chunks_list):
s, e = chunk.split("_")[-2:] # Timestamps of start and end
start_sample = int(float(s) * sr)
end_sample = int(float(e) * sr)
new_wav_file = os.path.join(self.base_path,
audio_id + f'_chunk_{idx+1:02}.wav')
save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
# id, duration, new_wav
ret.append([chunk, self.chunk_duration, new_wav_file])
else: # Keep whole audio.
ret.append([audio_id, audio_duration, wav_file])
return ret
def generate_csv(self,
wav_files: List[str],
output_file: str,
split_chunks: bool=True):
print(f'Generating csv: {output_file}')
header = ["id", "duration", "wav"]
infos = list(
tqdm(
map(self._get_audio_info, wav_files, [split_chunks] * len(
wav_files)),
total=len(wav_files)))
csv_lines = []
for info in infos:
csv_lines.extend(info)
with open(output_file, mode="w") as csv_f:
csv_writer = csv.writer(
csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(header)
for line in csv_lines:
csv_writer.writerow(line)
def prepare_data(self):
rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
"rir_list")
rir_files = []
with open(rir_list, 'r') as f:
for line in f.readlines():
rir_file = line.strip().split(' ')[-1]
rir_files.append(os.path.join(self.base_path, rir_file))
noise_list = os.path.join(self.wav_path, "pointsource_noises",
"noise_list")
noise_files = []
with open(noise_list, 'r') as f:
for line in f.readlines():
noise_file = line.strip().split(' ')[-1]
noise_files.append(os.path.join(self.base_path, noise_file))
self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
def __getitem__(self, idx):
return self._convert_to_record(idx)
def __len__(self):
    """Return the number of entries stored in `self._data`."""
    num_records = len(self._data)
    return num_records
================================================
FILE: audio/paddleaudio/datasets/tess.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
class TESS(AudioClassificationDataset):
    """
    TESS is a set of 200 target words that were spoken in the carrier phrase
    "Say the word _____" by two actresses (aged 26 and 64 years) and
    recordings were made of the set portraying each of seven emotions (anger,
    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
    There are 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archives = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    # Fields are the three '_'-separated parts of a wav file's base name.
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the (1-based) fold used as dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Parse `speaker_word_emotion` out of every file's base name."""
        ret = []
        for file in files:
            # Drop the 4-character extension (".wav") before splitting.
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('_')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Download the data if needed and select the files of the requested mode.

        Args:
            mode: 'train' selects every fold except `split`; any other value
                selects only fold `split`.
            seed: Seed of the shuffle applied before folding.
            n_folds: Number of folds.
            split: 1-based index of the dev fold.

        Returns:
            Tuple[List[str], List[int]]: File paths and the matching indices
            into `label_list`.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
            download_and_decompress(self.archives, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        # os.walk yields entries in a filesystem-dependent order. Sort first so
        # that a given seed always produces the same permutation, which is what
        # keeps the train and dev datasets disjoint.
        wav_files.sort()
        random.seed(seed)  # shuffle samples to split data
        random.shuffle(wav_files)
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        # At least one sample per fold, so tiny datasets do not divide by zero.
        n_samples_per_fold = max(1, len(meta_info) // n_folds)
        for idx, sample in enumerate(meta_info):
            _, _, emotion = sample
            target = self.label_list.index(emotion)
            # Samples left over by the integer division join the last fold
            # instead of landing in a non-existent fold `n_folds + 1`.
            fold = min(idx // n_samples_per_fold + 1, n_folds)
            in_dev_fold = fold == split
            # train -> everything outside the dev fold; otherwise -> the dev fold.
            if (mode == 'train') != in_dev_fold:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
================================================
FILE: audio/paddleaudio/datasets/urban_sound.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archives = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    # One field per column of the metadata csv.
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Read the metadata csv (header skipped) into a list of `meta_info` tuples."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Download the data if needed and select the files of the requested mode.

        Args:
            mode (str): 'train' selects every fold except `split`; any other
                value selects only fold `split`.
            split (int): Fold used as dev dataset.

        Returns:
            Tuple[List[str], List[int]]: File paths and their integer class ids.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archives, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, _, _, _, _, fold, target, _ = sample
            in_dev_fold = int(fold) == split
            # train -> everything outside the dev fold; otherwise -> the dev fold.
            if (mode == 'train') != in_dev_fold:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

        return files, labels
================================================
FILE: audio/paddleaudio/datasets/voxceleb.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import glob
import os
import random
from multiprocessing import cpu_count
from typing import List
from paddle.io import Dataset
from pathos.multiprocessing import Pool
from tqdm import tqdm
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..utils import DATA_HOME
from ..utils import decompress
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['VoxCeleb']
class VoxCeleb(Dataset):
    """VoxCeleb1 speaker dataset, optionally extended with a wav-converted VoxCeleb2.

    When the expected directories are missing, the audio archives and the
    verification trial list are downloaded and csv manifests for the `train`,
    `dev`, `enroll` and `test` subsets are generated. The manifest of the
    requested subset is then loaded into memory.
    """

    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
    archives_audio_dev = [
        {
            'url': source_url + 'vox1_dev_wav_partaa',
            'md5': 'e395d020928bc15670b570a21695ed96',
        },
        {
            'url': source_url + 'vox1_dev_wav_partab',
            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
        },
        {
            'url': source_url + 'vox1_dev_wav_partac',
            'md5': '017d579a2a96a077f40042ec33e51512',
        },
        {
            'url': source_url + 'vox1_dev_wav_partad',
            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
        },
    ]
    archives_audio_test = [
        {
            'url': source_url + 'vox1_test_wav.zip',
            'md5': '185fdc63c3c739954633d50379a3d102',
        },
    ]
    archives_meta = [
        {
            'url':
            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
            'md5':
            'b73110731c9223c1461fe49cb48dddfc',
        },
    ]

    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
    sample_rate = 16000
    meta_info = collections.namedtuple(
        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
    base_path = os.path.join(DATA_HOME, 'vox1')
    wav_path = os.path.join(base_path, 'wav')
    meta_path = os.path.join(base_path, 'meta')
    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['train', 'dev', 'enroll', 'test']

    def __init__(
            self,
            subset: str='train',
            feat_type: str='raw',
            random_chunk: bool=True,
            chunk_duration: float=3.0,  # seconds
            split_ratio: float=0.9,  # train split ratio
            seed: int=0,
            target_dir: str=None,
            vox2_base_path=None,
            **kwargs):
        """VoxCeleb data prepare and get the specific dataset audio info

        Args:
            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
            feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.
            random_chunk (bool, optional): random select a duration from audio. Defaults to True.
            chunk_duration (float, optional): chunk duration in seconds, used for random chunks and for csv generation. Defaults to 3.0.
            split_ratio (float, optional): fraction of the non-test audio files assigned to the train csv; the rest goes to dev. Defaults to 0.9.
            seed (int, optional): currently unused, the seeding call below is commented out. Defaults to 0.
            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
            vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
            **kwargs: extra keyword arguments forwarded to the feature function.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.spk_id2label = {}
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration
        self.split_ratio = split_ratio
        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
        self.vox2_base_path = vox2_base_path

        # if we set the target dir, we will change the vox data info data from base path to target dir
        # NOTE: these are class attributes, so the change is visible to every instance.
        VoxCeleb.csv_path = os.path.join(
            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
        VoxCeleb.meta_path = os.path.join(
            target_dir, "voxceleb",
            'meta') if target_dir else VoxCeleb.meta_path
        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
                                               'veri_test2.txt')
        # self._data = self._get_data()[:1000] # KP: Small dataset test.
        self._data = self._get_data()
        super(VoxCeleb, self).__init__()

        # Set up a seed to reproduce training or predicting result.
        # random.seed(seed)

    def _get_data(self):
        """Make sure audio, meta files and csv manifests exist, then load the subset.

        Returns:
            list: One `meta_info` tuple per row of `<subset>.csv`.
        """
        # Download audio files.
        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
        # so, we check the vox1/wav dir status
        print(f"wav base path: {self.wav_path}")
        if not os.path.isdir(self.wav_path):
            print("start to download the voxceleb1 dataset")
            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                self.archives_audio_dev,
                self.base_path,
                decompress=False)
            download_and_decompress(  # download the vox1_test_wav.zip and unzip
                self.archives_audio_test,
                self.base_path,
                decompress=True)

            # Download all parts and concatenate the files into one zip file.
            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
            print(f'Concatenating all parts to: {dev_zipfile}')
            # Done in Python rather than through a shell `cat` so that it works
            # without a POSIX shell and with paths that contain spaces.
            part_files = sorted(
                glob.glob(os.path.join(self.base_path, 'vox1_dev_wav_parta*')))
            with open(dev_zipfile, 'wb') as zip_f:
                for part_file in part_files:
                    with open(part_file, 'rb') as part_f:
                        # Copy in 1 MiB blocks to keep memory usage bounded.
                        for block in iter(lambda: part_f.read(1 << 20), b''):
                            zip_f.write(block)

            # Extract all audio files of dev and test set.
            decompress(dev_zipfile, self.base_path)

        # Download meta files.
        if not os.path.isdir(self.meta_path):
            print("prepare the meta data")
            download_and_decompress(
                self.archives_meta, self.meta_path, decompress=False)

        # Data preparation.
        if not os.path.isdir(self.csv_path):
            os.makedirs(self.csv_path)
            self.prepare_data()

        data = []
        csv_file = os.path.join(self.csv_path, f'{self.subset}.csv')
        print(f"read the {self.subset} from {csv_file}")
        # Parse with the csv module so that fields quoted by `generate_csv`
        # (e.g. a path containing a comma) are read back correctly.
        with open(csv_file, 'r', newline='') as rf:
            reader = csv.reader(rf, delimiter=",", quotechar='"')
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav, start, stop, spk_id = row
                data.append(
                    self.meta_info(audio_id,
                                   float(duration), wav,
                                   int(start), int(stop), spk_id))

        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
            for line in f.readlines():
                spk_id, label = line.strip().split(' ')
                self.spk_id2label[spk_id] = int(label)

        return data

    def _convert_to_record(self, idx: int):
        """Load the audio of sample `idx` and build its record dict.

        The record holds every `meta_info` field plus `feat`, and `label` for
        the train and dev subsets.
        """
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(record['wav'])

        # random select a chunk audio samples from the audio
        if self.random_chunk:
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.chunk_duration * sr)
            # Clamp the upper bound: audio that is not longer than the chunk
            # would otherwise make randint raise on an empty range.
            max_start = max(0, num_wav_samples - num_chunk_samples - 1)
            start = random.randint(0, max_start)
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        waveform = waveform[start:stop]

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # A falsy entry in `feat_funcs` means the waveform itself is the feature.
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        if self.subset in ['train',
                           'dev']:  # Labels are available in train and dev.
            record.update({'label': self.spk_id2label[record['spk_id']]})

        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Name the consecutive full chunks of an audio as `<audio_id>_<start>_<end>`.

        `seg_dur` and `audio_duration` share one time unit (callers pass
        seconds); a trailing remainder shorter than `seg_dur` is dropped.
        """
        num_chunks = int(audio_duration / seg_dur)
        chunk_lst = [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]
        return chunk_lst

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Build the csv rows (id, duration, wav, start, stop, spk_id) of one file.

        Args:
            wav_file (str): '/'-separated path ending in `<spk_id>/<sess_id>/<utt>.wav`.
            split_chunks (bool): If True emit one row per `chunk_duration`
                chunk, otherwise a single row spanning the whole audio.
        """
        waveform, sr = load_audio(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr

        ret = []
        if split_chunks:  # Split into pieces of self.chunk_duration seconds.
            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                                audio_duration)

            for chunk in uniq_chunks_list:
                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                # id, duration, wav, start, stop, spk_id
                ret.append([
                    chunk, audio_duration, wav_file, start_sample, end_sample,
                    spk_id
                ])
        else:  # Keep whole audio.
            ret.append([
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
            ])
        return ret

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
        """Collect the audio info of every wav file in parallel and write a csv.

        Args:
            wav_files (List[str]): Paths of the audio files to describe.
            output_file (str): Path of the csv file to create.
            split_chunks (bool, optional): Forwarded to `_get_audio_info`. Defaults to True.
        """
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        # Note: this may occurs c++ exception, but the program will execute fine
        # so we can ignore the exception
        with Pool(cpu_count()) as p:
            infos = list(
                tqdm(
                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
                           wav_files),
                    total=len(wav_files)))

        csv_lines = []
        for info in infos:
            csv_lines.extend(info)

        # newline="" lets the csv module control line endings itself and
        # avoids "\r\r\n" terminators on Windows.
        with open(output_file, mode="w", newline="") as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            for line in csv_lines:
                csv_writer.writerow(line)

    def prepare_data(self):
        """Generate `spk_id2label.txt` and the train/dev/enroll/test csv manifests."""
        # Audio of speakers in veri_test_file should not be included in training set.
        print("start to prepare the data csv file")
        enroll_files = set()
        test_files = set()
        # get the enroll and test audio file path
        with open(self.veri_test_file, 'r') as f:
            for line in f.readlines():
                _, enrol_file, test_file = line.strip().split(' ')
                enroll_files.add(os.path.join(self.wav_path, enrol_file))
                test_files.add(os.path.join(self.wav_path, test_file))
            enroll_files = sorted(enroll_files)
            test_files = sorted(test_files)

        # get the enroll and test speakers
        test_spks = set()
        for file in (enroll_files + test_files):
            # The speaker id is the first path component after '/wav/'.
            spk = file.split('/wav/')[1].split('/')[0]
            test_spks.add(spk)

        # get all the train and dev audios file path
        audio_files = []
        speakers = set()
        print("Getting file list...")
        for path in [self.wav_path, self.vox2_base_path]:
            # if vox2 directory is not set and vox2 is not a directory
            # we will not process this directory
            if not path or not os.path.exists(path):
                print(f"{path} is an invalid path, please check again, "
                      "and we will ignore the vox2 base path")
                continue
            for file in glob.glob(
                    os.path.join(path, "**", "*.wav"), recursive=True):
                spk = file.split('/wav/')[1].split('/')[0]
                if spk in test_spks:
                    continue
                speakers.add(spk)
                audio_files.append(file)

        print(
            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
        )
        # encode the train and dev speakers label to spk_id2label.txt
        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
            for label, spk_id in enumerate(
                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
                f.write(f'{spk_id} {label}\n')

        # Sorted first so the shuffle only depends on the state of the global
        # `random` generator, which is not seeded here.
        audio_files = sorted(audio_files)
        random.shuffle(audio_files)
        split_idx = int(self.split_ratio * len(audio_files))
        # split_ratio to train
        train_files, dev_files = audio_files[:split_idx], audio_files[
            split_idx:]

        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))

        self.generate_csv(
            enroll_files,
            os.path.join(self.csv_path, 'enroll.csv'),
            split_chunks=False)
        self.generate_csv(
            test_files,
            os.path.join(self.csv_path, 'test.csv'),
            split_chunks=False)

    def __getitem__(self, idx):
        return self._convert_to_record(idx)

    def __len__(self):
        return len(self._data)
================================================
FILE: audio/paddleaudio/features/__init__.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layers import LogMelSpectrogram
from .layers import MelSpectrogram
from .layers import MFCC
from .layers import Spectrogram
================================================
FILE: audio/paddleaudio/features/layers.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Optional
from typing import Union
import paddle
import paddle.nn as nn
from paddle import Tensor
from ..functional import compute_fbank_matrix
from ..functional import create_dct
from ..functional import power_to_db
from ..functional.window import get_window
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]
class Spectrogram(nn.Layer):
    r"""Layer that turns waveforms into spectrograms.

    The output is the magnitude of the short-time Fourier transform raised to `power`.

    Args:
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Must be positive. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str='float32') -> None:
        super().__init__()

        assert power > 0, 'Power of spectrogram must be > 0.'
        self.power = power

        # The analysis window spans the whole FFT unless a length is given.
        win_length = n_fft if win_length is None else win_length

        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)

        # Freeze every STFT setting so `forward` only has to supply the signal.
        stft_settings = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_settings)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
        """
        magnitude = paddle.abs(self._stft(x))
        return paddle.pow(magnitude, self.power)
class MelSpectrogram(nn.Layer):
    r"""Layer that turns waveforms into mel spectrograms.

    A spectrogram is computed first and then multiplied by a mel filter bank matrix.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. If `None`, `sr // 2` is used for the filter bank. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str='float32') -> None:
        super().__init__()

        spectrogram_settings = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)
        self._spectrogram = Spectrogram(**spectrogram_settings)

        # The attributes keep the values exactly as passed in (f_max may be None).
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm

        # Only the filter bank falls back to sr // 2 when no f_max is given.
        upper_freq = sr // 2 if f_max is None else f_max
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=upper_freq,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        return paddle.matmul(self.fbank_matrix, self._spectrogram(x))
class LogMelSpectrogram(nn.Layer):
    r"""Layer that turns waveforms into log-mel spectrograms.

    A mel spectrogram is computed first and then converted with `power_to_db`.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super().__init__()

        # Everything except the dB conversion settings belongs to the mel layer.
        mel_settings = dict(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self._melspectrogram = MelSpectrogram(**mel_settings)

        # Settings of the power -> dB conversion applied in `forward`.
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        return power_to_db(
            self._melspectrogram(x),
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
class MFCC(nn.Layer):
"""Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
Args:
sr (int, optional): Sample rate. Defaults to 22050.
n_mfcc (int, optional): [description]. Defaults to 40.
n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
def __init__(self,
sr: int=22050,
n_mfcc: int=40,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
power: float=2.0,
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None,
dtype: str=paddle.float32) -> None:
super(MFCC, self).__init__()
assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
n_mfcc, n_mels)
self._log_melspectrogram = LogMelSpectrogram(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
power=power,
center=center,
pad_mode=pad_mode,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
ref_value=ref_value,
amin=amin,
top_db=top_db,
dtype=dtype)
self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
self.register_buffer('dct_matrix', self.dct_matrix)
def forward(self, x: Tensor) -> Tensor:
    """Compute MFCCs from a batch of waveforms.

    Args:
        x (Tensor): Tensor of waveforms with shape `(N, T)`

    Returns:
        Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
    """
    log_mel = self._log_melspectrogram(x)
    # Put the mel axis last so the matmul contracts it against the DCT matrix.
    frames_major = log_mel.transpose((0, 2, 1))
    coefficients = paddle.matmul(frames_major, self.dct_matrix)
    # Swap the last two axes back: (N, n_mfcc, num_frames).
    return coefficients.transpose((0, 2, 1))
================================================
FILE: audio/paddleaudio/functional/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db
================================================
FILE: audio/paddleaudio/functional/functional.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import math
from typing import Optional
from typing import Union
import paddle
from paddle import Tensor
# Names exported by `from ... import *`; each one is a function defined below.
__all__ = [
    'hz_to_mel',
    'mel_to_hz',
    'mel_frequencies',
    'fft_frequencies',
    'compute_fbank_matrix',
    'power_to_db',
    'create_dct',
]
def hz_to_mel(freq: Union[Tensor, float],
              htk: bool=False) -> Union[Tensor, float]:
    """Map frequencies in Hz onto the mel scale.

    With ``htk=True`` the closed form ``2595 * log10(1 + f / 700)`` is used.
    Otherwise the scale is linear up to 1000 Hz and logarithmic above it.

    Args:
        freq (Union[Tensor, float]): The input tensor with arbitrary shape.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[Tensor, float]: Frequency in mels.
    """
    is_tensor = isinstance(freq, Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Linear segment of the scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    linear_mels = (freq - f_min) / f_sp

    # Parameters of the logarithmic segment.
    knee_hz = 1000.0  # frequency where the log region starts
    knee_mel = (knee_hz - f_min) / f_sp  # the same point in mels
    log_step = math.log(6.4) / 27.0  # step size inside the log region

    if not is_tensor:
        if freq >= knee_hz:
            return knee_mel + math.log(freq / knee_hz + 1e-10) / log_step
        return linear_mels

    # The 1e-10 offset keeps log() finite for zero-valued entries.
    log_mels = knee_mel + paddle.log(freq / knee_hz + 1e-10) / log_step
    # Blend both segments with a 0/1 mask (stand-in for a masked_fill op).
    in_log_region = (freq > knee_hz).astype(freq.dtype)
    return log_mels * in_log_region + linear_mels * (1 - in_log_region)
def mel_to_hz(mel: Union[float, Tensor],
              htk: bool=False) -> Union[float, Tensor]:
    """Map mel values back to frequencies in Hz.

    With ``htk=True`` the closed form ``700 * (10 ** (mel / 2595) - 1)`` is
    used. Otherwise the mapping is linear up to the mel value that corresponds
    to 1000 Hz and exponential above it.

    Args:
        mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[float, Tensor]: Frequencies in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Linear segment of the scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    linear_hz = f_min + f_sp * mel

    # Parameters of the exponential segment.
    knee_hz = 1000.0  # frequency where the log region starts
    knee_mel = (knee_hz - f_min) / f_sp  # the same point in mels
    log_step = math.log(6.4) / 27.0  # step size inside the log region

    if not isinstance(mel, Tensor):
        if mel >= knee_mel:
            return knee_hz * math.exp(log_step * (mel - knee_mel))
        return linear_hz

    exp_hz = knee_hz * paddle.exp(log_step * (mel - knee_mel))
    # Blend both segments with a 0/1 mask (stand-in for a masked_fill op).
    in_log_region = (mel > knee_mel).astype(mel.dtype)
    return exp_hz * in_log_region + linear_hz * (1 - in_log_region)
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str='float32') -> Tensor:
    """Return frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Convert both limits to mels, space points uniformly there, convert back.
    mel_low, mel_high = (hz_to_mel(limit, htk=htk) for limit in (f_min, f_max))
    mel_points = paddle.linspace(mel_low, mel_high, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
    """Return the frequencies of the FFT bins, from 0 up to ``sr / 2``.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
    """
    highest_freq = float(sr) / 2
    num_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, highest_freq, num_bins, dtype=dtype)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str='float32') -> Tensor:
    """Build the matrix of triangular mel filters applied to FFT bins.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. ``None`` means ``sr / 2``. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (Union[str, float], optional): Type of normalization. 'slaney' scales each filter by
            2 / (upper edge - lower edge); a number applies p-norm normalization along the frequency
            axis; any other value leaves the filters unscaled. Defaults to 'slaney'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
    """
    if f_max is None:
        f_max = float(sr) / 2

    num_fft_bins = int(1 + n_fft // 2)
    weights = paddle.zeros((n_mels, num_fft_bins), dtype=dtype)

    # Frequency of every FFT bin.
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # n_mels + 2 points: each filter uses three consecutive points
    # (lower edge, peak, upper edge).
    mel_edges = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    edge_gaps = mel_edges[1:] - mel_edges[:-1]
    # offsets[j, k] = mel_edges[j] - fftfreqs[k]
    offsets = mel_edges.unsqueeze(1) - fftfreqs.unsqueeze(0)

    for band in range(n_mels):
        # Rising and falling slopes of this triangle, over all FFT bins.
        rising = -offsets[band] / edge_gaps[band]
        falling = offsets[band + 2] / edge_gaps[band + 1]
        # The triangle is the lower of the two slopes, floored at zero.
        triangle = paddle.minimum(rising, falling)
        weights[band] = paddle.maximum(paddle.zeros_like(rising), triangle)

    if norm == 'slaney':
        # Scale every filter by 2 / (upper edge - lower edge).
        band_scale = 2.0 / (mel_edges[2:n_mels + 2] - mel_edges[:n_mels])
        weights *= band_scale.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
def power_to_db(spect: Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.

    Args:
        spect (Tensor): STFT power spectrogram.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        Tensor: Power spectrogram in db scale.

    Raises:
        ValueError: If `amin` or `ref_value` is not strictly positive, or if
            `top_db` is negative.
    """
    # Validate every argument before doing any tensor work. ValueError is a
    # subclass of Exception, so callers that catch Exception still work.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")
    if ref_value <= 0:
        raise ValueError("ref_value must be strictly positive")
    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    ones = paddle.ones_like(spect)
    # Floor the input at amin so log10 never sees zero.
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
    # Subtract the reference level (also floored at amin).
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is not None:
        # Clamp everything to at most top_db below the global maximum.
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
    return log_spec
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: str='float32') -> Tensor:
    """Build a discrete cosine transform (DCT) matrix.

    Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type, either 'ortho' or None. Defaults to 'ortho'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coef_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    # basis[k, n] = cos(pi / n_mels * (n + 0.5) * k), shape (n_mfcc, n_mels)
    basis = paddle.cos(math.pi / float(n_mels) * (mel_index + 0.5) *
                       coef_index)

    if norm is not None:
        assert norm == "ortho"
        # The first row gets an extra 1/sqrt(2) on top of the common scale.
        basis[0] *= 1.0 / math.sqrt(2.0)
        basis *= math.sqrt(2.0 / float(n_mels))
    else:
        basis *= 2.0

    return basis.T
================================================
FILE: audio/paddleaudio/functional/window.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import math
from typing import List
from typing import Tuple
from typing import Union
import paddle
from paddle import Tensor
class WindowFunctionRegister(object):
    """Registry that maps a function's ``__name__`` to the function itself."""

    def __init__(self):
        self._functions_dict = {}

    def register(self):
        """Return a decorator that stores the decorated function by name.

        The decorated function is returned unchanged, so it stays callable
        under its own name.
        """

        def _remember(fn):
            self._functions_dict[fn.__name__] = fn
            return fn

        return _remember

    def get(self, name):
        """Return the function registered as ``name``; KeyError if absent."""
        return self._functions_dict[name]


# Registry instance shared by this module.
window_function_register = WindowFunctionRegister()
@window_function_register.register()
def _cat(x: List[Tensor], data_type: str) -> Tensor:
    """Convert every item to a tensor of ``data_type`` and concatenate them.

    Args:
        x (List[Tensor]): Items to join; each is passed through
            ``paddle.to_tensor``.
        data_type (str): Data type given to ``paddle.to_tensor``.

    Returns:
        Tensor: The concatenation of all converted items.
    """
    pieces = []
    for item in x:
        piece = paddle.to_tensor(item, data_type)
        # NOTE(review): on Paddle builds with 0-D tensor support a Python
        # scalar converts to a 0-D tensor, which presumably cannot be
        # concatenated with 1-D pieces; promote it to shape [1]. This is a
        # no-op where scalars already convert to shape [1].
        if piece.ndim == 0:
            piece = piece.reshape([1])
        pieces.append(piece)
    return paddle.concat(pieces)
@window_function_register.register()
def _acosh(x: Union[Tensor, float]) -> Union[Tensor, float]:
    """Inverse hyperbolic cosine, ``log(x + sqrt(x**2 - 1))``.

    Args:
        x (Union[Tensor, float]): A Python number or a tensor.

    Returns:
        Union[Tensor, float]: A Python float for Python numbers (``math``
        path), otherwise a tensor (``paddle`` path).
    """
    # Plain ints take the scalar path too (e.g. an integral power of ten);
    # the paddle ops below are meant for tensors.
    if isinstance(x, (int, float)):
        return math.log(x + math.sqrt(x**2 - 1))
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
@window_function_register.register()
def _extend(M: int, sym: bool) -> Tuple[int, bool]:
    """Extend window by 1 sample if needed for DFT-even symmetry.

    Returns:
        Tuple[int, bool]: ``(M + 1, True)`` when ``sym`` is false, otherwise
        ``(M, False)``. The flag is the value callers in this module pass on
        to ``_truncate``.
    """
    if not sym:
        return M + 1, True
    else:
        return M, False
@window_function_register.register()
def _len_guards(M: int) -> bool:
    """Validate a window length and flag the trivial lengths.

    Returns True when ``M <= 1``. Raises ValueError when ``M`` is negative or
    not integral.
    """
    is_integral = int(M) == M
    if not is_integral or M < 0:
        raise ValueError('Window length M must be a non-negative integer')
    return M <= 1
@window_function_register.register()
def _truncate(w: Tensor, needed: bool) -> Tensor:
    """Drop the last sample of ``w`` when ``needed`` is true; else return ``w`` as is."""
    return w[:-1] if needed else w
@window_function_register.register()
def _general_gaussian(
    M: int, p, sig, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a window with a generalized Gaussian shape.

    Evaluates ``exp(-0.5 * |n / sig| ** (2 * p))`` on sample positions centred
    on zero. This function is consistent with
    scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    # Sample positions shifted so that the window is centred on zero.
    centred = paddle.arange(0, length, dtype=dtype) - (length - 1.0) / 2.0
    window = paddle.exp(-0.5 * paddle.abs(centred / sig) ** (2 * p))
    return _truncate(window, drop_last)
@window_function_register.register()
def _general_cosine(
    M: int, a: List[float], sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    The window is ``sum_k a[k] * cos(k * t)`` with ``t`` spaced evenly over
    ``[-pi, pi]``. This function is consistent with
    scipy.signal.windows.general_cosine().

    Args:
        M (int): Number of samples in the window.
        a (List[float]): Weight of each cosine term; ``a[k]`` multiplies
            ``cos(k * t)``.
        sym (bool, optional): If False, the window is built with one extra
            sample which is then dropped. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window, all ones when ``M <= 1``.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    w = paddle.zeros((M,), dtype=dtype)
    # Accumulate one cosine harmonic per coefficient.
    for k, coeff in enumerate(a):
        w += coeff * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
@window_function_register.register()
def _general_hamming(
    M: int, alpha: float, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a generalized Hamming window.

    Delegates to ``_general_cosine`` with the coefficients
    ``[alpha, 1 - alpha]``. This function is consistent with
    scipy.signal.windows.general_hamming()
    """
    cosine_weights = [alpha, 1.0 - alpha]
    return _general_cosine(M, cosine_weights, sym, dtype=dtype)
@window_function_register.register()
def _taylor(
    M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.

    Args:
        M (int): Number of samples in the window.
        nbar (int, optional): Sets the number of cosine terms (``nbar - 1``).
            Presumably the number of near-in sidelobes mentioned above.
            Defaults to 4.
        sll (float, optional): Sidelobe level, given as a positive number
            (used as ``10 ** (sll / 20)``). Defaults to 30.
        norm (bool, optional): If True, divide the window by its value at the
            midpoint ``(M - 1) / 2``. Defaults to True.
        sym (bool, optional): If False, the window is built with one extra
            sample which is then dropped. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window, all ones when ``M <= 1``.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
    B = 10 ** (sll / 20)
    A = _acosh(B) / math.pi
    s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2)
    # Term indices 1 .. nbar - 1.
    ma = paddle.arange(1, nbar, dtype=dtype)
    # Fm holds one coefficient per cosine term; filled in by the loop below.
    Fm = paddle.empty((nbar - 1,), dtype=dtype)
    # Alternating signs: +1 at even positions, -1 at odd positions.
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
        numer = signs[mi] * paddle.prod(
            1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2)
        )
        # The denominator is a product over every *other* index. The first and
        # last index are handled separately so that no product is taken over
        # an empty slice.
        if mi == 0:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
            denom = (
                2
                * paddle.prod(1 - m2[mi] / m2[:mi])
                * paddle.prod(1 - m2[mi] / m2[mi + 1 :])
            )
        Fm[mi] = numer / denom

    # W evaluates the cosine series at position(s) n; it is called below with
    # a tensor of positions and with a single float (the midpoint).
    def W(n):
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M),
        )

    w = W(paddle.arange(0, M, dtype=dtype))
    # normalize (Note that this is not described in the original text [1])
    if norm:
        scale = 1.0 / W((M - 1) / 2)
        w *= scale
    # Remove the size-1 leading axis introduced by the matmul.
    w = w.squeeze()
    return _truncate(w, needs_trunc)
@window_function_register.register()
def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hamming window.

    A raised-cosine taper with non-zero endpoints, built as the generalized
    Hamming window with ``alpha = 0.54``.
    """
    return _general_hamming(M, alpha=0.54, sym=sym, dtype=dtype)
@window_function_register.register()
def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Hann window.

    A raised-cosine (sine-squared) taper whose ends touch zero, built as the
    generalized Hamming window with ``alpha = 0.5``.
    """
    return _general_hamming(M, alpha=0.5, sym=sym, dtype=dtype)
@window_function_register.register()
def _tukey(
    M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a Tukey window.
    The Tukey window is also known as a tapered cosine window.

    Args:
        M (int): Number of samples in the window.
        alpha (float, optional): Controls the width of the cosine tapers.
            ``alpha <= 0`` gives an all-ones window and ``alpha >= 1`` gives
            a Hann window. Defaults to 0.5.
        sym (bool, optional): If False, the window is built with one extra
            sample which is then dropped. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window, all ones when ``M <= 1``.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    if alpha <= 0:
        return paddle.ones((M,), dtype=dtype)
    elif alpha >= 1.0:
        # Call the module's `_hann` helper (there is no plain `hann` in this
        # module) and forward `dtype` so the result matches the other branches.
        return _hann(M, sym=sym, dtype=dtype)

    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype)
    # Number of samples (beyond the first) in each cosine taper.
    width = int(alpha * (M - 1) / 2.0)
    n1 = n[0 : width + 1]  # rising taper
    n2 = n[width + 1 : M - width - 1]  # flat top
    n3 = n[M - width - 1 :]  # falling taper

    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
    w3 = 0.5 * (
        1
        + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1)))
    )
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)
@window_function_register.register()
def _gaussian(
    M: int, std: float, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute a Gaussian window.

    The Gaussian window has a Gaussian shape defined by the standard
    deviation (std): ``exp(-n**2 / (2 * std**2))`` on positions centred on zero.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    # Sample positions shifted so that the window is centred on zero.
    centred = paddle.arange(0, length, dtype=dtype) - (length - 1.0) / 2.0
    two_variance = 2 * std * std
    window = paddle.exp(-(centred**2) / two_variance)
    return _truncate(window, drop_last)
@window_function_register.register()
def _exponential(
    M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64'
) -> Tensor:
    """Compute an exponential (or Poisson) window.

    Evaluates ``exp(-|n - center| / tau)``. ``center`` defaults to the middle
    of the (possibly extended) window and must be None when ``sym`` is True.
    """
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    peak = (length - 1) / 2 if center is None else center
    positions = paddle.arange(0, length, dtype=dtype)
    window = paddle.exp(-paddle.abs(positions - peak) / tau)
    return _truncate(window, drop_last)
@window_function_register.register()
def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a triangular window.

    The rising half is built first and then mirrored. For odd lengths the
    peak sample is not repeated in the mirrored half.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    ramp_index = paddle.arange(1, (length + 1) // 2 + 1, dtype=dtype)
    if length % 2 != 0:
        rising = 2 * ramp_index / (length + 1.0)
        # Skip the last element (the peak) when mirroring.
        window = paddle.concat([rising, rising[-2::-1]])
    else:
        rising = (2 * ramp_index - 1.0) / length
        window = paddle.concat([rising, rising[::-1]])
    return _truncate(window, drop_last)
@window_function_register.register()
def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Bohman window.

    The Bohman window is the autocorrelation of a cosine window. The interior
    samples are computed from the formula and both end points are set to 0.
    """
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    # |x| on an even grid over [-1, 1], without the two end points.
    grid = paddle.linspace(-1, 1, length, dtype=dtype)
    fac = paddle.abs(grid[1:-1])
    interior = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
        math.pi * fac
    )
    window = _cat([0, interior, 0], dtype)
    return _truncate(window, drop_last)
@window_function_register.register()
def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a Blackman window.

    The Blackman window is a taper formed by the first three terms of a
    summation of cosines, here with weights 0.42, 0.50 and 0.08. It was
    designed to have close to the minimal leakage possible and is only
    slightly worse than a Kaiser window.
    """
    blackman_weights = [0.42, 0.50, 0.08]
    return _general_cosine(M, blackman_weights, sym, dtype=dtype)
@window_function_register.register()
def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor:
    """Compute a window with a simple cosine shape: ``sin(pi / M * (n + 0.5))``."""
    if _len_guards(M):
        return paddle.ones((M,), dtype=dtype)

    length, drop_last = _extend(M, sym)
    half_shifted = paddle.arange(0, length, dtype=dtype) + 0.5
    window = paddle.sin(math.pi / length * half_shifted)
    return _truncate(window, drop_last)
def get_window(
    window: Union[str, Tuple[str, float]],
    win_length: int,
    fftbins: bool = True,
    dtype: str = 'float64',
) -> Tensor:
    """Build a window of the requested type and length.

    Args:
        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Either a name, or a tuple of a name followed by that window's extra positional parameters. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.

    Raises:
        ValueError: If `window` is neither a str nor a tuple, names an unknown
            window, or is the bare string 'gaussian' or 'exponential' (those
            must be passed as a tuple with parameters).

    Examples:
        .. code-block:: python

            import paddle

            n_fft = 512
            cosine_window = paddle.audio.functional.get_window('cosine', n_fft)

            std = 7
            gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft)
    """
    if isinstance(window, tuple):
        # First element is the name, the rest are positional parameters.
        winstr, extra_args = window[0], window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
            raise ValueError(
                "The '" + window + "' window needs one or more parameters -- pass a tuple."
            )
        winstr, extra_args = window, ()
    else:
        raise ValueError(
            "%s as window type is not supported." % str(type(window))
        )

    # Window builders are registered under their underscore-prefixed name.
    try:
        builder = window_function_register.get('_' + winstr)
    except KeyError as e:
        raise ValueError("Unknown window type.") from e

    # A "periodic" (fftbins) window is the opposite of a symmetric one.
    return builder(win_length, *extra_args, dtype=dtype, sym=not fftbins)
================================================
FILE: audio/paddleaudio/kaldi/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .kaldi import fbank
#from .kaldi import pitch
================================================
FILE: audio/paddleaudio/kaldi/kaldi.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddleaudio
from paddleaudio._internal import module_utils
# Names exported by `from ... import *`.
__all__ = [
    'fbank',
]
@module_utils.requires_kaldi()
def fbank(
        wav,
        samp_freq: int=16000,
        frame_shift_ms: float=10.0,
        frame_length_ms: float=25.0,
        dither: float=0.0,
        preemph_coeff: float=0.97,
        remove_dc_offset: bool=True,
        window_type: str='povey',
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
        high_freq: float=0,
        vtln_low: float=100,
        vtln_high: float=-500,
        debug_mel: bool=False,
        htk_mode: bool=False,
        use_energy: bool=False,  # fbank opts
        energy_floor: float=0.0,
        raw_energy: bool=True,
        htk_compat: bool=False,
        use_log_fbank: bool=True,
        use_power: bool=True):
    """Compute filter-bank features with ``paddleaudio._paddleaudio.ComputeFbank``.

    Each keyword argument is copied onto the identically named field of one of
    three option objects: ``FrameExtractionOptions`` (``samp_freq`` ...
    ``max_feature_vectors``), ``MelBanksOptions`` (``num_bins`` ...
    ``htk_mode``) and ``FbankOptions`` (``use_energy`` ... ``use_power``).

    Args:
        wav: Input passed unchanged to ``ComputeFbank``.

    Returns:
        Whatever ``ComputeFbank`` returns for the given options and ``wav``.
    """
    backend = paddleaudio._paddleaudio
    frame_opts = backend.FrameExtractionOptions()
    mel_opts = backend.MelBanksOptions()
    fbank_opts = backend.FbankOptions()

    frame_fields = dict(
        samp_freq=samp_freq,
        frame_shift_ms=frame_shift_ms,
        frame_length_ms=frame_length_ms,
        dither=dither,
        preemph_coeff=preemph_coeff,
        remove_dc_offset=remove_dc_offset,
        window_type=window_type,
        round_to_power_of_two=round_to_power_of_two,
        blackman_coeff=blackman_coeff,
        snip_edges=snip_edges,
        max_feature_vectors=max_feature_vectors)
    mel_fields = dict(
        num_bins=num_bins,
        low_freq=low_freq,
        high_freq=high_freq,
        vtln_low=vtln_low,
        vtln_high=vtln_high,
        debug_mel=debug_mel,
        htk_mode=htk_mode)
    fbank_fields = dict(
        use_energy=use_energy,
        energy_floor=energy_floor,
        raw_energy=raw_energy,
        htk_compat=htk_compat,
        use_log_fbank=use_log_fbank,
        use_power=use_power)

    # Copy every value onto its option object, in declaration order.
    for options, fields in ((frame_opts, frame_fields),
                            (mel_opts, mel_fields),
                            (fbank_opts, fbank_fields)):
        for field_name, field_value in fields.items():
            setattr(options, field_name, field_value)

    return backend.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
#@module_utils.requires_kaldi()
#def pitch(wav,
#samp_freq: int=16000,
#frame_shift_ms: float=10.0,
#frame_length_ms: float=25.0,
#preemph_coeff: float=0.0,
#min_f0: int=50,
#max_f0: int=400,
#soft_min_f0: float=10.0,
#penalty_factor: float=0.1,
#lowpass_cutoff: int=1000,
#resample_freq: int=4000,
#delta_pitch: float=0.005,
#nccf_ballast: int=7000,
#lowpass_filter_width: int=1,
#upsample_filter_width: int=5,
#max_frames_latency: int=0,
#frames_per_chunk: int=0,
#simulate_first_pass_online: bool=False,
#recompute_frame: int=500,
#nccf_ballast_online: bool=False,
#snip_edges: bool=True):
#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions()
#pitch_opts.samp_freq = samp_freq
#pitch_opts.frame_shift_ms = frame_shift_ms
#pitch_opts.frame_length_ms = frame_length_ms
#pitch_opts.preemph_coeff = preemph_coeff
#pitch_opts.min_f0 = min_f0
#pitch_opts.max_f0 = max_f0
#pitch_opts.soft_min_f0 = soft_min_f0
#pitch_opts.penalty_factor = penalty_factor
#pitch_opts.lowpass_cutoff = lowpass_cutoff
#pitch_opts.resample_freq = resample_freq
#pitch_opts.delta_pitch = delta_pitch
#pitch_opts.nccf_ballast = nccf_ballast
#pitch_opts.lowpass_filter_width = lowpass_filter_width
#pitch_opts.upsample_filter_width = upsample_filter_width
#pitch_opts.max_frames_latency = max_frames_latency
#pitch_opts.frames_per_chunk = frames_per_chunk
#pitch_opts.simulate_first_pass_online = simulate_first_pass_online
#pitch_opts.recompute_frame = recompute_frame
#pitch_opts.nccf_ballast_online = nccf_ballast_online
#pitch_opts.snip_edges = snip_edges
#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav)
#return pitch
================================================
FILE: audio/paddleaudio/metric/__init__.py
================================================
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .eer import compute_eer
from .eer import compute_minDCF
================================================
FILE: audio/paddleaudio/metric/eer.py
================================================
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from sklearn.metrics import roc_curve
def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
    """Compute the equal error rate (EER) and the score threshold at that point.

    The operating point is the ROC position where the false negative rate and
    the false positive rate are closest to each other.

    Args:
        labels (np.ndarray): the trial label, shape: [N], one-dimension, N refer to the samples num
        scores (np.ndarray): the trial scores, shape: [N], one-dimension, N refer to the samples num

    Returns:
        List[float]: eer and the specific threshold
    """
    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
    fnr = 1 - tpr
    # Index where |FNR - FPR| is smallest (NaN entries are ignored).
    crossover = np.nanargmin(np.absolute((fnr - fpr)))
    return fpr[crossover], threshold[crossover]
def compute_minDCF(positive_scores,
                   negative_scores,
                   c_miss=1.0,
                   c_fa=1.0,
                   p_target=0.01):
    """
    This is modified from SpeechBrain
    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509

    Computes the minDCF metric normally used to evaluate speaker verification
    systems. The min_DCF is the minimum of the following C_det function computed
    within the defined threshold range:

    C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)

    where p_miss is the missing probability and p_fa is the probability of having
    a false alarm.

    Args:
        positive_scores (Paddle.Tensor): The scores from entries of the same class.
        negative_scores (Paddle.Tensor): The scores from entries of different classes.
        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
        p_target (float, optional): Prior probability of having a target (default 0.01).

    Returns:
        List[float]: min dcf and the specific threshold
    """
    # Squeeze inputs that have more than one dimension.
    if len(positive_scores.shape) > 1:
        positive_scores = positive_scores.squeeze()
    if len(negative_scores.shape) > 1:
        negative_scores = negative_scores.squeeze()

    # Candidate thresholds: every distinct score ...
    all_scores = paddle.concat([positive_scores, negative_scores])
    thresholds = paddle.unique(paddle.sort(all_scores))
    # ... plus the midpoint between each pair of neighbouring scores.
    midpoints = (thresholds[0:-1] + thresholds[1:]) / 2
    thresholds = paddle.sort(paddle.concat([thresholds, midpoints]))
    num_thresholds = len(thresholds)

    # Miss rate: fraction of positive scores at or below each threshold.
    pos_tiled = paddle.concat(num_thresholds * [positive_scores.unsqueeze(0)])
    pos_missed = pos_tiled.transpose(perm=[1, 0]) <= thresholds
    p_miss = (pos_missed.sum(0)).astype("float32") / pos_tiled.shape[1]
    # Release the tiled copies before building the next ones.
    del pos_tiled
    del pos_missed

    # False alarm rate: fraction of negative scores above each threshold.
    neg_tiled = paddle.concat(num_thresholds * [negative_scores.unsqueeze(0)])
    neg_accepted = neg_tiled.transpose(perm=[1, 0]) > thresholds
    p_fa = (neg_accepted.sum(0)).astype("float32") / neg_tiled.shape[1]
    del neg_tiled
    del neg_accepted

    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
    lowest_cost = paddle.min(c_det, axis=0)
    best_index = paddle.argmin(c_det, axis=0)
    return float(lowest_cost), float(thresholds[best_index])
================================================
FILE: audio/paddleaudio/sox_effects/__init__.py
================================================
from paddleaudio._internal import module_utils as _mod_utils
from .sox_effects import apply_effects_file
from .sox_effects import apply_effects_tensor
from .sox_effects import effect_names
from .sox_effects import init_sox_effects
from .sox_effects import shutdown_sox_effects
# When SoX support is available, initialize the sox effects at import time and
# register the shutdown function to run at interpreter exit.
if _mod_utils.is_sox_available():
    import atexit
    init_sox_effects()
    atexit.register(shutdown_sox_effects)
# Names exported by `from ... import *`.
__all__ = [
    "init_sox_effects",
    "shutdown_sox_effects",
    "effect_names",
    "apply_effects_tensor",
    "apply_effects_file",
]
================================================
FILE: audio/paddleaudio/sox_effects/sox_effects.py
================================================
import os
from typing import List
from typing import Optional
from typing import Tuple
import paddle
import paddleaudio
from paddleaudio._internal import module_utils as _mod_utils
from paddleaudio.utils.sox_utils import list_effects
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
@_mod_utils.requires_sox()
def init_sox_effects():
    """Set up the resources that sox effects need.

    Note:
        Calling this by hand is not required; it is invoked automatically.

        Repeated calls are safe as long as :func:`shutdown_sox_effects` has
        not been called yet. After :func:`shutdown_sox_effects`, SoX effects
        can no longer be used and initializing again results in an error.
    """
    backend = paddleaudio._paddleaudio
    backend.sox_effects_initialize_sox_effects()
@_mod_utils.requires_sox()
def shutdown_sox_effects():
    """Release the resources that sox effects hold.

    Note:
        Calling this by hand is not required; it is invoked automatically.

        Calling it several times is safe. Once it has been called, SoX effects
        can no longer be used and initializing again results in an error.
    """
    backend = paddleaudio._paddleaudio
    backend.sox_effects_shutdown_sox_effects()
@_mod_utils.requires_sox()
def effect_names() -> List[str]:
    """Return the names of the sox effects that can be used.

    Returns:
        List[str]: the keys of the mapping returned by ``list_effects()``.

    Example
        >>> paddleaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    available = list_effects()
    return [*available.keys()]
@_mod_utils.requires_sox()
def apply_effects_tensor(
        tensor: paddle.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool=True, ) -> Tuple[paddle.Tensor, int]:
    """Run a chain of sox effects over an in-memory waveform.

    Note:
        This function only works on CPU Tensors.

        It behaves much like the ``sox`` command line tool, with one
        important difference: ``sox`` inserts some effects automatically
        (for example a ``rate`` effect after ``speed`` or ``pitch``), whereas
        this function applies exactly the effects it is given. To really
        apply ``speed`` you therefore also have to pass a ``rate`` effect
        with the desired sampling rate.

    Args:
        tensor (paddle.Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate of ``tensor``.
        effects (List[List[str]]): Effects to apply, one inner list per
            effect (effect name followed by its arguments).
        channels_first (bool, optional): Whether ``tensor`` is laid out as
            `[channels, time]` (``True``) or `[time, channels]` (``False``).

    Returns:
        (Tensor, int): The processed Tensor and its sample rate.
        The result keeps the ``dtype`` and channel order of the input. Its
        shape and sample rate may differ from the input depending on the
        effects applied.

    Raises:
        RuntimeError: If the native call returns ``None``.

    Example - Basic usage
        >>> # Effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Pseudo wave: normalized, channels first, 2ch,
        >>> # sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        [2, 16000]
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # The new waveform is sampling rate 8000, 1 second;
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000
    """
    result = paddleaudio._paddleaudio.sox_effects_apply_effects_tensor(
        tensor.numpy(), sample_rate, effects, channels_first)
    if result is None:
        raise RuntimeError("Failed to apply sox effect")
    processed = paddle.to_tensor(result[0])
    return (processed, result[1])
@_mod_utils.requires_sox()
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
    """Decode an audio source, run sox effects over it and return a Tensor.

    Note:
        This behaves much like the ``sox`` command line tool, with one
        important difference: ``sox`` inserts some effects automatically
        (for example a ``rate`` effect after ``speed``, ``pitch`` etc.),
        whereas this function applies exactly the effects it is given.
        To really apply ``speed`` you also have to pass a ``rate`` effect
        with the desired sampling rate, because ``speed`` on its own only
        alters the sampling rate and leaves the samples untouched.

    Args:
        path (path-like object or file-like object):
            Audio source. An object exposing a ``read`` attribute is handed
            to the file-object backend; anything else is converted with
            ``os.fspath`` and opened by path.
        effects (List[List[str]]): Effects to apply, one inner list per
            effect (effect name followed by its arguments).
        normalize (bool, optional):
            When ``True``, the result is always ``float32`` with sample
            values normalized to ``[-1.0, 1.0]``.
            When ``False`` and the input is an integer WAV file, the result
            has the corresponding integer type. The argument has no effect
            for formats other than integer WAV.
        channels_first (bool, optional): When ``True`` the returned Tensor
            has dimension `[channel, time]`, otherwise `[time, channel]`.
        format (str or None, optional):
            Overrides format detection. Useful when libsox cannot infer the
            format from the header or the file extension.

    Returns:
        (Tensor, int): The processed Tensor and its sample rate.
        With ``normalize=True`` the Tensor is always ``float32``.
        With ``normalize=False`` and an integer WAV input, the Tensor has
        the corresponding integer type (24 bit integer is not supported).
        With ``channels_first=True`` the Tensor has dimension
        `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        RuntimeError: If the native call returns ``None``.

    Example - Basic usage
        >>> # Effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file(
        ...     "data.wav", effects, channels_first=True)
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000

    Example - Apply random speed perturbation to dataset
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(paddle.io.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
        ...         speed = 0.5 + 1.5 * random.random()
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = paddle.io.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        >>>     pass
    """
    if hasattr(path, "read"):
        # File-like object: decode through the file-object entry point.
        loaded = paddleaudio._paddleaudio.apply_effects_fileobj(
            path, effects, normalize, channels_first, format)
    else:
        # Anything else is treated as a filesystem path.
        path = os.fspath(path)
        loaded = paddleaudio._paddleaudio.sox_effects_apply_effects_file(
            path, effects, normalize, channels_first, format)
    if loaded is None:
        raise RuntimeError("Failed to load audio from {}".format(path))
    return (paddle.to_tensor(loaded[0]), loaded[1])
================================================
FILE: audio/paddleaudio/src/CMakeLists.txt
================================================
# Platform-specific defaults applied before any target is declared.
# With MSVC, ask CMake to export all symbols from shared libraries.
if (MSVC)
    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
# On Apple platforms, give shared libraries the ".so" suffix.
if(APPLE)
    set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
endif(APPLE)
################################################################################
# libpaddleaudio
################################################################################
# Baseline inputs for the core `libpaddleaudio` shared library. The optional
# feature sections further down append to these lists.
# Source files compiled into the library.
set(
    LIBPADDLEAUDIO_SOURCES
    utils.cpp
    )
# Include directories used when compiling the library.
set(
    LIBPADDLEAUDIO_INCLUDE_DIRS
    ${PROJECT_SOURCE_DIR}
    )
# Libraries to link against; starts out empty.
set(
    LIBPADDLEAUDIO_LINK_LIBRARIES
    )
# Preprocessor definitions; starts out empty.
set(
    LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
#------------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
# SoX support: link the core library against `libsox` and define INCLUDE_SOX.
if(BUILD_SOX)
    list(
        APPEND
        LIBPADDLEAUDIO_LINK_LIBRARIES
        libsox
        )
    # No items follow the list name, so this call appends nothing to the
    # core sources.
    list(
        APPEND
        LIBPADDLEAUDIO_SOURCES
        )
    list(
        APPEND
        LIBPADDLEAUDIO_COMPILE_DEFINITIONS
        INCLUDE_SOX
        )
endif()
# Kaldi feature support: link the core library against
# `kaldi-native-fbank-core` and define INCLUDE_KALDI and
# COMPILE_WITHOUT_OPENFST.
if(BUILD_KALDI)
    list(
        APPEND
        LIBPADDLEAUDIO_LINK_LIBRARIES
        kaldi-native-fbank-core
        )
    list(
        APPEND
        LIBPADDLEAUDIO_COMPILE_DEFINITIONS
        INCLUDE_KALDI
        COMPILE_WITHOUT_OPENFST
        )
endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
# define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
#
# Declares the shared library target <name> and installs it under `lib`.
#   source         - source files for the target
#   include_dirs   - PRIVATE include directories
#   link_libraries - libraries passed to target_link_libraries
#   compile_defs   - PRIVATE compile definitions
# Callers pass each list as a single quoted argument; it is expanded unquoted
# here, which splits it back into individual items.
function (define_library name source include_dirs link_libraries compile_defs)
    add_library(${name} SHARED ${source})
    target_include_directories(${name} PRIVATE ${include_dirs})
    target_link_libraries(${name} ${link_libraries})
    target_compile_definitions(${name} PRIVATE ${compile_defs})
    # Drop the default library prefix so the output file name starts with <name>.
    set_target_properties(${name} PROPERTIES PREFIX "")
    if (MSVC)
        set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
    endif(MSVC)
    install(
        TARGETS ${name}
        LIBRARY DESTINATION lib
        RUNTIME DESTINATION lib # For Windows
        )
endfunction()
# Instantiate the core library from the lists assembled above.
define_library(
    libpaddleaudio
    "${LIBPADDLEAUDIO_SOURCES}"
    "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
    "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
    "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
    )
# AUDIO_LIBRARY is stored as an INTERNAL cache entry. Outside Apple platforms
# the target name is wrapped in --no-as-needed/--as-needed linker flags.
# NOTE(review): presumably so the library stays linked even when no symbol
# from it is referenced directly - confirm against the consumers.
if (APPLE)
    set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
else()
    set(AUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
endif()
################################################################################
# _paddleaudio.so
################################################################################
if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
# On Windows, locate the Python3 development component for exactly
# ${PYTHON_VERSION} and add its imported target to the extension link line
# through ADDITIONAL_ITEMS.
if (WIN32)
    find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
    set(ADDITIONAL_ITEMS Python3::Python)
endif()
# define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
#
# Declares the shared library target <name> for a Python extension module and
# installs it into the install prefix root (`.`).
#   sources      - source files for the target
#   include_dirs - extra PRIVATE include directories, added after the project,
#                  Python and pybind11 include directories
#   libraries    - libraries to link, followed by ${PYTHON_LIBRARY} and
#                  ${ADDITIONAL_ITEMS}
#   definitions  - PRIVATE compile definitions
function(define_extension name sources include_dirs libraries definitions)
    add_library(${name} SHARED ${sources})
    target_compile_definitions(${name} PRIVATE "${definitions}")
    target_include_directories(
        ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
    target_link_libraries(
        ${name}
        ${libraries}
        ${PYTHON_LIBRARY}
        ${ADDITIONAL_ITEMS}
        )
    # Drop the default library prefix so the output file name starts with <name>.
    set_target_properties(${name} PROPERTIES PREFIX "")
    if (MSVC)
        set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
    endif(MSVC)
    if (APPLE)
        # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
        # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
        # Let undefined symbols be looked up dynamically at load time.
        set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif()
    install(
        TARGETS ${name}
        LIBRARY DESTINATION .
        RUNTIME DESTINATION . # For Windows
        )
endfunction()
# Baseline sources for the `_paddleaudio` extension; the feature sections
# below append to this list.
set(
    EXTENSION_SOURCES
    pybind/pybind.cpp
    )
#----------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#----------------------------------------------------------------------------#
# SoX support: add the sox binding sources to the extension.
if(BUILD_SOX)
    list(
        APPEND
        EXTENSION_SOURCES
        pybind/sox/effects.cpp
        pybind/sox/effects_chain.cpp
        pybind/sox/io.cpp
        pybind/sox/types.cpp
        pybind/sox/utils.cpp
        )
endif()
# Kaldi support: add the kaldi feature binding sources to the extension.
if(BUILD_KALDI)
    list(
        APPEND
        EXTENSION_SOURCES
        pybind/kaldi/kaldi_feature_wrapper.cc
        pybind/kaldi/kaldi_feature.cc
        )
endif()
#----------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#----------------------------------------------------------------------------#
# Build the `_paddleaudio` extension: no extra include directories, linked
# against `libpaddleaudio`, compiled with the same definitions as the core
# library.
define_extension(
    _paddleaudio
    "${EXTENSION_SOURCES}"
    ""
    libpaddleaudio
    "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
    )
# if(BUILD_CTC_DECODER)
# set(
# DECODER_EXTENSION_SOURCES
# decoder/bindings/pybind.cpp
# )
# define_extension(
# _paddleaudio_decoder
# "${DECODER_EXTENSION_SOURCES}"
# ""
# "libpaddleaudio_decoder"
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# )
# endif()
# if(USE_FFMPEG)
# set(
# FFMPEG_EXTENSION_SOURCES
# ffmpeg/pybind/typedefs.cpp
# ffmpeg/pybind/pybind.cpp
# ffmpeg/pybind/stream_reader.cpp
# )
# define_extension(
# _paddleaudio_ffmpeg
# "${FFMPEG_EXTENSION_SOURCES}"
# "${FFMPEG_INCLUDE_DIRS}"
# "libpaddleaudio_ffmpeg"
# "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
# )
# endif()
endif()
================================================
FILE: audio/paddleaudio/src/optional/COPYING
================================================
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.
================================================
FILE: audio/paddleaudio/src/optional/optional.hpp
================================================
///
// optional - An implementation of std::optional with extensions
// Written in 2017 by Sy Brand (tartanllama@gmail.com, @TartanLlama)
//
// Documentation available at https://tl.tartanllama.xyz/
//
// To the extent possible under law, the author(s) have dedicated all
// copyright and related and neighboring rights to this software to the
// public domain worldwide. This software is distributed without any warranty.
//
// You should have received a copy of the CC0 Public Domain Dedication
// along with this software. If not, see
// <http://creativecommons.org/publicdomain/zero/1.0/>.
// https://github.com/TartanLlama/optional
///
#ifndef TL_OPTIONAL_HPP
#define TL_OPTIONAL_HPP
#define TL_OPTIONAL_VERSION_MAJOR 1
#define TL_OPTIONAL_VERSION_MINOR 0
#define TL_OPTIONAL_VERSION_PATCH 0
#include
#include
#include
#include
#include
// Compiler detection. Each macro is defined only when the compiler and
// version match the condition tested by its #if; the GCC checks exclude
// clang explicitly via !defined(__clang__).

// _MSC_VER equal to 1900.
#if (defined(_MSC_VER) && _MSC_VER == 1900)
#define TL_OPTIONAL_MSVC2015
#endif
// GCC 4.x with minor version <= 9.
#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
     !defined(__clang__))
#define TL_OPTIONAL_GCC49
#endif
// GCC 5.x with minor version <= 4.
#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 && \
     !defined(__clang__))
#define TL_OPTIONAL_GCC54
#endif
// GCC 5.x with minor version <= 5.
#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 && \
     !defined(__clang__))
#define TL_OPTIONAL_GCC55
#endif
// Selects the implementation behind the TL_OPTIONAL_IS_TRIVIALLY_* macros
// per compiler: GCC 4.x (<= 4.9), other GCC older than 8, and everything else.
// NOTE(review): the template argument lists (e.g. the `<T>` after each trait
// name) appear to have been stripped from this copy - confirm against the
// upstream header.
#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \
     !defined(__clang__))
// GCC < 5 doesn't support overloading on const&& for member functions
#define TL_OPTIONAL_NO_CONSTRR
// GCC < 5 doesn't support some standard C++11 type traits
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
    std::has_trivial_copy_constructor::value
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
    std::has_trivial_copy_assign::value
// This one will be different for GCC 5.7 if it's ever supported
#define TL_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
    std::is_trivially_destructible::value
// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks
// std::vector
// for non-copyable types
#elif (defined(__GNUC__) && __GNUC__ < 8 && !defined(__clang__))
// The *_MUTEX macro guards the helper trait below against being defined twice.
#ifndef TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
#define TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
namespace tl {
namespace detail {
// Wrapper around the std trait, with an extra specialization when
// _GLIBCXX_VECTOR is defined.
template
struct is_trivially_copy_constructible
    : std::is_trivially_copy_constructible {};
#ifdef _GLIBCXX_VECTOR
template
struct is_trivially_copy_constructible>
    : std::is_trivially_copy_constructible {};
#endif
}
}
#endif
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
    tl::detail::is_trivially_copy_constructible::value
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
    std::is_trivially_copy_assignable::value
#define TL_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
    std::is_trivially_destructible::value
#else
// All other compilers: use the standard traits directly.
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
    std::is_trivially_copy_constructible::value
#define TL_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
    std::is_trivially_copy_assignable::value
#define TL_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
    std::is_trivially_destructible::value
#endif
// Defined when compiling with a language standard newer than C++11.
#if __cplusplus > 201103L
#define TL_OPTIONAL_CXX14
#endif
// constexpr implies const in C++11, not C++14
// TL_OPTIONAL_11_CONSTEXPR therefore expands to nothing for C++11, MSVC 2015
// and GCC <= 4.9, and to `constexpr` everywhere else.
#if (__cplusplus == 201103L || defined(TL_OPTIONAL_MSVC2015) || \
     defined(TL_OPTIONAL_GCC49))
#define TL_OPTIONAL_11_CONSTEXPR
#else
#define TL_OPTIONAL_11_CONSTEXPR constexpr
#endif
namespace tl {
// The *_MUTEX macro guards these definitions against being defined twice.
#ifndef TL_MONOSTATE_INPLACE_MUTEX
#define TL_MONOSTATE_INPLACE_MUTEX
/// Used to represent an optional with no data; essentially a bool
class monostate {};
/// A tag type to tell optional to construct its value in-place
struct in_place_t {
    // Explicit so the tag cannot be created by an implicit `{}` conversion.
    explicit in_place_t() = default;
};
/// A tag to tell optional to construct its value in-place
static constexpr in_place_t in_place{};
#endif
// Forward declaration of the optional class template.
// NOTE(review): the template parameter list appears to have been stripped
// from this copy - confirm against the upstream header.
template
class optional;
namespace detail {
#ifndef TL_TRAITS_MUTEX
#define TL_TRAITS_MUTEX
// C++14-style aliases for brevity
// Each alias names the nested ::type of the corresponding std trait.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
using remove_const_t = typename std::remove_const::type;
template
using remove_reference_t = typename std::remove_reference::type;
template
using decay_t = typename std::decay::type;
template
using enable_if_t = typename std::enable_if::type;
template
using conditional_t = typename std::conditional::type;
// std::conjunction from C++17
// Three declarations: one deriving from std::true_type, one deriving from B
// itself, and one that picks its base through std::conditional.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy, so the primary template and its specializations
// cannot be told apart here - confirm against the upstream header.
template
struct conjunction : std::true_type {};
template
struct conjunction : B {};
template
struct conjunction
    : std::conditional, B>::type {};
// The workaround below is enabled only for libc++ in C++11 mode.
#if defined(_LIBCPP_VERSION) && __cplusplus == 201103L
#define TL_TRAITS_LIBCXX_MEM_FN_WORKAROUND
#endif
// In C++11 mode, there's an issue in libc++'s std::mem_fn
// which results in a hard-error when using it in a noexcept expression
// in some cases. This is a check to workaround the common failing case.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy, so the specializations below look identical -
// confirm against the upstream header.
#ifdef TL_TRAITS_LIBCXX_MEM_FN_WORKAROUND
// Defaults to false; each following declaration yields true.
template
struct is_pointer_to_non_const_member_func : std::false_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
template
struct is_pointer_to_non_const_member_func
    : std::true_type {};
// Defaults to false; each following declaration yields true.
template
struct is_const_or_const_ref : std::false_type {};
template
struct is_const_or_const_ref : std::true_type {};
template
struct is_const_or_const_ref : std::true_type {};
#endif
// std::invoke from C++17
// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround
// Two overloads selected through enable_if_t: the first wraps the callable
// in std::mem_fn before calling it, the second calls the forwarded callable
// directly. Both derive their noexcept specification and return type from
// the call expression itself.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template <
    typename Fn,
    typename... Args,
#ifdef TL_TRAITS_LIBCXX_MEM_FN_WORKAROUND
    typename = enable_if_t::value &&
                             is_const_or_const_ref::value)>,
#endif
    typename = enable_if_t>::value>,
    int = 0>
constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
    noexcept(std::mem_fn(f)(std::forward(args)...)))
    -> decltype(std::mem_fn(f)(std::forward(args)...)) {
    return std::mem_fn(f)(std::forward(args)...);
}
template >::value>>
constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
    noexcept(std::forward(f)(std::forward(args)...)))
    -> decltype(std::forward(f)(std::forward(args)...)) {
    return std::forward(f)(std::forward(args)...);
}
// std::invoke_result from C++17
// The `decltype(detail::invoke(...), void())` argument of the specialization
// is well-formed only when the invoke call is; `type` is then the type of
// that call expression.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct invoke_result_impl;
template
struct invoke_result_impl<
    F,
    decltype(detail::invoke(std::declval(), std::declval()...), void()),
    Us...> {
    using type =
        decltype(detail::invoke(std::declval(), std::declval()...));
};
template
using invoke_result = invoke_result_impl;
// Shorthand for the nested ::type.
template
using invoke_result_t = typename invoke_result::type;
// is_swappable / is_nothrow_swappable.
// For _MSC_VER <= 1900 both traits are hard-wired to true; otherwise they are
// computed with the ADL-based detection in namespace swap_adl_tests.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
#if defined(_MSC_VER) && _MSC_VER <= 1900
// TODO make a version which works with MSVC 2015
template
struct is_swappable : std::true_type {};
template
struct is_nothrow_swappable : std::true_type {};
#else
// https://stackoverflow.com/questions/26744589/what-is-a-proper-way-to-implement-is-swappable-to-test-for-the-swappable-concept
namespace swap_adl_tests {
// if swap ADL finds this then it would call std::swap otherwise (same
// signature)
struct tag {};
template
tag swap(T &, T &);
template
tag swap(T (&a)[N], T (&b)[N]);
// helper functions to test if an unqualified swap is possible, and if it
// becomes std::swap
// Fallback overload (variadic): yields false_type.
template
std::false_type can_swap(...) noexcept(false);
// Preferred overload (takes int): yields true_type and carries the noexcept
// status of the swap expression.
template (), std::declval()))>
std::true_type can_swap(int) noexcept(noexcept(swap(std::declval(),
                                                    std::declval())));
// Fallback overload (variadic): yields false_type.
template
std::false_type uses_std(...);
// Preferred overload: compares the type of the swap expression with `tag`.
template
std::is_same(), std::declval())), tag>
uses_std(int);
template
struct is_std_swap_noexcept
    : std::integral_constant::value &&
                                 std::is_nothrow_move_assignable::value> {};
template
struct is_std_swap_noexcept : is_std_swap_noexcept {};
template
struct is_adl_swap_noexcept
    : std::integral_constant(0))> {};
}  // namespace swap_adl_tests
// True when can_swap holds and either the swap is not the `tag` fallback or
// the type is both move-assignable and move-constructible.
template
struct is_swappable
    : std::integral_constant<
          bool,
          decltype(detail::swap_adl_tests::can_swap(0))::value &&
              (!decltype(detail::swap_adl_tests::uses_std(0))::value ||
               (std::is_move_assignable::value &&
                std::is_move_constructible::value))> {};
// Variant that recurses into is_swappable instead of checking movability.
template
struct is_swappable
    : std::integral_constant<
          bool,
          decltype(detail::swap_adl_tests::can_swap(0))::value &&
              (!decltype(
                   detail::swap_adl_tests::uses_std(0))::value ||
               is_swappable::value)> {};
// True when is_swappable holds and the swap that would be chosen (the `tag`
// fallback or an ADL-found one) is noexcept.
template
struct is_nothrow_swappable
    : std::integral_constant<
          bool,
          is_swappable::value &&
              ((decltype(detail::swap_adl_tests::uses_std(0))::value
                    &&detail::swap_adl_tests::is_std_swap_noexcept::value) ||
               (!decltype(detail::swap_adl_tests::uses_std(0))::value &&
                detail::swap_adl_tests::is_adl_swap_noexcept::value))> {
};
#endif
#endif
// std::void_t from C++17
// `voider` always exposes `void` as its nested type; `void_t` names it.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct voider {
    using type = void;
};
template
using void_t = typename voider::type;
// Trait for checking if a type is a tl::optional
// `is_optional_impl` defaults to false and has one declaration yielding true;
// `is_optional` is the alias intended for use.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct is_optional_impl : std::false_type {};
template
struct is_optional_impl> : std::true_type {};
template
using is_optional = is_optional_impl>;
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
// Change void to tl::monostate
template
using fixup_void = conditional_t::value, monostate, U>;
// Names an `optional` instantiation used as a map-style return type.
template >
using get_map_return = optional>>;
// Check if invoking F for some Us returns void
template
struct returns_void_impl;
template
struct returns_void_impl>, U...>
    : std::is_void> {};
template
using returns_void = returns_void_impl;
// SFINAE helpers built on returns_void via enable_if_t.
template
using enable_if_ret_void = enable_if_t::value>;
template
using disable_if_ret_void = enable_if_t::value>;
// SFINAE helper aliases built from std::is_constructible, std::is_convertible,
// std::is_assignable and std::is_same checks. Their names suggest they gate
// optional's converting constructors and assignment operators; the uses are
// outside this excerpt.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.

// Requires constructibility and rules out in_place_t and a same-decayed-type
// match.
template
using enable_forward_value =
    detail::enable_if_t::value &&
                        !std::is_same, in_place_t>::value &&
                        !std::is_same, detail::decay_t>::value>;
// Requires constructibility and rules out the listed is_constructible /
// is_convertible combinations.
template
using enable_from_other = detail::enable_if_t<
    std::is_constructible::value &&
    !std::is_constructible &>::value &&
    !std::is_constructible &&>::value &&
    !std::is_constructible &>::value &&
    !std::is_constructible &&>::value &&
    !std::is_convertible &, T>::value &&
    !std::is_convertible &&, T>::value &&
    !std::is_convertible &, T>::value &&
    !std::is_convertible &&, T>::value>;
// Requires constructibility and assignability, with two is_same-based
// exclusions.
template
using enable_assign_forward = detail::enable_if_t<
    !std::is_same, detail::decay_t>::value &&
    !detail::conjunction,
                         std::is_same>>::value &&
    std::is_constructible::value && std::is_assignable::value>;
// Same shape as enable_from_other, additionally requiring assignability and
// ruling out the listed is_assignable combinations.
template
using enable_assign_from_other = detail::enable_if_t<
    std::is_constructible::value &&
    std::is_assignable::value &&
    !std::is_constructible &>::value &&
    !std::is_constructible &&>::value &&
    !std::is_constructible &>::value &&
    !std::is_constructible &&>::value &&
    !std::is_convertible &, T>::value &&
    !std::is_convertible &&, T>::value &&
    !std::is_convertible &, T>::value &&
    !std::is_convertible &&, T>::value &&
    !std::is_assignable &>::value &&
    !std::is_assignable &&>::value &&
    !std::is_assignable &>::value &&
    !std::is_assignable &&>::value>;
// The storage base manages the actual storage, and correctly propagates
// trivial destruction from T. This case is for when T is not trivially
// destructible.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template ::value>
struct optional_storage_base {
    // Empty state: activates the dummy union member and clears the flag.
    TL_OPTIONAL_11_CONSTEXPR optional_storage_base() noexcept
        : m_dummy(),
          m_has_value(false) {}
    // Engaged state: constructs m_value from the forwarded arguments.
    template
    TL_OPTIONAL_11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
        : m_value(std::forward(u)...), m_has_value(true) {}
    // Destroys the stored value by hand, but only when one is present.
    ~optional_storage_base() {
        if (m_has_value) {
            m_value.~T();
            m_has_value = false;
        }
    }
    // Placeholder type that occupies the union while no value is stored.
    struct dummy {};
    union {
        dummy m_dummy;
        T m_value;
    };
    // True while m_value is the active union member.
    bool m_has_value;
};
// This case is for when T is trivially destructible.
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct optional_storage_base {
    // Empty state: activates the dummy union member and clears the flag.
    TL_OPTIONAL_11_CONSTEXPR optional_storage_base() noexcept
        : m_dummy(),
          m_has_value(false) {}
    // Engaged state: constructs m_value from the forwarded arguments.
    template
    TL_OPTIONAL_11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
        : m_value(std::forward(u)...), m_has_value(true) {}
    // No destructor, so this class is trivially destructible
    // Placeholder type that occupies the union while no value is stored.
    struct dummy {};
    union {
        dummy m_dummy;
        T m_value;
    };
    // True while m_value is the active union member.
    bool m_has_value = false;
};
// This base class provides some handy member functions which can be used in
// further derived classes
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct optional_operations_base : optional_storage_base {
    using optional_storage_base::optional_storage_base;

    // Destroys the stored value unconditionally and marks the storage empty.
    // The caller must ensure a value is present.
    void hard_reset() noexcept {
        get().~T();
        this->m_has_value = false;
    }

    // Constructs a T in the storage from the forwarded arguments and marks
    // the storage engaged. Deliberately not noexcept: T's constructor may
    // throw, and a noexcept here would turn that into std::terminate. The
    // flag is set only after construction succeeds, so the storage stays
    // empty if the constructor throws.
    template
    void construct(Args &&... args) {
        new (std::addressof(this->m_value)) T(std::forward(args)...);
        this->m_has_value = true;
    }

    // Makes this storage mirror rhs:
    //   both engaged    -> assign the value
    //   only this       -> destroy the value and become empty
    //   only rhs        -> construct a value from rhs
    //   neither engaged -> nothing to do
    template
    void assign(Opt &&rhs) {
        if (this->has_value()) {
            if (rhs.has_value()) {
                this->m_value = std::forward(rhs).get();
            } else {
                this->m_value.~T();
                this->m_has_value = false;
            }
        }
        else if (rhs.has_value()) {
            construct(std::forward(rhs).get());
        }
    }

    // Whether a value is currently stored.
    bool has_value() const { return this->m_has_value; }

    // Unchecked access to the stored value, overloaded on the value category
    // and constness of *this.
    TL_OPTIONAL_11_CONSTEXPR T &get() & { return this->m_value; }
    TL_OPTIONAL_11_CONSTEXPR const T &get() const & { return this->m_value; }
    TL_OPTIONAL_11_CONSTEXPR T &&get() && { return std::move(this->m_value); }
#ifndef TL_OPTIONAL_NO_CONSTRR
    constexpr const T &&get() const && { return std::move(this->m_value); }
#endif
};
// This class manages conditionally having a trivial copy constructor
// This specialization is for when T is trivially copy constructible
// NOTE(review): template parameter and argument lists appear to have been
// stripped from this copy - confirm against the upstream header.
template
struct optional_copy_base : optional_operations_base {
    // Inherit the constructors; no special members are declared here.
    using optional_operations_base::optional_operations_base;
};
// This specialization is for when T is not trivially copy constructible
template
struct optional_copy_base : optional_operations_base {
    using optional_operations_base::optional_operations_base;
    optional_copy_base() = default;
    // User-provided copy constructor: start from an empty base, then
    // copy-construct the value only when rhs holds one.
    optional_copy_base(const optional_copy_base &rhs)
        : optional_operations_base() {
        if (rhs.has_value()) {
            this->construct(rhs.get());
        } else {
            this->m_has_value = false;
        }
    }
    // The remaining special members are defaulted.
    optional_copy_base(optional_copy_base &&rhs) = default;
    optional_copy_base &operator=(const optional_copy_base &rhs) = default;
    optional_copy_base &operator=(optional_copy_base &&rhs) = default;
};
// optional_move_base decides whether the move constructor stays trivial.
// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
// doesn't implement an analogue to std::is_trivially_move_constructible. We
// have to make do with a non-trivial move constructor even if T is trivially
// move constructible.
#ifndef TL_OPTIONAL_GCC49
// Pass-through version: declares nothing beyond the inherited constructors,
// so the move constructor is the implicitly generated one.
template ::value>
struct optional_move_base : optional_copy_base {
using optional_copy_base::optional_copy_base;
};
#else
// When TL_OPTIONAL_GCC49 is defined only a declaration is provided here, so
// the pass-through version above is not available in that configuration.
template
struct optional_move_base;
#endif
// Version of optional_move_base with a hand-written move constructor
// (presumably the variant chosen when T is not trivially move constructible;
// the template argument list is not visible here, so confirm). All other
// special member functions are defaulted.
template
struct optional_move_base : optional_copy_base {
using optional_copy_base::optional_copy_base;
optional_move_base() = default;
optional_move_base(const optional_move_base &rhs) = default;
// Move constructor: move-constructs the value in place when rhs holds one.
// rhs keeps its engaged flag; only its stored value is moved from.
// The noexcept condition is computed from std::is_nothrow_move_constructible.
optional_move_base(optional_move_base &&rhs) noexcept(
std::is_nothrow_move_constructible::value) {
if (rhs.has_value()) {
this->construct(std::move(rhs.get()));
} else {
// rhs is empty, so this object is explicitly marked empty as well.
this->m_has_value = false;
}
}
optional_move_base &operator=(const optional_move_base &rhs) = default;
optional_move_base &operator=(optional_move_base &&rhs) = default;
};
// optional_copy_assign_base decides whether the copy assignment operator stays
// trivial. This version declares nothing beyond the constructors inherited
// from optional_move_base, so copy assignment is the implicitly generated one.
template
struct optional_copy_assign_base : optional_move_base {
using optional_move_base::optional_move_base;
};
// Version of optional_copy_assign_base with a hand-written copy assignment
// operator (presumably the variant chosen when T's copy operations are not all
// trivial; the template argument list is not visible here, so confirm).
// Every other special member function is defaulted.
template
struct optional_copy_assign_base : optional_move_base {
using optional_move_base::optional_move_base;
optional_copy_assign_base() = default;
optional_copy_assign_base(const optional_copy_assign_base &rhs) = default;
optional_copy_assign_base(optional_copy_assign_base &&rhs) = default;
// Copy assignment: delegates to assign(), which assigns onto, destroys, or
// constructs the stored value depending on which side currently holds one.
optional_copy_assign_base &operator=(const optional_copy_assign_base &rhs) {
this->assign(rhs);
return *this;
}
optional_copy_assign_base &operator=(optional_copy_assign_base &&rhs) =
default;
};
// optional_move_assign_base decides whether the move assignment operator stays
// trivial.
// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
// doesn't implement an analogue to std::is_trivially_move_assignable. We have
// to make do with a non-trivial move assignment operator even if T is trivially
// move assignable.
#ifndef TL_OPTIONAL_GCC49
// Pass-through version: the selecting condition combines three traits, the
// last two being std::is_trivially_move_constructible and
// std::is_trivially_move_assignable. It declares nothing beyond the inherited
// constructors, so move assignment is the implicitly generated one.
template ::value
&&std::is_trivially_move_constructible::value
&&std::is_trivially_move_assignable::value>
struct optional_move_assign_base : optional_copy_assign_base {
using optional_copy_assign_base::optional_copy_assign_base;
};
#else
// When TL_OPTIONAL_GCC49 is defined only a declaration is provided here, so
// the pass-through version above is not available in that configuration.
template
struct optional_move_assign_base;
#endif
// Version of optional_move_assign_base with a hand-written move assignment
// operator (presumably the variant chosen when the trivial-move condition does
// not hold; the template argument list is not visible here, so confirm).
// Every other special member function is defaulted.
template
struct optional_move_assign_base : optional_copy_assign_base {
using optional_copy_assign_base::optional_copy_assign_base;
optional_move_assign_base() = default;
optional_move_assign_base(const optional_move_assign_base &rhs) = default;
optional_move_assign_base(optional_move_assign_base &&rhs) = default;
optional_move_assign_base &operator=(const optional_move_assign_base &rhs) =
default;
// Move assignment: delegates to assign() with rhs cast to an rvalue, so the
// stored value is moved rather than copied. The noexcept condition requires
// both nothrow move construction and nothrow move assignment, because
// assign() may perform either one.
optional_move_assign_base &
operator=(optional_move_assign_base &&rhs) noexcept(
std::is_nothrow_move_constructible::value
&&std::is_nothrow_move_assignable::value) {
this->assign(std::move(rhs));
return *this;
}
};
// optional_delete_ctor_base will conditionally delete copy and move
// constructors depending on whether T is copy/move constructible.
// It has no data members; a class that inherits from it picks up whichever
// constructors are deleted here. The EnableMove flag defaults to
// std::is_move_constructible. This primary version deletes nothing: all
// special member functions are defaulted.
template ::value,
bool EnableMove = std::is_move_constructible::value>
struct optional_delete_ctor_base {
optional_delete_ctor_base() = default;
optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
optional_delete_ctor_base &operator=(const optional_delete_ctor_base &) =
default;
optional_delete_ctor_base &operator=(
optional_delete_ctor_base &&) noexcept = default;
};
// Variant of optional_delete_ctor_base that deletes only the move constructor;
// the copy constructor and both assignment operators stay defaulted.
// (Presumably the "copy enabled, move disabled" specialization; the template
// argument list is not visible here, so confirm.)
template
struct optional_delete_ctor_base {
optional_delete_ctor_base() = default;
optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
optional_delete_ctor_base &operator=(const optional_delete_ctor_base &) =
default;
optional_delete_ctor_base &operator=(
optional_delete_ctor_base &&) noexcept = default;
};
// Variant of optional_delete_ctor_base that deletes only the copy constructor;
// the move constructor and both assignment operators stay defaulted.
// (Presumably the "copy disabled, move enabled" specialization; the template
// argument list is not visible here, so confirm.)
template
struct optional_delete_ctor_base {
optional_delete_ctor_base() = default;
optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
optional_delete_ctor_base &operator=(const optional_delete_ctor_base &) =
default;
optional_delete_ctor_base &operator=(
optional_delete_ctor_base &&) noexcept = default;
};
// Variant of optional_delete_ctor_base that deletes both the copy and the move
// constructor; default construction and both assignment operators stay
// defaulted. (Presumably the "copy disabled, move disabled" specialization;
// the template argument list is not visible here, so confirm.)
template
struct optional_delete_ctor_base {
optional_delete_ctor_base() = default;
optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
optional_delete_ctor_base &operator=(const optional_delete_ctor_base &) =
default;
optional_delete_ctor_base &operator=(
optional_delete_ctor_base &&) noexcept = default;
};
// optional_delete_assign_base will conditionally delete the copy and move
// assignment operators depending on whether T is copy/move constructible +
// assignable. It has no data members; a class that inherits from it picks up
// whichever assignment operators are deleted here. The EnableMove flag
// defaults to "move constructible AND move assignable"; the first flag's
// condition likewise involves std::is_copy_assignable. This primary version
// deletes nothing: all special member functions are defaulted.
template ::value &&
std::is_copy_assignable::value),
bool EnableMove = (std::is_move_constructible::value &&
std::is_move_assignable::value)>
struct optional_delete_assign_base {
optional_delete_assign_base() = default;
optional_delete_assign_base(const optional_delete_assign_base &) = default;
optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
default;
optional_delete_assign_base &operator=(
const optional_delete_assign_base &) = default;
optional_delete_assign_base &operator=(
optional_delete_assign_base &&) noexcept = default;
};
// Variant of optional_delete_assign_base that deletes only the move assignment
// operator; copy assignment and all constructors stay defaulted.
// (Presumably the "copy enabled, move disabled" specialization; the template
// argument list is not visible here, so confirm.)
template
struct optional_delete_assign_base {
optional_delete_assign_base() = default;
optional_delete_assign_base(const optional_delete_assign_base &) = default;
optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
default;
optional_delete_assign_base &operator=(
const optional_delete_assign_base &) = default;
optional_delete_assign_base &operator=(
optional_delete_assign_base &&) noexcept = delete;
};
// Variant of optional_delete_assign_base that deletes only the copy assignment
// operator; move assignment and all constructors stay defaulted.
// (Presumably the "copy disabled, move enabled" specialization; the template
// argument list is not visible here, so confirm.)
template
struct optional_delete_assign_base {
optional_delete_assign_base() = default;
optional_delete_assign_base(const optional_delete_assign_base &) = default;
optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
default;
optional_delete_assign_base &operator=(
const optional_delete_assign_base &) = delete;
optional_delete_assign_base &operator=(
optional_delete_assign_base &&) noexcept = default;
};
// Variant of optional_delete_assign_base that deletes both the copy and the
// move assignment operator; all constructors stay defaulted.
// (Presumably the "copy disabled, move disabled" specialization; the template
// argument list is not visible here, so confirm.)
template
struct optional_delete_assign_base {
optional_delete_assign_base() = default;
optional_delete_assign_base(const optional_delete_assign_base &) = default;
optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
default;
optional_delete_assign_base &operator=(
const optional_delete_assign_base &) = delete;
optional_delete_assign_base &operator=(
optional_delete_assign_base &&) noexcept = delete;
};
} // namespace detail
/// Tag type standing for "an optional that holds no value".
///
/// Its only constructor is explicit and demands two `do_not_use` arguments,
/// so the type cannot be default-constructed and is not an aggregate.
struct nullopt_t {
    /// Placeholder argument type required by the constructor below.
    struct do_not_use {};

    explicit constexpr nullopt_t(do_not_use /*unused*/,
                                 do_not_use /*unused*/) noexcept {}
};

/// The constant used to denote an empty optional.
static constexpr nullopt_t nullopt{
    nullopt_t::do_not_use{},
    nullopt_t::do_not_use{}};
/// Exception type describing an attempt to read the value of an optional that
/// holds none.
class bad_optional_access : public std::exception {
public:
    bad_optional_access() = default;

    /// Returns a fixed, statically allocated description of the error.
    ///
    /// Marked `override` so that any drift from the signature of
    /// std::exception::what() is a compile error instead of silently
    /// declaring an unrelated function that hides the base one.
    const char *what() const noexcept override {
        return "Optional has no value";
    }
};
/// An optional object is an object that contains the storage for another
/// object and manages the lifetime of this contained object, if any. The
/// contained object may be initialized after the optional object has been
/// initialized, and may be destroyed before the optional object has been
/// destroyed. The initialization state of the contained object is tracked by
/// the optional object.
template
class optional : private detail::optional_move_assign_base,
private detail::optional_delete_ctor_base,
private detail::optional_delete_assign_base {
using base = detail::optional_move_assign_base;
static_assert(!std::is_same::value,
"instantiation of optional with in_place_t is ill-formed");
static_assert(!std::is_same, nullopt_t>::value,
"instantiation of optional with nullopt_t is ill-formed");
public:
// The different versions for C++14 and 11 are needed because deduced return
// types are not SFINAE-safe. This provides better support for things like
// generic lambdas. C.f.
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0.html
#if defined(TL_OPTIONAL_CXX14) && !defined(TL_OPTIONAL_GCC49) && \
!defined(TL_OPTIONAL_GCC54) && !defined(TL_OPTIONAL_GCC55)
/// Carries out some operation which returns an optional on the stored
/// object if there is one.
template
TL_OPTIONAL_11_CONSTEXPR auto and_then(F &&f) & {
using result = detail::invoke_result_t