Showing preview only (4,015K chars total). Download the full file or copy to clipboard to get everything.
Repository: mtkresearch/BreezyVoice
Branch: main
Commit: d592c9d3e892
Files: 158
Total size: 3.5 MB
Directory structure:
gitextract_wldclo_i/
├── .dockerignore
├── Dockerfile
├── LICENSE
├── README.md
├── api.py
├── batch_inference.py
├── compose.yaml
├── cosyvoice/
│ ├── __init__.py
│ ├── bin/
│ │ ├── inference.py
│ │ └── train.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── cosyvoice.py
│ │ ├── frontend.py
│ │ └── model.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ └── processor.py
│ ├── flow/
│ │ ├── decoder.py
│ │ ├── flow.py
│ │ ├── flow_matching.py
│ │ └── length_regulator.py
│ ├── hifigan/
│ │ ├── f0_predictor.py
│ │ └── generator.py
│ ├── llm/
│ │ └── llm.py
│ ├── transformer/
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── attention.py
│ │ ├── convolution.py
│ │ ├── decoder.py
│ │ ├── decoder_layer.py
│ │ ├── embedding.py
│ │ ├── encoder.py
│ │ ├── encoder_layer.py
│ │ ├── label_smoothing_loss.py
│ │ ├── positionwise_feed_forward.py
│ │ └── subsampling.py
│ └── utils/
│ ├── __init__.py
│ ├── class_utils.py
│ ├── common.py
│ ├── executor.py
│ ├── file_utils.py
│ ├── frontend_utils.py
│ ├── mask.py
│ ├── scheduler.py
│ └── train_utils.py
├── data/
│ └── batch_files.csv
├── openai_api_inference.py
├── requirements.txt
├── results/
│ └── .gitkeep
├── run_batch_inference.sh
├── run_single_inference.sh
├── single_inference.py
├── third_party/
│ └── Matcha-TTS/
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── Makefile
│ ├── README.md
│ ├── configs/
│ │ ├── __init__.py
│ │ ├── callbacks/
│ │ │ ├── default.yaml
│ │ │ ├── model_checkpoint.yaml
│ │ │ ├── model_summary.yaml
│ │ │ ├── none.yaml
│ │ │ └── rich_progress_bar.yaml
│ │ ├── data/
│ │ │ ├── hi-fi_en-US_female.yaml
│ │ │ ├── ljspeech.yaml
│ │ │ └── vctk.yaml
│ │ ├── debug/
│ │ │ ├── default.yaml
│ │ │ ├── fdr.yaml
│ │ │ ├── limit.yaml
│ │ │ ├── overfit.yaml
│ │ │ └── profiler.yaml
│ │ ├── eval.yaml
│ │ ├── experiment/
│ │ │ ├── hifi_dataset_piper_phonemizer.yaml
│ │ │ ├── ljspeech.yaml
│ │ │ ├── ljspeech_min_memory.yaml
│ │ │ └── multispeaker.yaml
│ │ ├── extras/
│ │ │ └── default.yaml
│ │ ├── hparams_search/
│ │ │ └── mnist_optuna.yaml
│ │ ├── hydra/
│ │ │ └── default.yaml
│ │ ├── local/
│ │ │ └── .gitkeep
│ │ ├── logger/
│ │ │ ├── aim.yaml
│ │ │ ├── comet.yaml
│ │ │ ├── csv.yaml
│ │ │ ├── many_loggers.yaml
│ │ │ ├── mlflow.yaml
│ │ │ ├── neptune.yaml
│ │ │ ├── tensorboard.yaml
│ │ │ └── wandb.yaml
│ │ ├── model/
│ │ │ ├── cfm/
│ │ │ │ └── default.yaml
│ │ │ ├── decoder/
│ │ │ │ └── default.yaml
│ │ │ ├── encoder/
│ │ │ │ └── default.yaml
│ │ │ ├── matcha.yaml
│ │ │ └── optimizer/
│ │ │ └── adam.yaml
│ │ ├── paths/
│ │ │ └── default.yaml
│ │ ├── train.yaml
│ │ └── trainer/
│ │ ├── cpu.yaml
│ │ ├── ddp.yaml
│ │ ├── ddp_sim.yaml
│ │ ├── default.yaml
│ │ ├── gpu.yaml
│ │ └── mps.yaml
│ ├── matcha/
│ │ ├── VERSION
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── cli.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── components/
│ │ │ │ └── __init__.py
│ │ │ └── text_mel_datamodule.py
│ │ ├── hifigan/
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── denoiser.py
│ │ │ ├── env.py
│ │ │ ├── meldataset.py
│ │ │ ├── models.py
│ │ │ └── xutils.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baselightningmodule.py
│ │ │ ├── components/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── decoder.py
│ │ │ │ ├── flow_matching.py
│ │ │ │ ├── text_encoder.py
│ │ │ │ └── transformer.py
│ │ │ └── matcha_tts.py
│ │ ├── onnx/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ └── infer.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ ├── cleaners.py
│ │ │ ├── numbers.py
│ │ │ └── symbols.py
│ │ ├── train.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── generate_data_statistics.py
│ │ ├── instantiators.py
│ │ ├── logging_utils.py
│ │ ├── model.py
│ │ ├── monotonic_align/
│ │ │ ├── __init__.py
│ │ │ ├── core.c
│ │ │ ├── core.pyx
│ │ │ └── setup.py
│ │ ├── pylogger.py
│ │ ├── rich_utils.py
│ │ └── utils.py
│ ├── matcha_tts.egg-info/
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ ├── entry_points.txt
│ │ ├── requires.txt
│ │ └── top_level.txt
│ ├── notebooks/
│ │ └── .gitkeep
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── scripts/
│ │ └── schedule.sh
│ ├── setup.py
│ └── synthesis.ipynb
└── utils/
└── word_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
.gitattributes
# CI
.codeclimate.yml
.travis.yml
.taskcluster.yml
# Docker
docker-compose.yml
Dockerfile
.docker
.dockerignore
# Byte-compiled / optimized / DLL files
**/__pycache__/
**/*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Virtual environment
.env
.venv/
venv/
# PyCharm
.idea
# Python mode for VIM
.ropeproject
**/.ropeproject
# Vim swap files
**/*.swp
# VS Code
.vscode/
================================================
FILE: Dockerfile
================================================
# CUDA 11.8 + cuDNN 8 runtime on Ubuntu 22.04 — matches the onnxruntime-gpu /
# cudnn setup described in the README.
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
WORKDIR /breezyvoice
# uv: copy files out of the cache mount instead of hardlinking across filesystems.
ENV UV_LINK_MODE=copy
# The uv installer drops its binaries under /root/.local/bin.
ENV PATH="/root/.local/bin/:$PATH"
ADD https://astral.sh/uv/install.sh /uv-installer.sh
# One layer: system deps (ffmpeg for audio IO), install uv, clean apt lists,
# and create a Python 3.10 virtualenv at /breezyvoice/.venv.
RUN apt-get update && \
apt-get install -y --no-install-recommends curl ca-certificates ffmpeg&& \
sh /uv-installer.sh && rm /uv-installer.sh && \
apt-get clean && rm -rf /var/lib/apt/lists/* && \
uv venv -p 3.10
# Copy requirements first so the dependency layer is cached independently of
# source-code changes.
COPY requirements.txt /breezyvoice/requirements.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements.txt --index-strategy unsafe-best-match
COPY . .
EXPOSE 8080
# The container runs `python <arg>`; compose.yaml supplies `api.py` as command.
ENTRYPOINT ["/breezyvoice/.venv/bin/python"]
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# BreezyVoice
BreezyVoice is a voice-cloning text-to-speech system specifically adapted for Taiwanese Mandarin, highlighting phonetic control abilities via auxiliary 注音 (bopomofo) inputs. BreezyVoice is partially derived from [CosyVoice](https://github.com/FunAudioLLM/CosyVoice). BreezyVoice is part of the [Breeze2 family](https://huggingface.co/collections/MediaTek-Research/breeze2-family-67863158443a06a72dd29900)
<img src="https://raw.githubusercontent.com/mtkresearch/BreezyVoice/main/images/flowchart.png" alt="flowchart" width="700"/>
🚀 **Try out our interactive [UI playground](https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground) now!** 🚀
🚀 **[立即體驗 BreezyVoice 語音合成](https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground) !** 🚀
Or visit one of these resources:
- [Playground (CLI Inference)](https://www.kaggle.com/code/a24998667/breezyvoice-playground)
- [Model](https://huggingface.co/MediaTek-Research/BreezyVoice/tree/main)
- [Paper](https://arxiv.org/abs/2501.17790)
Repo Main Contributors: Chia-Chun Lin, Chan-Jan Hsu
## Features
🔥 BreezyVoice outperforms competing commercial services in terms of naturalness.
<img src="https://raw.githubusercontent.com/mtkresearch/BreezyVoice/main/images/comparisons.png" alt="comparisons" width="350"/>
🔥 BreezyVoice is highly competitive at code-switching scenarios.
| Code-Switching Term Category | **BreezyVoice** | Z | Y | U | M |
|-------------|--------------|---|---|---|---|
| **General Words** | **8** | 5 | **8** | **8** | 7 |
| **Entities**| **9** | 6 | 4 | 7 | 4 |
| **Abbreviations** | **9** | 8 | 6 | 6 | 7 |
| **Toponyms**| 3 | 3 | **7** | 3 | 4 |
| **Full Sentences**| 7 | 7 | **8** | 5 | 3 |
🔥 BreezyVoice supports automatic 注音 annotation, as well as manual 注音 correction (See Inference).
## Install
**Clone and install**
- Clone the repo
``` sh
git clone https://github.com/mtkresearch/BreezyVoice.git
# If you failed to clone submodule due to network failures, please run following command until success
cd BreezyVoice
```
- Install Requirements (requires Python3.10)
```
pip uninstall onnxruntime # use onnxruntime-gpu instead of onnxruntime
pip install -r requirements.txt
```
(The model is runnable on CPU, please change onnxruntime-gpu to onnxruntime in `requirements.txt` if you do not have GPU in your environment)
You might need to install cudnn depending on cuda version
```
sudo apt-get -y install cudnn9-cuda-11
```
## Inference
UTF8 encoding is required:
``` sh
export PYTHONUTF8=1
```
---
**Run single_inference.py with the following arguments:**
- `--content_to_synthesize`:
- **Description**: Specifies the content that will be synthesized into speech. Phonetic symbols can optionally be included but should be used sparingly, as shown in the examples below:
- Simple text: `"今天天氣真好"`
- Text with phonetic symbols: `"今天天氣真好[:ㄏㄠ3]"`
- `--speaker_prompt_audio_path`:
- **Description**: Specifies the path to the prompt speech audio file for setting the style of the speaker. Use your custom audio file or our example file:
- Example audio: `./data/example.wav`
- `--speaker_prompt_text_transcription` (optional):
- **Description**: Specifies the transcription of the speaker prompt audio. Providing this input is highly recommended for better accuracy. If not provided, the system will automatically transcribe the audio using Whisper.
- Example text for the audio file: `"在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程才能將密文還原為正常可讀的內容。"`
- `--output_path` (optional):
- **Description**: Specifies the name and path for the output `.wav` file. If not provided, the default path is used.
- **Default Value**: `results/output.wav`
- Example: `[your_file_name].wav`
- `--model_path` (optional):
- **Description**: Specifies the pre-trained model used for speech synthesis.
- **Default Value**: `MediaTek-Research/BreezyVoice`
**Example Usage:**
``` bash
bash run_single_inference.sh
```
``` python
# python single_inference.py --content_to_synthesize [text to be converted into audio] --speaker_prompt_text_transcription [transcription of the prompt audio] --speaker_prompt_audio_path [reference audio file]
python single_inference.py --content_to_synthesize "今天天氣真好" --speaker_prompt_text_transcription "在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程才能將密文還原為正常可讀的內容。" --speaker_prompt_audio_path "./data/example.wav"
```
``` python
# python single_inference.py --content_to_synthesize [text to be converted into audio] --speaker_prompt_audio_path [reference audio file]
python single_inference.py --content_to_synthesize "今天天氣真好[:ㄏㄠ3]" --speaker_prompt_audio_path "./data/example.wav"
```
---
**Run `batch_inference.py` with the following arguments:**
- `--csv_file`:
- **Description**: Path to the CSV file that contains the input data for batch processing.
- **Example**: `./data/batch_files.csv`
- `--speaker_prompt_audio_folder`:
- **Description**: Path to the folder containing the speaker prompt audio files. The files in this folder are used to set the style of the speaker for each synthesis task.
- **Example**: `./data`
- `--output_audio_folder`:
- **Description**: Path to the folder where the output audio files will be saved. Each processed row in the CSV will result in a synthesized audio file stored in this folder.
- **Example**: `./results`
**CSV File Structure:**
The CSV file should contain the following columns:
- **`speaker_prompt_audio_filename`**:
- **Description**: The filename (without extension) of the speaker prompt audio file that will be used to guide the style of the generated speech.
- **Example**: `example`
- **`speaker_prompt_text_transcription`**:
- **Description**: The transcription of the speaker prompt audio. This field is optional but highly recommended to improve transcription accuracy. If not provided, the system will attempt to transcribe the audio using Whisper.
- **Example**: `"在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。"`
- **`content_to_synthesize`**:
- **Description**: The content that will be synthesized into speech. You can include phonetic symbols if needed, though they should be used sparingly.
- **Example**: `"今天天氣真好"`
- **`output_audio_filename`**:
- **Description**: The filename (without extension) for the generated output audio. The audio will be saved as a `.wav` file in the output folder.
- **Example**: `output`
**Example Usage:**
``` bash
bash run_batch_inference.sh
```
```bash
python batch_inference.py \
--csv_file ./data/batch_files.csv \
--speaker_prompt_audio_folder ./data \
--output_audio_folder ./results
```
### Docker and OpenAI Compatible API
``` bash
$ docker compose up -d --build
# after the container is up
$ pip install openai
$ python openai_api_inference.py
```
---
If you like our work, please cite:
```
@article{hsu2025breezyvoice,
title={BreezyVoice: Adapting TTS for Taiwanese Mandarin with Enhanced Polyphone Disambiguation--Challenges and Insights},
author={Hsu, Chan-Jan and Lin, Yi-Cheng and Lin, Chia-Chun and Chen, Wei-Chih and Chung, Ho Lam and Li, Chen-An and Chen, Yi-Chang and Yu, Chien-Yu and Lee, Ming-Ji and Chen, Chien-Cheng and others},
journal={arXiv preprint arXiv:2501.17790},
year={2025}
}
@article{hsu2025breeze,
title={The Breeze 2 Herd of Models: Traditional Chinese LLMs Based on Llama with Vision-Aware and Function-Calling Capabilities},
author={Hsu, Chan-Jan and Liu, Chia-Sheng and Chen, Meng-Hsi and Chen, Muxi and Hsu, Po-Chun and Chen, Yi-Chang and Shiu, Da-Shan},
journal={arXiv preprint arXiv:2501.13921},
year={2025}
}
@article{du2024cosyvoice,
title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens},
author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others},
journal={arXiv preprint arXiv:2407.05407},
year={2024}
}
```
================================================
FILE: api.py
================================================
# OpenAI API Spec. Reference: https://platform.openai.com/docs/api-reference/audio/createSpeech
from contextlib import asynccontextmanager
from io import BytesIO
import torchaudio
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from g2pw import G2PWConverter
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings
from cosyvoice.utils.file_utils import load_wav
from single_inference import CustomCosyVoice, get_bopomofo_rare
class Settings(BaseSettings):
    """Server configuration; every field can be overridden via environment
    variables thanks to pydantic ``BaseSettings``."""

    # Shared secret for authenticating callers. NOTE(review): none of the
    # visible endpoints check this value — confirm whether authentication is
    # enforced elsewhere (e.g. a reverse proxy).
    api_key: str = Field(
        default="", description="Specifies the API key used to authenticate the user."
    )
    # Hugging Face model id (or local path) loaded by CustomCosyVoice at startup.
    model_path: str = Field(
        default="MediaTek-Research/BreezyVoice",
        description="Specifies the model used for speech synthesis.",
    )
    # Reference audio that fixes the cloned speaker's voice for all requests.
    speaker_prompt_audio_path: str = Field(
        default="./data/example.wav",
        description="Specifies the path to the prompt speech audio file of the speaker.",
    )
    # Transcription of the prompt audio; used as the zero-shot prompt text.
    speaker_prompt_text_transcription: str = Field(
        default="在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。",
        description="Specifies the transcription of the speaker prompt audio.",
    )
class SpeechRequest(BaseModel):
    """Request body for POST /audio/speech (OpenAI createSpeech-compatible)."""

    # Accepted for OpenAI API compatibility; the served model is fixed at startup.
    model: str = ""
    input: str = Field(
        description="The content that will be synthesized into speech. You can include phonetic symbols if needed, though they should be used sparingly.",
        examples=["今天天氣真好"],
    )
    # Accepted for API compatibility but not read by the visible handler;
    # output is always WAV.
    response_format: str = ""
    # Accepted for API compatibility but not read by the visible handler.
    speed: float = 1.0
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load heavy resources once at startup.

    Startup populates ``app.state`` with the settings, the TTS model, the
    G2PW bopomofo converter, and the speaker prompt waveform loaded at
    16 kHz.  Shutdown drops the model references so they can be collected.
    """
    app.state.settings = Settings()
    app.state.cosyvoice = CustomCosyVoice(app.state.settings.model_path)
    app.state.bopomofo_converter = G2PWConverter()
    # 16000 is the sample rate the model consumes for the speaker prompt.
    app.state.prompt_speech_16k = load_wav(
        app.state.settings.speaker_prompt_audio_path, 16000
    )
    yield
    del app.state.cosyvoice
    del app.state.bopomofo_converter
app = FastAPI(lifespan=lifespan, root_path="/v1")


@app.get("/models")
async def get_models(request: Request):
    """Return the single locally-served model (OpenAI /v1/models shape)."""
    model_entry = {
        "id": request.app.state.settings.model_path,
        "object": "model",
        "created": 0,
        "owned_by": "local",
    }
    return {"object": "list", "data": [model_entry]}
@app.post("/audio/speech")
async def speach_endpoint(request: Request, payload: SpeechRequest):
    """Synthesize ``payload.input`` as speech and stream it back as WAV."""
    state = request.app.state

    # Normalize both the fixed speaker transcription and the requested text.
    prompt_text = state.cosyvoice.frontend.text_normalize_new(
        state.settings.speaker_prompt_text_transcription, split=False
    )
    tts_text = state.cosyvoice.frontend.text_normalize_new(
        payload.input, split=False
    )

    # Annotate rare characters with bopomofo before synthesis.
    prompt_text_bopomo = get_bopomofo_rare(prompt_text, state.bopomofo_converter)
    tts_text_bopomo = get_bopomofo_rare(tts_text, state.bopomofo_converter)

    output = state.cosyvoice.inference_zero_shot_no_normalize(
        tts_text_bopomo,
        prompt_text_bopomo,
        state.prompt_speech_16k,
    )

    # Serialize the 22.05 kHz waveform into an in-memory WAV container.
    wav_buffer = BytesIO()
    torchaudio.save(wav_buffer, output["tts_speech"], 22050, format="wav")
    wav_buffer.seek(0)
    return StreamingResponse(
        wav_buffer,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=output.wav"},
    )
if __name__ == "__main__":
    # Local dev entry point; the Docker image starts the app the same way by
    # running this file with the venv's python (see compose.yaml `command`).
    import uvicorn
    uvicorn.run("api:app", host="0.0.0.0", port=8080)
================================================
FILE: batch_inference.py
================================================
import os
import time
import subprocess
import argparse
import pandas as pd
from datasets import Dataset
from single_inference import single_inference, CustomCosyVoice
from g2pw import G2PWConverter
def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_folder, model):
    """Synthesize one audio file per row of ``csv_file``.

    Args:
        csv_file: Path to a CSV with columns ``speaker_prompt_audio_filename``,
            ``speaker_prompt_text_transcription`` (optional),
            ``content_to_synthesize`` and ``output_audio_filename``.
        speaker_prompt_audio_folder: Folder holding
            ``<speaker_prompt_audio_filename>.wav`` prompt files.
        output_audio_folder: Folder where ``<output_audio_filename>.wav``
            results are written.
        model: ``(CustomCosyVoice, G2PWConverter)`` pair shared across rows.
    """
    # Load CSV with pandas, then convert to a HuggingFace Dataset for map().
    data = pd.read_csv(csv_file)
    dataset = Dataset.from_pandas(data)
    # Shuffle so concurrent runs over the same CSV are unlikely to collide on
    # the same not-yet-generated rows.
    dataset = dataset.shuffle(seed=int(time.time() * 1000))

    cosyvoice, bopomofo_converter = model

    def gen_audio(row):
        # Map one CSV row to one synthesized wav; existing outputs are
        # skipped so an interrupted run can be resumed.
        speaker_prompt_audio_path = os.path.join(
            speaker_prompt_audio_folder,
            f"{row['speaker_prompt_audio_filename']}.wav",
        )
        speaker_prompt_text_transcription = row['speaker_prompt_text_transcription']
        # A missing transcription cell comes back from pandas as NaN (float);
        # pass None instead so downstream code falls back to automatic
        # transcription rather than receiving a non-string value.
        if not isinstance(speaker_prompt_text_transcription, str):
            speaker_prompt_text_transcription = None
        content_to_synthesize = row['content_to_synthesize']
        output_audio_path = os.path.join(
            output_audio_folder, f"{row['output_audio_filename']}.wav"
        )

        if not os.path.exists(speaker_prompt_audio_path):
            print(f"File {speaker_prompt_audio_path} does not exist")
            return row
        if not os.path.exists(output_audio_path):
            single_inference(
                speaker_prompt_audio_path,
                content_to_synthesize,
                output_audio_path,
                cosyvoice,
                bopomofo_converter,
                speaker_prompt_text_transcription,
            )
        # Always return the row: Dataset.map requires the mapped function to
        # return a dict-like example (the original fell through to None here).
        return row

    dataset = dataset.map(gen_audio, num_proc=1)
def main():
    """CLI entry point: parse arguments, load models once, run the batch."""
    parser = argparse.ArgumentParser(description="Batch process audio generation.")
    parser.add_argument("--csv_file", required=True,
                        help="Path to the CSV file containing input data.")
    parser.add_argument("--speaker_prompt_audio_folder", required=True,
                        help="Path to the folder containing speaker prompt audio files.")
    parser.add_argument("--output_audio_folder", required=True,
                        help="Path to the folder where results will be stored.")
    # Default aligned with the rest of the project: the README and api.py both
    # use "MediaTek-Research/BreezyVoice" (the old "-300M" suffix here was
    # inconsistent with them).
    parser.add_argument("--model_path", type=str, required=False,
                        default="MediaTek-Research/BreezyVoice",
                        help="Specifies the model used for speech synthesis.")
    args = parser.parse_args()

    # Load the heavy models a single time and share them across all rows.
    cosyvoice = CustomCosyVoice(args.model_path)
    bopomofo_converter = G2PWConverter()

    os.makedirs(args.output_audio_folder, exist_ok=True)
    process_batch(
        csv_file=args.csv_file,
        speaker_prompt_audio_folder=args.speaker_prompt_audio_folder,
        output_audio_folder=args.output_audio_folder,
        model=(cosyvoice, bopomofo_converter),
    )


if __name__ == "__main__":
    main()
================================================
FILE: compose.yaml
================================================
services:
app:
image: breezyvoice:latest
build: .
ports:
- "8080:8080"
volumes:
- hf-cache:/root/.cache/huggingface/
command: api.py
init: true
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
volumes:
hf-cache:
================================================
FILE: cosyvoice/__init__.py
================================================
================================================
FILE: cosyvoice/bin/inference.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
import os
import torch
from torch.utils.data import DataLoader
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from tqdm import tqdm
from cosyvoice.cli.model import CosyVoiceModel
from cosyvoice.dataset.dataset import Dataset
def get_args():
    """Parse the command-line options for offline CosyVoice inference.

    Returns:
        argparse.Namespace with config/model paths, gpu id, mode and result dir.
    """
    parser = argparse.ArgumentParser(description='inference with your model')
    # Required path arguments, declared table-style to avoid repetition.
    for flag, help_text in (
            ('--config', 'config file'),
            ('--prompt_data', 'prompt data file'),
            ('--prompt_utt2data', 'prompt data file'),
            ('--tts_text', 'tts input file'),
            ('--llm_model', 'llm model file'),
            ('--flow_model', 'flow model file'),
            ('--hifigan_model', 'hifigan model file'),
    ):
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='gpu id for this rank, -1 for cpu')
    parser.add_argument('--mode',
                        default='sft',
                        choices=['sft', 'zero_shot'],
                        help='inference mode')
    parser.add_argument('--result_dir', required=True, help='asr result file')
    args = parser.parse_args()
    print(args)
    return args
def main():
    """Offline batch inference: load the three CosyVoice sub-models, run the
    prompt/tts dataset through them, and write one wav per utterance plus a
    Kaldi-style wav.scp index into args.result_dir.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # Init cosyvoice models from configs
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    with open(args.config, 'r') as f:
        configs = load_hyperpyyaml(f)

    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load(args.llm_model, args.flow_model, args.hifigan_model)

    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'],
                           mode='inference', shuffle=False, partition=False,
                           tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    del configs
    os.makedirs(args.result_dir, exist_ok=True)
    fn = os.path.join(args.result_dir, 'wav.scp')
    # BUGFIX: open wav.scp in a context manager so the index file is closed
    # (and its buffer flushed) even if inference raises part-way through.
    with open(fn, 'w') as f, torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(test_data_loader)):
            utts = batch["utts"]
            assert len(utts) == 1, "inference mode only support batchsize 1"
            text = batch["text"]
            text_token = batch["text_token"].to(device)
            text_token_len = batch["text_token_len"].to(device)
            tts_text = batch["tts_text"]
            tts_index = batch["tts_index"]
            tts_text_token = batch["tts_text_token"].to(device)
            tts_text_token_len = batch["tts_text_token_len"].to(device)
            speech_token = batch["speech_token"].to(device)
            speech_token_len = batch["speech_token_len"].to(device)
            speech_feat = batch["speech_feat"].to(device)
            speech_feat_len = batch["speech_feat_len"].to(device)
            utt_embedding = batch["utt_embedding"].to(device)
            spk_embedding = batch["spk_embedding"].to(device)
            if args.mode == 'sft':
                # SFT mode conditions only on the pre-computed speaker embedding.
                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
            else:
                # Zero-shot mode additionally conditions on the prompt text,
                # prompt speech tokens and prompt mel features.
                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
            model_output = model.inference(**model_input)
            tts_key = '{}_{}'.format(utts[0], tts_index[0])
            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
            f.write('{} {}\n'.format(tts_key, tts_fn))
            f.flush()
    logging.info('Result wav.scp saved in {}'.format(fn))


if __name__ == '__main__':
    main()
================================================
FILE: cosyvoice/bin/train.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import datetime
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from copy import deepcopy
import torch
import torch.distributed as dist
import deepspeed
from hyperpyyaml import load_hyperpyyaml
from torch.distributed.elastic.multiprocessing.errors import record
from cosyvoice.utils.executor import Executor
from cosyvoice.utils.train_utils import (
init_distributed,
init_dataset_and_dataloader,
init_optimizer_and_scheduler,
init_summarywriter, save_model,
wrap_cuda_model, check_modify_and_save_config)
def get_args():
    """Build the training argument parser (including DeepSpeed's own flags)
    and parse the command line.
    """
    p = argparse.ArgumentParser(description='training your network')
    p.add_argument('--train_engine', default='torch_ddp',
                   choices=['torch_ddp', 'deepspeed'],
                   help='Engine for paralleled training')
    p.add_argument('--model', required=True, help='model which will be trained')
    p.add_argument('--config', required=True, help='config file')
    p.add_argument('--train_data', required=True, help='train data file')
    p.add_argument('--cv_data', required=True, help='cv data file')
    p.add_argument('--checkpoint', help='checkpoint model')
    p.add_argument('--model_dir', required=True, help='save model dir')
    p.add_argument('--tensorboard_dir', default='tensorboard',
                   help='tensorboard log dir')
    # dest renames the dotted flag to a valid attribute name.
    p.add_argument('--ddp.dist_backend', dest='dist_backend',
                   default='nccl', choices=['nccl', 'gloo'],
                   help='distributed backend')
    p.add_argument('--num_workers', default=0, type=int,
                   help='num of subprocess workers for reading')
    p.add_argument('--prefetch', default=100, type=int,
                   help='prefetch number')
    p.add_argument('--pin_memory', action='store_true', default=False,
                   help='Use pinned memory buffers used for reading')
    p.add_argument('--deepspeed.save_states', dest='save_states',
                   default='model_only',
                   choices=['model_only', 'model+optimizer'],
                   help='save model/optimizer states')
    p.add_argument('--timeout', default=30, type=int,
                   help='timeout (in seconds) of cosyvoice_join.')
    # Let DeepSpeed register its --deepspeed / --deepspeed_config flags.
    parser = deepspeed.add_config_arguments(p)
    return parser.parse_args()
@record
def main():
    """Train one CosyVoice sub-model (llm, flow or hift) under DDP/DeepSpeed.

    Builds only the sub-model named by --model, wires up the distributed
    environment, datasets, optimizer and scheduler, then runs the epoch loop.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Only instantiate the sub-model being trained; the other two config keys
    # are overridden to None so hyperpyyaml skips constructing them.
    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
    with open(args.config, 'r') as f:
        configs = load_hyperpyyaml(f, overrides=override_dict)
    # Command-line args take precedence over / augment the YAML train_conf.
    configs['train_conf'].update(vars(args))

    # Init env for ddp
    init_distributed(args)

    # Get dataset & dataloader
    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
        init_dataset_and_dataloader(args, configs)

    # Do some sanity checks and save config to args.model_dir
    configs = check_modify_and_save_config(args, configs)

    # Tensorboard summary
    writer = init_summarywriter(args)

    # load checkpoint (CPU first; wrap_cuda_model moves it to the device)
    model = configs[args.model]
    if args.checkpoint is not None:
        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))

    # Dispatch model from cpu to gpu
    model = wrap_cuda_model(args, model)

    # Get optimizer & scheduler
    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)

    # Save init checkpoints
    info_dict = deepcopy(configs['train_conf'])
    save_model(model, 'init', info_dict)

    # Get executor
    executor = Executor()

    # Start training loop
    for epoch in range(info_dict['max_epoch']):
        executor.epoch = epoch
        train_dataset.set_epoch(epoch)
        dist.barrier()
        # A per-epoch gloo group with a short timeout lets ranks detect peers
        # that ran out of data; it is torn down after every epoch.
        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
        # NOTE(review): 'train_one_epoc' (sic) — presumably matches the method
        # name on Executor; verify before renaming.
        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
        dist.destroy_process_group(group_join)


if __name__ == '__main__':
    main()
================================================
FILE: cosyvoice/cli/__init__.py
================================================
================================================
FILE: cosyvoice/cli/cosyvoice.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from hyperpyyaml import load_hyperpyyaml
from huggingface_hub import snapshot_download
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
from cosyvoice.cli.model import CosyVoiceModel
class CosyVoice:
    """High-level TTS wrapper bundling the text/audio frontend and the
    three-stage CosyVoice model, with one entry point per inference mode."""

    def __init__(self, model_dir):
        """Load configs, frontend resources and model weights from model_dir
        (a local path, or a Hugging Face repo id that will be downloaded)."""
        # '-Instruct' checkpoints additionally support instruct-mode inference.
        instruct = True if '-Instruct' in model_dir else False
        self.model_dir = model_dir
        if not os.path.exists(model_dir):
            model_dir = snapshot_download(model_dir)
        with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
            configs = load_hyperpyyaml(f)
        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                          configs['feat_extractor'],
                                          '{}/campplus.onnx'.format(model_dir),
                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                          '{}/spk2info.pt'.format(model_dir),
                                          instruct,
                                          configs['allowed_special'])
        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
        self.model.load('{}/llm.pt'.format(model_dir),
                        '{}/flow.pt'.format(model_dir),
                        '{}/hift.pt'.format(model_dir))
        del configs

    def _synthesize(self, tts_text, build_model_input):
        """Shared synthesis loop: split tts_text into segments, build the
        model input for each segment via build_model_input, run the model,
        and concatenate the resulting speech along the time axis."""
        tts_speeches = []
        for i in self.frontend.text_normalize(tts_text, split=True):
            model_input = build_model_input(i)
            model_output = self.model.inference(**model_input)
            tts_speeches.append(model_output['tts_speech'])
        return {'tts_speech': torch.concat(tts_speeches, dim=1)}

    def list_avaliable_spks(self):
        """Return the ids of the built-in speakers.

        NOTE: the 'avaliable' typo is kept for backward compatibility.
        """
        spks = list(self.frontend.spk2info.keys())
        return spks

    def inference_sft(self, tts_text, spk_id):
        """Synthesize tts_text with a built-in speaker id."""
        return self._synthesize(
            tts_text, lambda i: self.frontend.frontend_sft(i, spk_id))

    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
        """Synthesize tts_text cloning the voice of a 16 kHz prompt recording."""
        prompt_text = self.frontend.text_normalize(prompt_text, split=False)
        return self._synthesize(
            tts_text,
            lambda i: self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k))

    def inference_cross_lingual(self, tts_text, prompt_speech_16k):
        """Synthesize tts_text in another language from a 16 kHz prompt."""
        if self.frontend.instruct is True:
            raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
        return self._synthesize(
            tts_text,
            lambda i: self.frontend.frontend_cross_lingual(i, prompt_speech_16k))

    def inference_instruct(self, tts_text, spk_id, instruct_text):
        """Synthesize tts_text with a built-in speaker, steered by instruct_text."""
        if self.frontend.instruct is False:
            raise ValueError('{} do not support instruct inference'.format(self.model_dir))
        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
        return self._synthesize(
            tts_text,
            lambda i: self.frontend.frontend_instruct(i, spk_id, instruct_text))
================================================
FILE: cosyvoice/cli/frontend.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import onnxruntime
import torch
import numpy as np
import whisper
from typing import Callable
import torchaudio.compliance.kaldi as kaldi
import torchaudio
import os
import re
import inflect
try:
import ttsfrd
use_ttsfrd = True
except ImportError:
print("failed to import ttsfrd, use WeTextProcessing instead")
from tn.chinese.normalizer import Normalizer as ZhNormalizer
from tn.english.normalizer import Normalizer as EnNormalizer
use_ttsfrd = False
from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
class CosyVoiceFrontEnd:
    """Preprocessing frontend for CosyVoice.

    Handles text normalization/tokenization, speech-token extraction (ONNX),
    speaker-embedding extraction (CAMPPlus ONNX) and mel-feature extraction,
    and assembles the model-input dicts for each inference mode.
    """

    def __init__(self,
                 get_tokenizer: Callable,
                 feat_extractor: Callable,
                 model_dir: str,
                 campplus_model: str,
                 speech_tokenizer_model: str,
                 spk2info: str = '',
                 instruct: bool = False,
                 allowed_special: str = 'all'):
        # NOTE(review): cosyvoice/cli/cosyvoice.py constructs this class with
        # only 7 positional arguments (no model_dir), which would shift every
        # argument after feat_extractor by one — confirm which signature the
        # live callers actually use.
        self.tokenizer = get_tokenizer()
        self.feat_extractor = feat_extractor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        # Speaker-embedding model is pinned to CPU; the speech tokenizer uses
        # CUDA when available.
        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=["CUDAExecutionProvider" if torch.cuda.is_available() else "CPUExecutionProvider"])
        # spk2info is only loaded if the file exists; frontend_sft reads
        # self.spk2info, so SFT mode presumably requires it — verify.
        if os.path.exists(spk2info):
            self.spk2info = torch.load(spk2info, map_location=self.device)
        self.instruct = instruct
        self.allowed_special = allowed_special
        self.inflect_parser = inflect.engine()
        # use_ttsfrd is the module-level flag set by the ttsfrd import attempt.
        self.use_ttsfrd = use_ttsfrd
        if self.use_ttsfrd:
            self.frd = ttsfrd.TtsFrontendEngine()
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))  # unused here
            assert self.frd.initialize('{}/CosyVoice-ttsfrd/resource'.format(model_dir)) is True, 'failed to initialize ttsfrd resource'
            self.frd.set_lang_type('pinyin')
            self.frd.enable_pinyin_mix(True)
            self.frd.set_breakmodel_index(1)
        else:
            # Fallback: WeTextProcessing normalizers for Chinese and English.
            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
            self.en_tn_model = EnNormalizer()

    def _extract_text_token(self, text):
        """Tokenize text; return (token ids [1, T] int32, length [1] int32) on self.device."""
        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
        return text_token, text_token_len

    def _extract_speech_token(self, speech):
        """Run the ONNX speech tokenizer on a 128-mel log spectrogram of speech;
        return (speech tokens [1, T] int32, length [1] int32)."""
        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        speech_token = self.speech_tokenizer_session.run(None, {self.speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
                                                                self.speech_tokenizer_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        """Compute a speaker embedding from mean-normalized 80-dim fbank
        features via the CAMPPlus ONNX session; assumes 16 kHz input."""
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        # Cepstral mean normalization over time before embedding extraction.
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
        embedding = torch.tensor([embedding]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        """Extract mel features with self.feat_extractor; return
        (features [1, T, n_mels], length [1] int32)."""
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True):
        """Normalize text (Chinese or English path) and optionally split it
        into token-bounded segments.

        Returns the list of segments when split is True, otherwise the single
        normalized string.
        """
        text = text.strip()
        if contains_chinese(text):
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            # Use the Chinese enumeration mark instead of ASCII period.
            text = text.replace(".", "、")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            # Trailing commas become a full stop.
            text = re.sub(r'[,,]+$', '。', text)
            texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
                                                token_min_n=60, merge_len=20,
                                                comma_split=False)]
        else:
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.en_tn_model.normalize(text)
                text = spell_out_number(text, self.inflect_parser)
            texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                token_min_n=60, merge_len=20,
                                                comma_split=False)]
        if split is False:
            return text
        return texts

    def frontend_sft(self, tts_text, spk_id):
        """Build model inputs for SFT inference with a built-in speaker id."""
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        embedding = self.spk2info[spk_id]['embedding']
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
        """Build model inputs for zero-shot voice cloning from a 16 kHz prompt."""
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
        # Mel features are extracted at 22.05 kHz (model's synthesis rate);
        # speech tokens and speaker embedding use the original 16 kHz audio.
        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                       'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
        """Build model inputs for cross-lingual synthesis (zero-shot without
        any text/token prompt fed to the LLM)."""
        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
        # in cross lingual mode, we remove prompt in llm
        del model_input['prompt_text']
        del model_input['prompt_text_len']
        del model_input['llm_prompt_speech_token']
        del model_input['llm_prompt_speech_token_len']
        return model_input

    def frontend_instruct(self, tts_text, spk_id, instruct_text):
        """Build model inputs for instruct-mode synthesis: SFT inputs with the
        instruct text as the LLM prompt instead of the speaker embedding."""
        model_input = self.frontend_sft(tts_text, spk_id)
        # in instruct mode, we remove spk_embedding in llm due to information leakage
        del model_input['llm_embedding']
        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
        model_input['prompt_text'] = instruct_text_token
        model_input['prompt_text_len'] = instruct_text_token_len
        return model_input
================================================
FILE: cosyvoice/cli/model.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
class CosyVoiceModel:
    """Bundles the three CosyVoice sub-networks — LLM (text -> speech tokens),
    flow (tokens -> mel) and HiFT vocoder (mel -> waveform) — and runs the
    full inference chain."""

    def __init__(self,
                 llm: torch.nn.Module,
                 flow: torch.nn.Module,
                 hift: torch.nn.Module):
        # Use GPU when available; load() moves all sub-modules to this device.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.llm = llm
        self.flow = flow
        self.hift = hift

    def load(self, llm_model, flow_model, hift_model):
        """Load state dicts for the three sub-models from the given checkpoint
        paths and switch every sub-model to eval mode on self.device."""
        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
        self.llm.to(self.device).eval()
        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
        self.flow.to(self.device).eval()
        self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
        self.hift.to(self.device).eval()

    def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
        """Synthesize a waveform from tokenized text, optionally conditioned on
        prompt text/speech tokens/mel features and speaker embeddings.

        NOTE: the tensor defaults are evaluated once at import time; they are
        never mutated here (.to() returns a new tensor), so sharing them
        across calls is safe. Returns {'tts_speech': waveform tensor on CPU}.
        """
        # Stage 1: LLM autoregressively predicts speech tokens from the text.
        tts_speech_token = self.llm.inference(text=text.to(self.device),
                                              text_len=text_len.to(self.device),
                                              prompt_text=prompt_text.to(self.device),
                                              prompt_text_len=prompt_text_len.to(self.device),
                                              prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                              prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
                                              embedding=llm_embedding.to(self.device),
                                              beam_size=1,
                                              sampling=25,
                                              max_token_text_ratio=30,
                                              min_token_text_ratio=3)
        # Stage 2: flow model maps speech tokens to a mel spectrogram.
        tts_mel = self.flow.inference(token=tts_speech_token,
                                      token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
                                      prompt_token=flow_prompt_speech_token.to(self.device),
                                      prompt_token_len=flow_prompt_speech_token_len.to(self.device),
                                      prompt_feat=prompt_speech_feat.to(self.device),
                                      prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                      embedding=flow_embedding.to(self.device))
        # Stage 3: HiFT vocoder turns the mel into a waveform, moved to CPU.
        tts_speech = self.hift.inference(mel=tts_mel).cpu()
        torch.cuda.empty_cache()
        return {'tts_speech': tts_speech}
================================================
FILE: cosyvoice/dataset/__init__.py
================================================
================================================
FILE: cosyvoice/dataset/dataset.py
================================================
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import json
import math
from functools import partial
import torch
import torch.distributed as dist
from torch.utils.data import IterableDataset
from cosyvoice.utils.file_utils import read_lists, read_json_lists
class Processor(IterableDataset):
    """Wraps an upstream iterable dataset and lazily applies a processing
    function f(iterator, *args, **kw) to its items."""

    def __init__(self, source, f, *args, **kw):
        assert callable(f)
        self.f = f
        self.source = source
        self.args = args
        self.kw = kw

    def set_epoch(self, epoch):
        # Forward the epoch to the upstream source (used for shuffle seeding).
        self.source.set_epoch(epoch)

    def __iter__(self):
        """Iterate over the source dataset transformed by f."""
        assert self.source is not None and callable(self.f)
        return self.f(iter(self.source), *self.args, **self.kw)

    def apply(self, f):
        """Chain another processing stage, reusing this stage's extra args."""
        assert callable(f)
        return Processor(self, f, *self.args, **self.kw)
class DistributedSampler:
    """Tracks DDP rank/world-size and DataLoader worker id so a shard list can
    be partitioned across ranks and workers."""

    def __init__(self, shuffle=True, partition=True):
        self.epoch = -1
        self.update()
        self.shuffle = shuffle
        self.partition = partition

    def update(self):
        """Refresh rank/world-size and worker id; return them as a dict."""
        assert dist.is_available()
        if dist.is_initialized():
            self.rank, self.world_size = dist.get_rank(), dist.get_world_size()
        else:
            self.rank, self.world_size = 0, 1
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            self.worker_id, self.num_workers = 0, 1
        else:
            self.worker_id, self.num_workers = worker_info.id, worker_info.num_workers
        return dict(rank=self.rank,
                    world_size=self.world_size,
                    worker_id=self.worker_id,
                    num_workers=self.num_workers)

    def set_epoch(self, epoch):
        # The epoch seeds the deterministic shuffle in sample().
        self.epoch = epoch

    def sample(self, data):
        """Sample data according to rank/world_size/num_workers.

        Args:
            data(List): input data list
        Returns:
            List: index list after sharding (shorter lists are repeated so
            every rank/worker receives at least one index)
        """
        indices = list(range(len(data)))
        if self.partition:
            if self.shuffle:
                random.Random(self.epoch).shuffle(indices)
            if len(indices) < self.world_size:
                indices = (indices * math.ceil(self.world_size / len(indices)))[:self.world_size]
            indices = indices[self.rank::self.world_size]
        if len(indices) < self.num_workers:
            indices = (indices * math.ceil(self.num_workers / len(indices)))[:self.num_workers]
        return indices[self.worker_id::self.num_workers]
class DataList(IterableDataset):
    """Iterable over a list of shard entries, sharded per rank/worker via
    DistributedSampler; each yielded dict carries the entry plus shard info."""

    def __init__(self, lists, shuffle=True, partition=True):
        self.lists = lists
        self.sampler = DistributedSampler(shuffle, partition)

    def set_epoch(self, epoch):
        self.sampler.set_epoch(epoch)

    def __iter__(self):
        sampler_info = self.sampler.update()
        for index in self.sampler.sample(self.lists):
            # Each item exposes its source entry plus rank/worker metadata.
            yield dict(src=self.lists[index], **sampler_info)
def Dataset(data_list_file,
            data_pipeline,
            mode='train',
            shuffle=True,
            partition=True,
            tts_file='',
            prompt_utt2data=''):
    """Construct dataset from arguments.

    We have two shuffle stages: a global shuffle at the shard (tar/raw file)
    level, and a second shuffle at the training-sample level.

    Args:
        data_list_file(str): file listing the shard paths
        data_pipeline(List[Callable]): processing stages applied in order
        mode(str): 'train' or 'inference'
        partition(bool): whether to do data partition in terms of rank
        tts_file(str): json of utt -> tts texts (inference mode only)
        prompt_utt2data(str): json mapping utt -> shard (inference mode only)
    """
    assert mode in ['train', 'inference']
    lists = read_lists(data_list_file)
    tts_data = None
    if mode == 'inference':
        with open(tts_file) as f:
            tts_data = json.load(f)
        utt2lists = read_json_lists(prompt_utt2data)
        # Keep only the shards that contain at least one requested utterance.
        lists = list({utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists})
    dataset = DataList(lists,
                       shuffle=shuffle,
                       partition=partition)
    if mode == 'inference':
        # Bind the requested tts texts to the opener stage of the pipeline.
        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
    for func in data_pipeline:
        dataset = Processor(dataset, func, mode=mode)
    return dataset
================================================
FILE: cosyvoice/dataset/processor.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
import pyarrow.parquet as pq
from io import BytesIO
import torch
import torchaudio
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
torchaudio.set_audio_backend('soundfile')
AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
def parquet_opener(data, mode='train', tts_data=None):
    """Given shard entries with a parquet path under 'src', yield one dict per
    row (train) or per requested tts text (inference). Inplace operation.

    Args:
        data(Iterable[dict]): items with 'src' pointing to a parquet file
        mode(str): 'train' or 'inference'
        tts_data(dict|None): utt -> list of tts texts (inference mode)
    Returns:
        Iterable[{src, ...row columns...}]
    """
    # BUGFIX: avoid a mutable default argument; normalize None to a fresh dict.
    if tts_data is None:
        tts_data = {}
    for sample in data:
        assert 'src' in sample
        url = sample['src']
        try:
            df = pq.read_table(url).to_pandas()
            for i in range(len(df)):
                if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
                    continue
                sample.update(dict(df.loc[i]))
                if mode == 'train':
                    # NOTE do not return sample directly, must initialize a new dict
                    yield {**sample}
                else:
                    for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
                        yield {**sample, 'tts_index': index, 'tts_text': text}
        except Exception as ex:
            # Best-effort: a bad shard is logged and skipped, not fatal.
            logging.warning('Failed to open {}, ex info {}'.format(url, ex))
def filter(data,
           max_length=10240,
           min_length=10,
           token_max_length=200,
           token_min_length=1,
           min_output_input_ratio=0.0005,
           max_output_input_ratio=1,
           mode='train'):
    """Decode audio bytes and drop samples outside length/ratio bounds.
    Inplace operation. (Shadows the builtin `filter` by design — the name is
    referenced from the pipeline config.)

    Args:
        data: Iterable[{key, wav, label, sample_rate}]
        max_length: drop utterance which is greater than max_length (10ms frames)
        min_length: drop utterance which is less than min_length (10ms frames)
        token_max_length: drop utterance whose token count exceeds this
        token_min_length: drop utterance whose token count is below this
        min_output_input_ratio: minimal ratio of token_length / feats_length
        max_output_input_ratio: maximum ratio of token_length / feats_length
    Returns:
        Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
        del sample['audio_data']
        # sample['speech'] is torch.Tensor; audio is counted at 100 frames/sec.
        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
        if not (min_length <= num_frames <= max_length):
            continue
        n_tokens = len(sample['text_token'])
        if not (token_min_length <= n_tokens <= token_max_length):
            continue
        if len(sample['speech_token']) == 0:
            continue
        if num_frames != 0:
            ratio = n_tokens / num_frames
            if not (min_output_input_ratio <= ratio <= max_output_input_ratio):
                continue
        yield sample
def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
    """Resample audio to resample_rate and peak-normalize anything over 1.0.
    Inplace operation.

    Args:
        data: Iterable[{key, wav, label, sample_rate}]
        resample_rate: target resample rate
        min_sample_rate: drop utterances recorded below this rate
    Returns:
        Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        sr = sample['sample_rate']
        wav = sample['speech']
        if sr != resample_rate:
            # Too low-quality to upsample meaningfully: drop the utterance.
            if sr < min_sample_rate:
                continue
            sample['sample_rate'] = resample_rate
            sample['speech'] = torchaudio.transforms.Resample(
                orig_freq=sr, new_freq=resample_rate)(wav)
        peak = sample['speech'].abs().max()
        if peak > 1:
            sample['speech'] /= peak
        yield sample
def compute_fbank(data,
                  feat_extractor,
                  mode='train'):
    """Extract fbank features with feat_extractor and drop the raw waveform.

    Args:
        data: Iterable[{key, wav, label, sample_rate}]
        feat_extractor: callable mapping a waveform to features
    Returns:
        Iterable[{key, feat, label}]
    """
    for sample in data:
        for key in ('sample_rate', 'speech', 'utt', 'text_token'):
            assert key in sample
        # Drop the leading batch dim and store features as (time, n_mels).
        feats = feat_extractor(sample['speech']).squeeze(dim=0).transpose(0, 1)
        sample['speech_feat'] = feats
        del sample['speech']
        yield sample
def parse_embedding(data, normalize, mode='train'):
    """Convert utt/spk embeddings to float32 tensors, optionally L2-normalized.

    Args:
        data: Iterable[{key, wav, label, sample_rate}]
        normalize: whether to L2-normalize each embedding along dim 0
    Returns:
        Iterable[{key, feat, label}]
    """
    for sample in data:
        for key in ('utt_embedding', 'spk_embedding'):
            vec = torch.tensor(sample[key], dtype=torch.float32)
            sample[key] = F.normalize(vec, dim=0) if normalize else vec
        yield sample
def tokenize(data, get_tokenizer, allowed_special, mode='train'):
    """ Encode text fields into token ids.
        Inplace operation.

        Args:
            data: Iterable[{key, wav, txt, sample_rate}]
            get_tokenizer: zero-argument factory returning the tokenizer
            allowed_special: forwarded to tokenizer.encode
            mode: in 'inference' mode the 'tts_text' field is encoded too

        Returns:
            Iterable[{key, wav, txt, tokens, label, sample_rate}]
    """
    tokenizer = get_tokenizer()

    def encode(text):
        return tokenizer.encode(text, allowed_special=allowed_special)

    for sample in data:
        assert 'text' in sample
        sample['text_token'] = encode(sample['text'])
        if mode == 'inference':
            sample['tts_text_token'] = encode(sample['tts_text'])
        yield sample
def shuffle(data, shuffle_size=10000, mode='train'):
    """ Locally shuffle the stream using a bounded buffer.

        Args:
            data: Iterable[{key, feat, label}]
            shuffle_size: buffer size for shuffle
            mode: unused; kept for pipeline-interface symmetry

        Returns:
            Iterable[{key, feat, label}]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= shuffle_size:
            random.shuffle(buf)
            yield from buf
            buf = []
    # Flush whatever is left in the buffer at end of stream.
    random.shuffle(buf)
    yield from buf
def sort(data, sort_size=500, mode='train'):
    """ Sort the stream by feature length using a bounded buffer.
        Sort is used after shuffle and before batch, so we can group
        utts with similar lengths into a batch; `sort_size` should
        be less than `shuffle_size`.

        Args:
            data: Iterable[{key, feat, label}]
            sort_size: buffer size for sort
            mode: unused; kept for pipeline-interface symmetry

        Returns:
            Iterable[{key, feat, label}]
    """
    def feat_frames(sample):
        return sample['speech_feat'].size(0)

    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= sort_size:
            buf.sort(key=feat_frames)
            yield from buf
            buf = []
    # Flush the (sorted) remainder at end of stream.
    buf.sort(key=feat_frames)
    yield from buf
def static_batch(data, batch_size=16):
    """ Group the stream into fixed-size batches.

        Args:
            data: Iterable[{key, feat, label}]
            batch_size: batch size

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    bucket = []
    for sample in data:
        bucket.append(sample)
        if len(bucket) >= batch_size:
            yield bucket
            bucket = []
    # Emit the final, possibly short, batch.
    if bucket:
        yield bucket
def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
    """ Dynamic batch the data until the total frames in batch
        reach `max_frames_in_batch`.

        Args:
            data: Iterable[{key, feat, label}]
            max_frames_in_batch: max frames (after padding) in one batch
            mode: unused; kept for pipeline-interface symmetry

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    buf = []
    longest_frames = 0
    for sample in data:
        assert 'speech_feat' in sample
        assert isinstance(sample['speech_feat'], torch.Tensor)
        new_sample_frames = sample['speech_feat'].size(0)
        longest_frames = max(longest_frames, new_sample_frames)
        # Every sample is padded to the longest one, so the padded cost of the
        # batch after adding this sample is longest_frames * (len(buf) + 1).
        frames_after_padding = longest_frames * (len(buf) + 1)
        if frames_after_padding > max_frames_in_batch:
            # Bug fix: do not emit an empty batch when the very first sample
            # of a batch already exceeds max_frames_in_batch on its own.
            if len(buf) > 0:
                yield buf
            buf = [sample]
            longest_frames = new_sample_frames
        else:
            buf.append(sample)
    if len(buf) > 0:
        yield buf
def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
    """ Wrapper for static/dynamic batch.

        In 'inference' mode every batch holds exactly one sample; otherwise
        `batch_type` ('static' or 'dynamic') selects the batching strategy.
        An unsupported batch_type is logged as fatal and yields None.
    """
    if mode == 'inference':
        return static_batch(data, 1)
    if batch_type == 'static':
        return static_batch(data, batch_size)
    if batch_type == 'dynamic':
        return dynamic_batch(data, max_frames_in_batch)
    logging.fatal('Unsupported batch type {}'.format(batch_type))
def padding(data, use_spk_embedding, mode='train'):
    """ Pad a list of samples into a single training batch.

        Args:
            data: Iterable[List[{key, feat, label}]]
            use_spk_embedding: if True expose the speaker-level embedding as
                batch['embedding'], otherwise the utterance-level one
            mode: in 'inference' mode the tts_* fields are padded as well

        Returns:
            Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
    """
    for sample in data:
        assert isinstance(sample, list)
        # Bug fix: order by the time dimension size(0), not size(1) (the
        # constant feature dimension), so the batch really is sorted by
        # descending utterance length as intended.
        speech_feat_len = torch.tensor([x['speech_feat'].size(0) for x in sample],
                                       dtype=torch.int32)
        order = torch.argsort(speech_feat_len, descending=True)

        utts = [sample[i]['utt'] for i in order]
        speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
        speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
        speech_token = pad_sequence(speech_token,
                                    batch_first=True,
                                    padding_value=0)
        speech_feat = [sample[i]['speech_feat'] for i in order]
        speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
        speech_feat = pad_sequence(speech_feat,
                                   batch_first=True,
                                   padding_value=0)
        text = [sample[i]['text'] for i in order]
        text_token = [torch.tensor(sample[i]['text_token']) for i in order]
        text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
        text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
        utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
        spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
        batch = {
            "utts": utts,
            "speech_token": speech_token,
            "speech_token_len": speech_token_len,
            "speech_feat": speech_feat,
            "speech_feat_len": speech_feat_len,
            "text": text,
            "text_token": text_token,
            "text_token_len": text_token_len,
            "utt_embedding": utt_embedding,
            "spk_embedding": spk_embedding,
        }
        if mode == 'inference':
            tts_text = [sample[i]['tts_text'] for i in order]
            tts_index = [sample[i]['tts_index'] for i in order]
            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
            # tts tokens use -1 padding so real ids (>= 0) stay distinguishable.
            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
            batch.update({'tts_text': tts_text,
                          'tts_index': tts_index,
                          'tts_text_token': tts_text_token,
                          'tts_text_token_len': tts_text_token_len})
        # Select which embedding downstream models consume.
        if use_spk_embedding is True:
            batch["embedding"] = batch["spk_embedding"]
        else:
            batch["embedding"] = batch["utt_embedding"]
        yield batch
================================================
FILE: cosyvoice/flow/decoder.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from einops import pack, rearrange, repeat
from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
from matcha.models.components.transformer import BasicTransformerBlock
class ConditionalDecoder(nn.Module):
    """U-Net style 1-D conditional decoder used as the flow-matching estimator.

    Structure: down blocks -> mid blocks -> up blocks, each stage a
    ResnetBlock1D followed by transformer blocks; down-path activations are
    concatenated into the up path as skip connections.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        channels=(256, 256),
        dropout=0.05,
        attention_head_dim=64,
        n_blocks=1,
        num_mid_blocks=2,
        num_heads=4,
        act_fn="snake",
    ):
        """
        This decoder requires an input with the same shape of the target. So, if your text content
        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
        """
        super().__init__()
        channels = tuple(channels)
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.time_embeddings = SinusoidalPosEmb(in_channels)
        time_embed_dim = channels[0] * 4
        self.time_mlp = TimestepEmbedding(
            in_channels=in_channels,
            time_embed_dim=time_embed_dim,
            act_fn="silu",
        )

        self.down_blocks = nn.ModuleList([])
        self.mid_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        output_channel = in_channels
        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
            input_channel = output_channel
            output_channel = channels[i]
            is_last = i == len(channels) - 1
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            # The last level keeps temporal resolution (plain conv) instead of downsampling.
            downsample = (
                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            # Bug fix: the original wrote `out_channels = channels[-1]` here,
            # silently shadowing the constructor argument; the variable actually
            # consumed below is `output_channel` (equal to channels[-1] at this
            # point anyway, so the produced architecture is unchanged).
            output_channel = channels[-1]
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

        # Mirror the channel list for the up path; the extra trailing entry keeps
        # the final up block at the shallowest resolution's width.
        channels = channels[::-1] + (channels[0],)
        for i in range(len(channels) - 1):
            input_channel = channels[i] * 2  # * 2: skip connection is concatenated channel-wise
            output_channel = channels[i + 1]
            is_last = i == len(channels) - 2
            resnet = ResnetBlock1D(
                dim=input_channel,
                dim_out=output_channel,
                time_emb_dim=time_embed_dim,
            )
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            upsample = (
                Upsample1D(output_channel, use_conv_transpose=True)
                if not is_last
                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))

        self.final_block = Block1D(channels[-1], channels[-1])
        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
        self.initialize_weights()

    def initialize_weights(self):
        """Kaiming-initialize all convs/linears; unit-weight, zero-bias group norms."""
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.GroupNorm):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x, mask, mu, t, spks=None, cond=None):
        """Forward pass of the UNet1DConditional model.

        Args:
            x (torch.Tensor): shape (batch_size, in_channels, time)
            mask (torch.Tensor): shape (batch_size, 1, time)
            mu (torch.Tensor): encoder output, concatenated onto x channel-wise
            t (torch.Tensor): shape (batch_size)
            spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
            cond (torch.Tensor, optional): extra conditioning, concatenated channel-wise. Defaults to None.

        Returns:
            torch.Tensor: shape (batch_size, out_channels, time), zeroed outside `mask`.
        """
        t = self.time_embeddings(t)
        t = self.time_mlp(t)

        # Stack all conditioning signals along the channel axis.
        x = pack([x, mu], "b * t")[0]
        if spks is not None:
            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
            x = pack([x, spks], "b * t")[0]
        if cond is not None:
            x = pack([x, cond], "b * t")[0]

        hiddens = []
        masks = [mask]
        for resnet, transformer_blocks, downsample in self.down_blocks:
            mask_down = masks[-1]
            x = resnet(x, mask_down, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            # Outer product of the padding mask gives the (t, t) attention mask.
            attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            hiddens.append(x)  # Save hidden states for skip connections
            x = downsample(x * mask_down)
            masks.append(mask_down[:, :, ::2])  # mask follows the stride-2 downsampling
        masks = masks[:-1]
        mask_mid = masks[-1]

        for resnet, transformer_blocks in self.mid_blocks:
            x = resnet(x, mask_mid, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()

        for resnet, transformer_blocks, upsample in self.up_blocks:
            mask_up = masks.pop()
            skip = hiddens.pop()
            # Trim to the skip's length (upsampling may overshoot by a frame).
            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
            x = resnet(x, mask_up, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            x = upsample(x * mask_up)

        x = self.final_block(x, mask_up)
        output = self.final_proj(x * mask_up)
        return output * mask
================================================
FILE: cosyvoice/flow/flow.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
from typing import Dict, Optional
import torch
import torch.nn as nn
from torch.nn import functional as F
from omegaconf import DictConfig
from cosyvoice.utils.mask import make_pad_mask
class MaskedDiffWithXvec(torch.nn.Module):
    """Flow-matching mel decoder conditioned on speech tokens and an x-vector.

    Speech tokens are embedded, encoded and length-regulated to the mel frame
    rate; a conditional flow-matching decoder then produces mel features.
    """

    def __init__(self,
                 input_size: int = 512,
                 output_size: int = 80,
                 spk_embed_dim: int = 192,
                 output_type: str = "mel",
                 vocab_size: int = 4096,
                 input_frame_rate: int = 50,
                 only_mask_loss: bool = True,
                 encoder: torch.nn.Module = None,
                 length_regulator: torch.nn.Module = None,
                 decoder: torch.nn.Module = None,
                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.decoder_conf = decoder_conf
        self.mel_feat_conf = mel_feat_conf
        self.vocab_size = vocab_size
        self.output_type = output_type
        self.input_frame_rate = input_frame_rate
        logging.info(f"input frame rate={self.input_frame_rate}")
        self.input_embedding = nn.Embedding(vocab_size, input_size)
        # Projects the speaker x-vector into the mel channel dimension.
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
        self.encoder = encoder
        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
        self.decoder = decoder
        self.length_regulator = length_regulator
        self.only_mask_loss = only_mask_loss

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        """Compute the flow-matching training loss for one batch.

        Args:
            batch: dict with 'speech_token', 'speech_token_len', 'speech_feat',
                'speech_feat_len' and 'embedding' tensors.
            device: target device for all tensors.

        Returns:
            {'loss': scalar flow-matching loss}
        """
        token = batch['speech_token'].to(device)
        token_len = batch['speech_token_len'].to(device)
        feat = batch['speech_feat'].to(device)
        feat_len = batch['speech_feat_len'].to(device)
        embedding = batch['embedding'].to(device)

        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions: with probability 0.5 expose a random-length prefix of
        # the target mel as a prompt condition; otherwise condition on zeros.
        conds = torch.zeros(feat.shape, device=token.device)
        for i, j in enumerate(feat_len):
            if random.random() < 0.5:
                continue
            index = random.randint(0, int(0.3 * j))
            conds[i, :index] = feat[i, :index]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        # Align target mel length with the regulated encoder output.
        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
        loss, _ = self.decoder.compute_loss(
            feat.transpose(1, 2).contiguous(),
            mask.unsqueeze(1),
            h.transpose(1, 2).contiguous(),
            embedding,
            cond=conds
        )
        return {'loss': loss}

    @torch.inference_mode()
    def inference(self,
                  token,
                  token_len,
                  prompt_token,
                  prompt_token_len,
                  prompt_feat,
                  prompt_feat_len,
                  embedding):
        """Generate mel features for `token`, optionally continuing a prompt.

        Returns the generated mel with the prompt frames stripped off.
        """
        assert token.shape[0] == 1
        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        # tokens -> mel frames: tokens / token_rate * sample_rate / hop_size.
        # Generalized from the hard-coded `token_len / 50 * 22050 / 256` to use
        # the configured rates (defaults reproduce the original values).
        feat_len = (token_len / self.input_frame_rate
                    * self.mel_feat_conf['sampling_rate']
                    / self.mel_feat_conf['hop_size']).int()
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions: the prompt mel occupies the leading frames.
        conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
        if prompt_feat.shape[1] != 0:
            for i, j in enumerate(prompt_feat_len):
                conds[i, :j] = prompt_feat[i]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        feat = self.decoder(
            mu=h.transpose(1, 2).contiguous(),
            mask=mask.unsqueeze(1),
            spks=embedding,
            cond=conds,
            n_timesteps=10
        )
        # Drop the prompt portion so only newly generated frames are returned.
        if prompt_feat.shape[1] != 0:
            feat = feat[:, :, prompt_feat.shape[1]:]
        return feat
================================================
FILE: cosyvoice/flow/flow_matching.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM
class ConditionalCFM(BASECFM):
    """Conditional flow matching with optional classifier-free guidance (CFG)."""

    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
        super().__init__(
            n_feats=in_channels,
            cfm_params=cfm_params,
            n_spks=n_spks,
            spk_emb_dim=spk_emb_dim,
        )
        self.t_scheduler = cfm_params.t_scheduler
        self.training_cfg_rate = cfm_params.training_cfg_rate
        self.inference_cfg_rate = cfm_params.inference_cfg_rate
        in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
        # Just change the architecture of the estimator here
        self.estimator = estimator

    @torch.inference_mode()
    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        if self.t_scheduler == 'cosine':
            # Cosine schedule concentrates integration steps near t = 0.
            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)

    def solve_euler(self, x, t_span, mu, mask, spks, cond):
        """
        Fixed euler solver for ODEs.

        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker ids. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: Not used but kept for future purposes
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []
        for step in range(1, len(t_span)):
            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
            # Classifier-Free Guidance inference introduced in VoiceBox
            if self.inference_cfg_rate > 0:
                cfg_dphi_dt = self.estimator(
                    x, mask,
                    torch.zeros_like(mu), t,
                    torch.zeros_like(spks) if spks is not None else None,
                    # Bug fix: cond may be None; torch.zeros_like(None) raises.
                    torch.zeros_like(cond) if cond is not None else None
                )
                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
                           self.inference_cfg_rate * cfg_dphi_dt)
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t
        return sol[-1]

    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
        """Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b, _, t = mu.shape

        # random timestep
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        if self.t_scheduler == 'cosine':
            t = 1 - torch.cos(t * 0.5 * torch.pi)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
        if self.training_cfg_rate > 0:
            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
            mu = mu * cfg_mask.view(-1, 1, 1)
            # Bug fix: spks/cond default to None; multiplying None by a tensor
            # raises a TypeError, so only mask them when provided.
            if spks is not None:
                spks = spks * cfg_mask.view(-1, 1)
            if cond is not None:
                cond = cond * cfg_mask.view(-1, 1, 1)

        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
        return loss, y
================================================
FILE: cosyvoice/flow/length_regulator.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch.nn as nn
from torch.nn import functional as F
from cosyvoice.utils.mask import make_pad_mask
class InterpolateRegulator(nn.Module):
    """Length regulator: nearest-neighbour resampling along time followed by a
    small convolutional smoothing stack.
    """

    def __init__(
            self,
            channels: int,
            sampling_ratios: Tuple,
            out_channels: int = None,
            groups: int = 1,
    ):
        super().__init__()
        self.sampling_ratios = sampling_ratios
        out_channels = out_channels or channels
        layers = nn.ModuleList([])
        # One conv / group-norm / Mish stage per entry in sampling_ratios.
        if len(sampling_ratios) > 0:
            for _ in sampling_ratios:
                layers.extend([
                    nn.Conv1d(channels, channels, 3, 1, 1),
                    nn.GroupNorm(groups, channels),
                    nn.Mish(),
                ])
        # Final pointwise projection to out_channels.
        layers.append(
            nn.Conv1d(channels, out_channels, 1, 1)
        )
        self.model = nn.Sequential(*layers)

    def forward(self, x, ylens=None):
        """x: (B, T, D) -> (out, olens) with out: (B, max(ylens), out_channels),
        zeroed beyond each sequence's target length."""
        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
        resized = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
        out = self.model(resized).transpose(1, 2).contiguous()
        return out * mask, ylens
================================================
FILE: cosyvoice/hifigan/f0_predictor.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
class ConvRNNF0Predictor(nn.Module):
    """Frame-level F0 predictor: a stack of weight-normalized Conv1d + ELU
    layers followed by a linear classifier; predictions are made non-negative
    with an absolute value.
    """

    def __init__(self,
                 num_class: int = 1,
                 in_channels: int = 80,
                 cond_channels: int = 512
                 ):
        super().__init__()
        self.num_class = num_class
        # Five conv+ELU stages: the first maps in_channels -> cond_channels,
        # the remaining four keep cond_channels width.
        channel_pairs = [(in_channels, cond_channels)] + [(cond_channels, cond_channels)] * 4
        layers = []
        for c_in, c_out in channel_pairs:
            layers.append(weight_norm(
                nn.Conv1d(c_in, c_out, kernel_size=3, padding=1)
            ))
            layers.append(nn.ELU())
        self.condnet = nn.Sequential(*layers)
        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """x: (B, in_channels, T) -> non-negative output of shape (B, T)
        when num_class == 1 (the trailing singleton dim is squeezed)."""
        hidden = self.condnet(x)
        hidden = hidden.transpose(1, 2)
        return torch.abs(self.classifier(hidden).squeeze(-1))
================================================
FILE: cosyvoice/hifigan/generator.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HIFI-GAN"""
import typing as tp
import numpy as np
from scipy.signal import get_window
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d
from torch.nn import ConvTranspose1d
from torch.nn.utils import remove_weight_norm
from torch.nn.utils import weight_norm
from torch.distributions.uniform import Uniform
from cosyvoice.transformer.activation import Snake
from cosyvoice.utils.common import get_padding
from cosyvoice.utils.common import init_weights
"""hifigan based generator implementation.
This code is modified from https://github.com/jik876/hifi-gan
,https://github.com/kan-bayashi/ParallelWaveGAN and
https://github.com/NVIDIA/BigVGAN
"""
class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN.

    Each stage computes Snake -> dilated conv -> Snake -> plain conv and adds
    the result back onto the input (one stage per entry in `dilations`).
    """

    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: tp.List[int] = [1, 3, 5],
    ):
        super(ResBlock, self).__init__()
        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()
        for dilation in dilations:
            dilated_conv = Conv1d(
                channels,
                channels,
                kernel_size,
                1,
                dilation=dilation,
                padding=get_padding(kernel_size, dilation)
            )
            plain_conv = Conv1d(
                channels,
                channels,
                kernel_size,
                1,
                dilation=1,
                padding=get_padding(kernel_size, 1)
            )
            self.convs1.append(weight_norm(dilated_conv))
            self.convs2.append(weight_norm(plain_conv))
        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)
        # One Snake activation in front of every convolution.
        self.activations1 = nn.ModuleList([
            Snake(channels, alpha_logscale=False) for _ in self.convs1
        ])
        self.activations2 = nn.ModuleList([
            Snake(channels, alpha_logscale=False) for _ in self.convs2
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for act1, conv1, act2, conv2 in zip(self.activations1, self.convs1,
                                            self.activations2, self.convs2):
            residual = conv2(act2(conv1(act1(x))))
            x = x + residual
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all convolutions (inference export)."""
        for conv1, conv2 in zip(self.convs1, self.convs2):
            remove_weight_norm(conv1)
            remove_weight_norm(conv2)
class SineGen(torch.nn.Module):
    """ Sine-wave source generator for NSF-style vocoders.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0)
        samp_rate: sampling rate in Hz
        harmonic_num: number of harmonic overtones (default 0)
        sine_amp: amplitude of the sine waveform (default 0.1)
        noise_std: std of Gaussian noise in voiced regions (default 0.003)
        voiced_threshold: F0 threshold for voiced/unvoiced classification
            (default 0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # Voiced/unvoiced flag: 1.0 where f0 exceeds the threshold, else 0.0.
        return (f0 > self.voiced_threshold).type(torch.float32)

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: (sine_waves, uv, noise) with sine_waves/noise of shape
                 [B, harmonic_num + 1, sample_len] and uv of shape
                 [B, 1, sample_len]
        """
        # Per-harmonic normalized frequency: f0 * (k + 1) / fs.
        freq_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for k in range(self.harmonic_num + 1):
            freq_mat[:, k: k + 1, :] = f0 * (k + 1) / self.sampling_rate
        # Integrate frequency to phase; mod 1 keeps the cumsum numerically bounded.
        theta_mat = 2 * np.pi * (torch.cumsum(freq_mat, dim=-1) % 1)
        # Random initial phase per harmonic, except the fundamental (phase 0).
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(freq_mat.device)
        phase_vec[:, 0, :] = 0

        # generate sine waveforms
        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        # generate uv signal
        uv = self._f02uv(f0)

        # Noise floor: std ~ sine_amp / 3 in unvoiced regions (so unvoiced
        # energy is comparable to the sine), noise_std in voiced regions.
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # Zero the sine in unvoiced regions, then add the noise.
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class SourceModuleHnNSF(torch.nn.Module):
    """ Harmonic-plus-noise source module for hn-nsf.

    SourceModule(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
        sampling_rate: sampling rate in Hz
        upsample_scale: accepted for interface compatibility but unused here
        harmonic_num: number of harmonics above F0 (default: 0)
        sine_amp: amplitude of the sine source signal (default: 0.1)
        add_noise_std: std of additive Gaussian noise (default: 0.003);
            note that the noise amplitude in unvoiced regions is decided
            by sine_amp
        voiced_threshod: threshold to set voiced/unvoiced given F0 (default: 0)

    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length, 1)
        uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Sine-source generator for the harmonic branch.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # Merge the fundamental plus harmonics into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
            F0_sampled (batchsize, length, 1)
            Sine_source (batchsize, length, 1)
            noise_source (batchsize, length, 1)
        """
        # Harmonic branch; sine generation is kept out of the autograd graph.
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # Noise branch, same shape as uv, scaled to a third of the sine amplitude.
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
class HiFTGenerator(nn.Module):
    """
    HiFTNet Generator: Neural Source Filter + ISTFTNet
    https://arxiv.org/abs/2309.09493

    Converts a mel-spectrogram (B, in_channels, T) into a waveform: an F0
    track is predicted from the mel, rendered into a harmonic source signal,
    fused into the upsampling stack at every scale, and the final
    magnitude/phase output is converted to audio via inverse STFT.
    """
    def __init__(
        self,
        in_channels: int = 80,
        base_channels: int = 512,
        nb_harmonics: int = 8,
        sampling_rate: int = 22050,
        nsf_alpha: float = 0.1,
        nsf_sigma: float = 0.003,
        nsf_voiced_threshold: float = 10,
        upsample_rates: tp.List[int] = [8, 8],
        upsample_kernel_sizes: tp.List[int] = [16, 16],
        istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
        resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
        resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        source_resblock_kernel_sizes: tp.List[int] = [7, 11],
        source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
        lrelu_slope: float = 0.1,
        audio_limit: float = 0.99,
        f0_predictor: torch.nn.Module = None,
    ):
        super(HiFTGenerator, self).__init__()

        self.out_channels = 1
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.istft_params = istft_params
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        # NSF harmonic/noise source; operates at the full output sample rate
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sampling_rate,
            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
            harmonic_num=nb_harmonics,
            sine_amp=nsf_alpha,
            add_noise_std=nsf_sigma,
            voiced_threshod=nsf_voiced_threshold)
        # bring the frame-rate F0 up to sample rate before the sine generator
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])

        self.conv_pre = weight_norm(
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # Up: transposed convolutions, halving channels at each stage
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # Down: bring the STFT of the source signal to each upsample stage's
        # resolution so it can be added to the main branch.
        # NOTE: these Conv1d layers are intentionally NOT weight-normalised.
        self.source_downs = nn.ModuleList()
        self.source_resblocks = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes,
                                          source_resblock_dilation_sizes)):
            if u == 1:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
                )
            else:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
                )

            self.source_resblocks.append(
                ResBlock(base_channels // (2 ** (i + 1)), k, d)
            )

        # multi-receptive-field residual blocks per upsample stage
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d))

        # n_fft + 2 output channels: n_fft//2 + 1 magnitudes and as many phases
        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        # plain tensor attribute (not a registered buffer); moved to the input's
        # device on use in _stft/_istft
        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
        self.f0_predictor = f0_predictor

    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
        """Upsample F0 to sample rate and render the harmonic source (B, 1, T*scale)."""
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t

        har_source, _, _ = self.m_source(f0)
        return har_source.transpose(1, 2)

    def _stft(self, x):
        """Return (real, imag) parts of the STFT of a 1-D signal batch."""
        spec = torch.stft(
            x,
            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
            return_complex=True)
        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
        return spec[..., 0], spec[..., 1]

    def _istft(self, magnitude, phase):
        """Reconstruct audio from magnitude and phase via inverse STFT."""
        # clip magnitude to avoid numerical blow-up from exp() upstream
        magnitude = torch.clip(magnitude, max=1e2)
        real = magnitude * torch.cos(phase)
        img = magnitude * torch.sin(phase)
        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
        return inverse_transform

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Synthesise audio from a mel-spectrogram x of shape (B, in_channels, T).

        Requires self.f0_predictor to be set (it is None by default).
        Returns a waveform clamped to [-audio_limit, audio_limit].
        """
        f0 = self.f0_predictor(x)
        s = self._f02source(f0)

        # spectral representation of the harmonic source, fused at every stage
        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            # fusion: add the downsampled source branch to the main branch
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            # average over the multi-receptive-field residual blocks
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant here

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all weight-normalised submodules.

        Only the modules actually wrapped with weight_norm in __init__ are
        touched: ups, resblocks, conv_pre, conv_post and source_resblocks.

        BUGFIX: the previous implementation called
        `self.source_module.remove_weight_norm()` — no attribute named
        `source_module` exists on this class (the NSF source is `self.m_source`,
        which contains no weight-normalised layers), so the call always raised
        AttributeError. It also looped `remove_weight_norm` over
        `self.source_downs`, which are plain Conv1d layers without weight norm,
        so that loop would raise ValueError. Both have been removed.
        """
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        for l in self.source_resblocks:
            l.remove_weight_norm()

    @torch.inference_mode()
    def inference(self, mel: torch.Tensor) -> torch.Tensor:
        """Inference entry point: identical to forward(), wrapped in inference_mode."""
        return self.forward(x=mel)
================================================
FILE: cosyvoice/llm/llm.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional, Union
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, unpad_sequence
from cosyvoice.utils.common import IGNORE_ID
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
from cosyvoice.utils.common import th_accuracy
class TransformerLM(torch.nn.Module):
    """Autoregressive speech-token language model.

    Encodes text tokens with `text_encoder`, then models the sequence
        [sos/eos] + speaker embedding + encoded text + [task_id] + speech tokens
    with `llm`, predicting the next speech token at each position. The output
    layer has `speech_token_size + 1` classes; the extra last class (index
    `speech_token_size`) acts as end-of-sequence.
    """
    def __init__(
            self,
            text_encoder_input_size: int,
            llm_input_size: int,
            llm_output_size: int,
            text_token_size: int,
            speech_token_size: int,
            text_encoder: torch.nn.Module,
            llm: torch.nn.Module,
            length_normalized_loss: bool = True,
            lsm_weight: float = 0.0,
            spk_embed_dim: int = 192,
    ):
        super().__init__()
        self.llm_input_size = llm_input_size
        self.speech_token_size = speech_token_size
        # 1. build text token inputs related modules
        self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
        self.text_encoder = text_encoder
        # project text-encoder output dim to the LM input dim
        self.text_encoder_affine_layer = nn.Linear(
            self.text_encoder.output_size(),
            llm_input_size
        )

        # 2. build speech token language model related modules
        self.sos_eos = 0  # row 0 of llm_embedding: shared <sos>/<eos> embedding
        self.task_id = 1  # row 1 of llm_embedding: task-id separator embedding
        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
        self.llm = llm
        # +1 output class for end-of-sequence (index speech_token_size)
        self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1)
        self.criterion_ce = LabelSmoothingLoss(
            size=speech_token_size + 1,
            padding_idx=IGNORE_ID,
            smoothing=lsm_weight,
            normalize_length=length_normalized_loss,
        )

        # 3. [Optional] build speech token related modules
        self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)

    def encode(
            self,
            text: torch.Tensor,
            text_lengths: torch.Tensor,
    ):
        """Run the text encoder and project its output to the LM input size.

        Returns (encoder_out, encoder_out_lens); lengths are recomputed from
        the encoder's output mask.
        """
        encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        encoder_out = self.text_encoder_affine_layer(encoder_out)
        return encoder_out, encoder_out_lens

    def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len):
        """Build per-sample LM inputs [sos/eos, spk-emb, text, task-id, speech] and re-pad them into a batch."""
        text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
        speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0) for i in range(len(text_token))]
        lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
        lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
        return lm_input, lm_input_len

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        """Compute the LM training loss and accuracy for one batch.

        Expected batch keys (as read below):
            text_token (B, L), text_token_len (B,),
            speech_token (B, T), speech_token_len (B,),
            embedding (B, spk_embed_dim)

        Returns:
            {'loss': label-smoothed CE loss, 'acc': token accuracy}
        """
        text_token = batch['text_token'].to(device)
        text_token_len = batch['text_token_len'].to(device)
        speech_token = batch['speech_token'].to(device)
        speech_token_len = batch['speech_token_len'].to(device)
        embedding = batch['embedding'].to(device)

        # 1. prepare llm_target: ignore the sos/eos + speaker-embedding + text
        # positions (2 + text_len), then supervise the speech tokens followed
        # by the EOS class (speech_token_size)
        lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)

        # 2. encode text_token
        text_token = self.text_embedding(text_token)
        text_token, text_token_len = self.encode(text_token, text_token_len)

        # 3. speaker embedding projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)
        embedding = embedding.unsqueeze(1)

        # 4. sos/eos and task_id embeddings
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)

        # 5. encode speech_token
        speech_token = self.speech_embedding(speech_token)

        # 6. unpad and pad into the full LM input sequence
        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len,
                                                         task_id_emb, speech_token, speech_token_len)

        # 7. run lm forward
        lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
        logits = self.llm_decoder(lm_output)
        loss = self.criterion_ce(logits, lm_target)
        acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID)
        return {'loss': loss, 'acc': acc}

    def sampling_ids(
            self,
            weighted_scores: torch.Tensor,
            sampling: Union[bool, int, float] = True,
            beam_size: int = 1,
            ignore_eos: bool = True,
    ):
        """Sample `beam_size` token id(s) from the top-`sampling` candidates.

        `weighted_scores` is a 1-D score vector over the output vocabulary.
        When `ignore_eos` is True, resample until no EOS id
        (self.speech_token_size) appears among the drawn ids.
        NOTE(review): `sampling` is used as the top-k size, so despite its
        Union type annotation it must effectively be an int at call time.
        """
        while True:
            prob, indices = weighted_scores.softmax(dim=-1).topk(sampling)
            top_ids = prob.multinomial(beam_size, replacement=True)
            top_ids = indices[top_ids]
            if (not ignore_eos) or (self.speech_token_size not in top_ids):
                break
        return top_ids

    @torch.inference_mode()
    def inference(
            self,
            text: torch.Tensor,
            text_len: torch.Tensor,
            prompt_text: torch.Tensor,
            prompt_text_len: torch.Tensor,
            prompt_speech_token: torch.Tensor,
            prompt_speech_token_len: torch.Tensor,
            embedding: torch.Tensor,
            beam_size: int = 1,
            sampling: int = 25,
            max_token_text_ratio: float = 20,
            min_token_text_ratio: float = 2,
    ) -> torch.Tensor:
        """Autoregressively generate speech tokens for `text`.

        The prompt text is prepended to the input text, and prompt speech
        tokens (if any) are appended after the task-id embedding so the model
        continues the prompt. Generation length is bounded by
        min/max_token_text_ratio times the non-prompt text length.

        Returns:
            (1, num_generated_tokens) int64 tensor of speech token ids
            (EOS not included).
        """
        device = text.device
        text = torch.concat([prompt_text, text], dim=1)
        text_len += prompt_text_len
        text = self.text_embedding(text)

        # 1. encode text
        text, text_len = self.encode(text, text_len)

        # 2. encode speaker embedding (zero-length placeholder when absent)
        if embedding.shape[0] != 0:
            embedding = F.normalize(embedding, dim=1)
            embedding = self.spk_embed_affine_layer(embedding)
            embedding = embedding.unsqueeze(dim=1)
        else:
            embedding = torch.zeros(1, 0, self.llm_input_size).to(device)

        # 3. concat llm_input
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
        if prompt_speech_token_len != 0:
            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
        else:
            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size).to(device)
        lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)

        # 4. cal min/max_length from the non-prompt portion of the text
        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)

        # 5. step by step decode with the LLM's chunk-wise KV cache; each step
        # feeds only the newly generated token's embedding
        out_tokens = []
        offset = 0
        att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
        for i in range(max_len):
            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache,
                                                                  att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool))
            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
            # EOS is forbidden until min_len tokens have been produced
            top_ids = self.sampling_ids(logp.squeeze(dim=0), sampling, beam_size, ignore_eos=True if i < min_len else False).item()
            if top_ids == self.speech_token_size:
                break
            out_tokens.append(top_ids)
            offset += lm_input.size(1)
            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)

        return torch.tensor([out_tokens], dtype=torch.int64, device=device)
================================================
FILE: cosyvoice/transformer/__init__.py
================================================
================================================
FILE: cosyvoice/transformer/activation.py
================================================
# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
# 2020 Northwestern Polytechnical University (Pengcheng Guo)
# 2020 Mobvoi Inc (Binbin Zhang)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Swish() activation function for Conformer."""
import torch
from torch import nn, sin, pow
from torch.nn import Parameter
class Swish(torch.nn.Module):
    """Swish (SiLU) activation: f(x) = x * sigmoid(x)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the Swish activation element-wise."""
        gate = torch.sigmoid(x)
        return gate * x
# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
# LICENSE is in incl_licenses directory.
class Snake(nn.Module):
    '''
    Sine-based periodic activation function:
        snake(x) = x + (1/alpha) * sin^2(alpha * x)

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable per-channel parameter controlling frequency
    References:
        - This activation function is from the paper by Liu Ziyin,
          Tilman Hartwig, Masahito Ueda:
          https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: number of channels of the input
            - alpha: initial value of the trainable parameter;
              higher values mean higher frequency. With alpha_logscale=True
              the parameter is stored in log space and initialised at zero
              (i.e. an effective alpha of exp(0) = 1).
        '''
        super(Snake, self).__init__()
        self.in_features = in_features

        self.alpha_logscale = alpha_logscale
        # log-scale alphas start from zeros, linear-scale alphas from ones
        base = torch.zeros(in_features) if alpha_logscale else torch.ones(in_features)
        self.alpha = Parameter(base * alpha)
        self.alpha.requires_grad = alpha_trainable

        # small epsilon guarding the 1/alpha division
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Apply the activation element-wise: x + 1/a * sin^2(a * x).
        '''
        # line the per-channel alpha up with x as [1, C, 1] for broadcasting
        a = self.alpha.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            a = torch.exp(a)
        return x + (1.0 / (a + self.no_div_by_zero)) * pow(sin(x * a), 2)
================================================
FILE: cosyvoice/transformer/attention.py
================================================
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""
import math
from typing import Tuple
import torch
from torch import nn
class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        key_bias (bool): Whether the key projection has a bias term
            (False for whisper-style models).
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an MultiHeadedAttention object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform query, key and value with their per-head projections.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).
        """
        n_batch = value.size(0)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
        #           1st chunk to ease the onnx export.]
        #   2. pytorch training
        if mask.size(2) > 0:  # time2 > 0
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For last chunk, time2 might be larger than scores.size(-1)
            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
            # zero out masked positions after softmax so fully-masked rows
            # contribute nothing instead of NaN
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0)  # (batch, head, time1, time2)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
        #   1. onnx(16/-1, -1/-1, 16/0)
        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
                                                 self.h * self.d_k)
             )  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
                1.When applying cross attention between decoder and encoder,
                the batch padding mask for input is in (#batch, 1, T) shape.
                2.When applying self attention of encoder,
                the mask is in (#batch, T, T) shape.
                3.When applying self attention of decoder,
                the mask is in (#batch, L, L) shape.
                4.If the different position in decoder see different block
                of the encoder, such as Mocha, the passed in mask could be
                in (#batch, L, T) shape. But there is no such case in current
                CosyVoice.
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)

        # NOTE(xcsong):
        #   when export onnx model, for 1st chunk, we feed
        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #       In all modes, `if cache.size(0) > 0` will always be `True`
        #       and we will always do splitting and
        #       concatenation(this will simplify onnx export). Note that
        #       it's OK to concat & split zero-shaped tensors(see code below).
        #   when export jit  model, for 1st chunk, we always feed
        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            # cache stores keys and values concatenated along the last dim
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding.

    Paper: https://arxiv.org/abs/1901.02860

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
        key_bias (bool): Whether the key projection has a bias term.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate, key_bias)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Shifts the relative-position axis so that element (i, j) aligns with
        relative distance i - j, using the standard zero-pad-and-reshape trick.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
                time1 means the length of query vector.

        Returns:
            torch.Tensor: Output tensor, truncated to positions 0..time2.
        """
        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)[
            :, :, :, : x.size(-1) // 2 + 1
        ]  # only keep the positions from 0 to time2
        return x

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, time2, size).
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        # NOTE(xcsong):
        #   when export onnx model, for 1st chunk, we feed
        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #       In all modes, `if cache.size(0) > 0` will always be `True`
        #       and we will always do splitting and
        #       concatenation(this will simplify onnx export). Note that
        #       it's OK to concat & split zero-shaped tensors(see code below).
        #   when export jit  model, for 1st chunk, we always feed
        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
        if matrix_ac.shape != matrix_bd.shape:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
================================================
FILE: cosyvoice/transformer/convolution.py
================================================
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from typing import Tuple
import torch
from torch import nn
class ConvolutionModule(nn.Module):
    """ConvolutionModule in Conformer model.

    Pointwise conv + GLU, depthwise conv, norm + activation, pointwise conv,
    with optional causal left-padding and a left-context cache for streaming.
    """

    def __init__(self,
                 channels: int,
                 kernel_size: int = 15,
                 activation: nn.Module = nn.ReLU(),
                 norm: str = "batch_norm",
                 causal: bool = False,
                 bias: bool = True):
        """Construct an ConvolutionModule object.

        Args:
            channels (int): The number of channels of conv layers.
            kernel_size (int): Kernel size of conv layers.
            activation (nn.Module): Activation applied after normalization.
            norm (str): 'batch_norm' or 'layer_norm'.
            causal (bool): Whether use causal convolution or not.
            bias (bool): Whether the conv layers have bias terms.
        """
        super().__init__()

        self.pointwise_conv1 = nn.Conv1d(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # self.lorder is used to distinguish if it's a causal convolution,
        # if self.lorder > 0: it's a causal convolution, the input will be
        #    padded with self.lorder frames on the left in forward.
        # else: it's a symmetrical convolution
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for none causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias=bias,
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1d(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation

    def forward(
        self,
        x: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        cache: torch.Tensor = torch.zeros((0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute convolution module.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).
            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
                (0, 0, 0) means fake mask.
            cache (torch.Tensor): left context cache, it is only
                used in causal convolution (#batch, channels, cache_t),
                (0, 0, 0) means a fake cache.

        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).
            torch.Tensor: New left-context cache (#batch, channels, lorder),
                or a (0, 0, 0) placeholder for non-causal convolution.
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose(1, 2)  # (#batch, channels, time)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        if self.lorder > 0:
            if cache.size(2) == 0:  # cache_t == 0
                # no history yet: left-pad with zeros for causality
                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
            else:
                assert cache.size(0) == x.size(0)  # equal batch
                assert cache.size(1) == x.size(1)  # equal channel
                x = torch.cat((cache, x), dim=2)
            assert (x.size(2) > self.lorder)
            # keep the last lorder frames as left context for the next chunk
            new_cache = x[:, :, -self.lorder:]
        else:
            # It's better we just return None if no cache is required,
            # However, for JIT export, here we just fake one tensor instead of
            # None.
            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        # LayerNorm normalises over the channel dim, which must be last,
        # hence the transposes around norm + activation
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.pointwise_conv2(x)
        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        return x.transpose(1, 2), new_cache
================================================
FILE: cosyvoice/transformer/decoder.py
================================================
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Decoder definition."""
from typing import Tuple, List, Optional
import torch
import torch.utils.checkpoint as ckpt
import logging
from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
COSYVOICE_EMB_CLASSES,
COSYVOICE_ATTENTION_CLASSES,
COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)
class TransformerDecoder(torch.nn.Module):
    """Base class of Transformer decoder module.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        src_attention: if false, encoder-decoder cross attention is not
            applied, such as CIF model
        key_bias: whether use bias in attention.linear_k, False for whisper models.
        gradient_checkpointing: rerunning a forward-pass segment for each
            checkpointed segment during backward.
        tie_word_embedding: Tie or clone module weights depending of whether we are
            using TorchScript or not
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        src_attention: bool = True,
        key_bias: bool = True,
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        attention_dim = encoder_output_size
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        # Token embedding followed by positional encoding.  With "no_pos"
        # the token lookup is replaced by Identity, so only the positional
        # module (which for "no_pos" adds nothing) is applied.
        self.embed = torch.nn.Sequential(
            torch.nn.Identity() if input_layer == "no_pos" else
            torch.nn.Embedding(vocab_size, attention_dim),
            COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
                                               positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
        self.use_output_layer = use_output_layer
        # Identity keeps the call site uniform when no projection is wanted.
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = torch.nn.Identity()
        self.num_blocks = num_blocks
        # Stack of decoder layers; cross-attention is disabled per-layer by
        # passing None when src_attention is False (e.g. CIF-style models).
        self.decoders = torch.nn.ModuleList([
            DecoderLayer(
                attention_dim,
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    self_attention_dropout_rate, key_bias),
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim, src_attention_dropout_rate,
                    key_bias) if src_attention else None,
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate, activation),
                dropout_rate,
                normalize_before,
            ) for _ in range(self.num_blocks)
        ])
        self.gradient_checkpointing = gradient_checkpointing
        self.tie_word_embedding = tie_word_embedding

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor = torch.empty(0),
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.

        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: not used in transformer decoder, in order to unify api
                with bidirectional decoder
            reverse_weight: not used in transformer decoder, in order to unify
                api with bidirectional decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                torch.tensor(0.0), in order to unify api with bidirectional decoder
                olens: (batch, )
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        tgt = ys_in_pad
        maxlen = tgt.size(1)
        # tgt_mask: (B, 1, L) — True on real (non-padded) positions.
        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
        tgt_mask = tgt_mask.to(tgt.device)
        # m: (1, L, L) — lower-triangular causal mask.
        m = subsequent_mask(tgt_mask.size(-1),
                            device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L) — padding mask AND causal mask combined.
        tgt_mask = tgt_mask & m
        x, _ = self.embed(tgt)
        if self.gradient_checkpointing and self.training:
            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
                                                 memory_mask)
        else:
            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)
        # NOTE(review): tgt_mask is (B, L, L) at this point, so sum(1)
        # yields (B, L) rather than the (batch,) documented above; this
        # matches the upstream WeNet implementation — confirm intended.
        olens = tgt_mask.sum(1)
        return x, torch.tensor(0.0), olens

    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
                       memory: torch.Tensor,
                       memory_mask: torch.Tensor) -> torch.Tensor:
        # Plain sequential pass through all decoder layers.
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        return x

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                    tgt_mask: torch.Tensor,
                                    memory: torch.Tensor,
                                    memory_mask: torch.Tensor) -> torch.Tensor:
        # Same as forward_layers but re-runs each layer's forward during
        # backward to save activation memory; dropped from TorchScript
        # export via @torch.jit.ignore(drop=True).
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
                layer.__call__, x, tgt_mask, memory, memory_mask)
        return x

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.

        This is only used for decoding.

        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        # Each layer consumes its own cache entry and appends its full
        # output (including the cached prefix) as the new cache.
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(x,
                                                       tgt_mask,
                                                       memory,
                                                       memory_mask,
                                                       cache=c)
            new_cache.append(x)
        # Only the last time step is needed for the next-token prediction.
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = torch.log_softmax(self.output_layer(y), dim=-1)
        return y, new_cache

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending of whether we are using TorchScript or not"""
        if not self.use_output_layer:
            return
        if jit_mode:
            # TorchScript cannot share parameters between modules, so the
            # embedding weight is cloned into the output projection.
            logging.info("clone emb.weight to output.weight")
            self.output_layer.weight = torch.nn.Parameter(
                self.embed[0].weight.clone())
        else:
            # Eager mode: truly tie (share) the parameter.
            logging.info("tie emb.weight with output.weight")
            self.output_layer.weight = self.embed[0].weight
        # Pad the bias with zeros if the tied weight has more rows than the
        # original output layer (e.g. after vocabulary resizing).
        if getattr(self.output_layer, "bias", None) is not None:
            self.output_layer.bias.data = torch.nn.functional.pad(
                self.output_layer.bias.data,
                (
                    0,
                    self.output_layer.weight.shape[0] -
                    self.output_layer.bias.shape[0],
                ),
                "constant",
                0,
            )
class BiTransformerDecoder(torch.nn.Module):
    """Bidirectional Transformer decoder: a left-to-right decoder plus an
    optional right-to-left decoder over the reversed target sequence.

    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        r_num_blocks: the number of right to left decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        key_bias: whether use bias in attention.linear_k, False for whisper models.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        r_num_blocks: int = 0,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        self.tie_word_embedding = tie_word_embedding
        # Standard left-to-right decoder with num_blocks layers.
        self.left_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)
        # Right-to-left decoder with r_num_blocks layers (may be 0,
        # yielding an empty layer stack).
        self.right_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            r_num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor,
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.

        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
                used for right to left decoder
            reverse_weight: used for right to left decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                r_x: decoded token score (right to left decoder)
                    before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
                                          ys_in_lens)
        # The right-to-left pass is skipped entirely unless it contributes
        # to the loss (reverse_weight > 0).
        r_x = torch.tensor(0.0)
        if reverse_weight > 0.0:
            r_x, _, olens = self.right_decoder(memory, memory_mask,
                                               r_ys_in_pad, ys_in_lens)
        return l_x, r_x, olens

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.

        This is only used for decoding; step decoding always uses the
        left-to-right decoder only.

        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=torch.uint8 in PyTorch 1.2-
                dtype=torch.bool in PyTorch 1.2+ (include 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
                                                  tgt_mask, cache)

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
        depending of whether we are using TorchScript or not"""
        self.left_decoder.tie_or_clone_weights(jit_mode)
        self.right_decoder.tie_or_clone_weights(jit_mode)
================================================
FILE: cosyvoice/transformer/decoder_layer.py
================================================
# Copyright (c) 2019 Shigeki Karita
# 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple
import torch
from torch import nn
class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Applies (pre- or post-norm) residual self-attention, optional
    encoder-decoder cross-attention, and a position-wise feed-forward
    network, in that order.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, Inter-attention is not used, such as
            CIF, GPT, and other decoder only model.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: to use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # One LayerNorm per sub-block: self-attn, cross-attn, feed-forward.
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            # BUGFIX: the assert message was a plain string literal missing
            # the f-prefix, so it printed the placeholder text verbatim
            # instead of the actual shapes.
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        # Residual self-attention (only the last query frame when cached).
        x = residual + self.dropout(
            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        # Optional residual encoder-decoder cross-attention.
        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0])
            if not self.normalize_before:
                x = self.norm2(x)

        # Residual position-wise feed-forward.
        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        # Re-attach the cached prefix so the output covers the full sequence.
        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
================================================
FILE: cosyvoice/transformer/embedding.py
================================================
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Positonal Encoding Module."""
import math
from typing import Tuple, Union
import torch
import torch.nn.functional as F
import numpy as np
class PositionalEncoding(torch.nn.Module):
    """Absolute sinusoidal positional encoding.

    :param int d_model: embedding dim
    :param float dropout_rate: dropout rate
    :param int max_len: maximum input length

    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
    """

    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int = 5000,
                 reverse: bool = False):
        """Construct an PositionalEncoding object.

        NOTE(review): ``reverse`` is accepted (subclasses pass it) but is
        never stored or used in this class — confirm it is intentionally
        a no-op here.
        """
        super().__init__()
        self.d_model = d_model
        # Inputs are scaled by sqrt(d_model) before the encoding is added.
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.max_len = max_len

        # Precompute the (1, max_len, d_model) table.  NOTE(review): pe is
        # a plain attribute, not a registered buffer, so it is moved to the
        # input's device lazily in forward() and is not part of state_dict.
        self.pe = torch.zeros(self.max_len, self.d_model)
        position = torch.arange(0, self.max_len,
                                dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
            -(math.log(10000.0) / self.d_model))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, ...)
            offset (int, torch.tensor): position offset

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            torch.Tensor: for compatibility to RelPositionalEncoding
        """
        self.pe = self.pe.to(x.device)
        pos_emb = self.position_encoding(offset, x.size(1), False)
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int,
                          apply_dropout: bool = True) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        # How to subscript a Union type:
        #   https://github.com/pytorch/pytorch/issues/69434
        if isinstance(offset, int):
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        else:  # for batched streaming decoding on GPU
            # Each batch element may have its own offset; gather rows of the
            # table via embedding lookup.
            assert torch.max(offset) + size <= self.max_len
            index = offset.unsqueeze(1) + \
                torch.arange(0, size).to(offset.device)  # B X T
            flag = index > 0
            # remove negative offset
            index = index * flag
            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
        if apply_dropout:
            pos_emb = self.dropout(pos_emb)
        return pos_emb
class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module (Transformer-XL style).

    See Appendix B in https://arxiv.org/abs/1901.02860.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
        """Initialize class."""
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Scale the input and return it with its positional embedding.

        Unlike the absolute variant, the embedding is returned separately
        instead of being summed into the input.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).
        """
        # Keep the precomputed table on the same device as the input.
        self.pe = self.pe.to(x.device)
        scaled = self.xscale * x
        emb = self.position_encoding(offset, scaled.size(1), False)
        return self.dropout(scaled), self.dropout(emb)
class WhisperPositionalEncoding(PositionalEncoding):
    """Sinusoidal position encoding as used in openai-whisper's encoder.

    The first half of the channels holds sine components and the second
    half cosine components, and no sqrt(d_model) input scaling is applied.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
        super().__init__(d_model, dropout_rate, max_len)
        # Whisper feeds the raw features, so disable the parent's scaling.
        self.xscale = 1.0
        half_dim = d_model // 2
        log_timescale_increment = np.log(10000) / (half_dim - 1)
        inv_timescales = torch.exp(-log_timescale_increment *
                                   torch.arange(half_dim))
        scaled_time = torch.arange(max_len).unsqueeze(1) * \
            inv_timescales.unsqueeze(0)
        table = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)],
                          dim=1)
        # Swap the parent's plain attribute for a registered buffer so the
        # table follows .to()/.cuda() and appears in the state dict.
        delattr(self, "pe")
        self.register_buffer("pe", table.unsqueeze(0))
class LearnablePositionalEncoding(PositionalEncoding):
    """Learnable (trained) position encoding, as in openai-whisper's decoder."""

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
        super().__init__(d_model, dropout_rate, max_len)
        # Overwrite the parent's sqrt(d_model) scaling with identity.
        self.xscale = 1.0
        # Replace the fixed sinusoidal table with a trainable parameter.
        # NOTE(review): left uninitialized (torch.empty); presumably the
        # real values come from a loaded checkpoint — confirm.
        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
class NoPositionalEncoding(torch.nn.Module):
    """Stub module that injects no positional information.

    Returns an all-zero "embedding" so it can be dropped in wherever a
    positional-encoding module is expected.
    """

    def __init__(self, d_model: int, dropout_rate: float):
        super().__init__()
        self.d_model = d_model
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply dropout to ``x`` and pair it with a zero embedding."""
        zeros = torch.zeros(1, x.size(1), self.d_model).to(x.device)
        return self.dropout(x), zeros

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """Return an all-zero encoding of the requested length."""
        return torch.zeros(1, size, self.d_model)
class EspnetRelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding module (new implementation).

    The cached table covers both positive and negative relative positions
    (length ``2 * input_len - 1``) to support the shifting trick of
    Transformer-XL.  Details: https://github.com/espnet/espnet/pull/2816
    and Appendix B in https://arxiv.org/abs/1901.02860.

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Construct an PositionalEncoding object."""
        super(EspnetRelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        # Eagerly build the table for max_len positions.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Rebuild the cached table if it cannot cover ``x``'s length."""
        if self.pe is not None:
            # The table holds both signs, hence length 2 * input_len - 1.
            if self.pe.size(1) >= x.size(1) * 2 - 1:
                # Long enough already; just match dtype and device.
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        # With query position i and key position j, positive relative
        # positions are used for i > j and negative ones for i < j.
        length = x.size(1)
        positions = torch.arange(0, length, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        pos_part = torch.zeros(length, self.d_model)
        neg_part = torch.zeros(length, self.d_model)
        pos_part[:, 0::2] = torch.sin(positions * div_term)
        pos_part[:, 1::2] = torch.cos(positions * div_term)
        neg_part[:, 0::2] = torch.sin(-1 * positions * div_term)
        neg_part[:, 1::2] = torch.cos(-1 * positions * div_term)
        # Flip the positive half so relative position decreases along the
        # time axis, then append the strictly-negative half (row 0 of the
        # negative part duplicates relative position 0, so it is skipped).
        pos_part = torch.flip(pos_part, [0]).unsqueeze(0)
        neg_part = neg_part[1:].unsqueeze(0)
        pe = torch.cat([pos_part, neg_part], dim=1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
        """Scale the input and return it with its relative embedding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding (1, 2*time-1, `*`).
        """
        self.extend_pe(x)
        scaled = self.xscale * x
        pos_emb = self.position_encoding(size=scaled.size(1), offset=offset)
        return self.dropout(scaled), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        we apply dropout only once at the whole utterance level in a none
        streaming way, but will call this function several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        # Slice symmetrically around the center of the table (relative
        # position 0), yielding 2 * size - 1 rows.
        center = self.pe.size(1) // 2
        return self.pe[:, center - size + 1:center + size]
================================================
FILE: cosyvoice/transformer/encoder.py
================================================
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
# 2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder definition."""
from typing import Tuple
import torch
import torch.utils.checkpoint as ckpt
from cosyvoice.transformer.convolution import ConvolutionModule
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
COSYVOICE_EMB_CLASSES,
COSYVOICE_SUBSAMPLE_CLASSES,
COSYVOICE_ATTENTION_CLASSES,
COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import make_pad_mask
from cosyvoice.utils.mask import add_optional_chunk_mask
class BaseEncoder(torch.nn.Module):
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        gradient_checkpointing: bool = False,
    ):
        """
        Args:
            input_size (int): input dim
            output_size (int): dimension of attention
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of encoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether use dynamic chunk size for
                training or not, You can only use fixed chunk(chunk_size > 0)
                or dynamic chunk size(use_dynamic_chunk = True)
            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
            gradient_checkpointing: rerunning a forward-pass segment for each
                checkpointed segment during backward.
        """
        super().__init__()
        self._output_size = output_size
        self.global_cmvn = global_cmvn
        # Subsampling front-end; its positional-encoding module is chosen by
        # pos_enc_layer_type and injected into the subsampling layer.
        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
            input_size,
            output_size,
            dropout_rate,
            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
                                                      positional_dropout_rate),
        )
        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk
        self.gradient_checkpointing = gradient_checkpointing
    def output_size(self) -> int:
        """Return the encoder output feature dimension."""
        return self._output_size
    def forward(
        self,
        xs: torch.Tensor,
        xs_lens: torch.Tensor,
        decoding_chunk_size: int = 0,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed positions in tensor.

        Args:
            xs: padded input tensor (B, T, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and subsampled masks
            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
            masks: torch.Tensor batch padding mask after subsample
                (B, 1, T' ~= T/subsample_rate)
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        T = xs.size(1)
        # masks: (B, 1, T) — True on real (non-padded) frames.
        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
        # Optional global CMVN normalization before subsampling.
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        # Subsampling front-end also subsamples the mask.
        xs, pos_emb, masks = self.embed(xs, masks)
        mask_pad = masks  # (B, 1, T/subsample_rate)
        # Build the per-position attention mask for chunk-based training /
        # decoding (static, dynamic, or full-context).
        chunk_masks = add_optional_chunk_mask(xs, masks,
                                              self.use_dynamic_chunk,
                                              self.use_dynamic_left_chunk,
                                              decoding_chunk_size,
                                              self.static_chunk_size,
                                              num_decoding_left_chunks)
        if self.gradient_checkpointing and self.training:
            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
                                                  mask_pad)
        else:
            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks
def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
pos_emb: torch.Tensor,
mask_pad: torch.Tensor) -> torch.Tensor:
for layer in self.encoders:
xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
return xs
    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, xs: torch.Tensor,
                                    chunk_masks: torch.Tensor,
                                    pos_emb: torch.Tensor,
                                    mask_pad: torch.Tensor) -> torch.Tensor:
        # Same as forward_layers but wraps each layer in
        # torch.utils.checkpoint so activations are re-computed during
        # backward instead of stored; dropped from TorchScript export via
        # @torch.jit.ignore(drop=True).
        for layer in self.encoders:
            xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
                                                    chunk_masks, pos_emb,
                                                    mask_pad)
        return xs
def forward_chunk(
    self,
    xs: torch.Tensor,
    offset: int,
    required_cache_size: int,
    att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
    cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
    att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """ Forward just one chunk
    Args:
        xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
            where `time == (chunk_size - 1) * subsample_rate + \
                    subsample.right_context + 1`
        offset (int): current offset in encoder output time stamp
        required_cache_size (int): cache size required for next chunk
            compuation
            >=0: actual cache size
            <0: means all history cache is required
        att_cache (torch.Tensor): cache tensor for KEY & VALUE in
            transformer/conformer attention, with shape
            (elayers, head, cache_t1, d_k * 2), where
            `head * d_k == hidden-dim` and
            `cache_t1 == chunk_size * num_decoding_left_chunks`.
        cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
            (elayers, b=1, hidden-dim, cache_t2), where
            `cache_t2 == cnn.lorder - 1`
        att_mask (torch.Tensor): attention mask applied inside each layer;
            an empty tensor (the default) means no masking.
    Returns:
        torch.Tensor: output of current input xs,
            with shape (b=1, chunk_size, hidden-dim).
        torch.Tensor: new attention cache required for next chunk, with
            dynamic shape (elayers, head, ?, d_k * 2)
            depending on required_cache_size.
        torch.Tensor: new conformer cnn cache required for next chunk, with
            same shape as the original cnn_cache.
    """
    # Streaming inference handles a single utterance at a time.
    assert xs.size(0) == 1
    # tmp_masks is just for interface compatibility: the chunk contains
    # no padding, so the mask is all-True.
    tmp_masks = torch.ones(1,
                           xs.size(1),
                           device=xs.device,
                           dtype=torch.bool)
    tmp_masks = tmp_masks.unsqueeze(1)  # (1, 1, time)
    if self.global_cmvn is not None:
        xs = self.global_cmvn(xs)
    # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
    xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
    # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
    elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
    chunk_size = xs.size(1)
    attention_key_size = cache_t1 + chunk_size
    # Recompute positional encodings to span the cached history plus the
    # current chunk, so attention keys line up with the cached values.
    pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
                                           size=attention_key_size)
    # Decide where the cache kept for the NEXT chunk starts:
    # <0 keeps all history, ==0 keeps nothing, >0 keeps the last
    # required_cache_size frames.
    if required_cache_size < 0:
        next_cache_start = 0
    elif required_cache_size == 0:
        next_cache_start = attention_key_size
    else:
        next_cache_start = max(attention_key_size - required_cache_size, 0)
    r_att_cache = []
    r_cnn_cache = []
    for i, layer in enumerate(self.encoders):
        # NOTE(xcsong): Before layer.forward
        #   shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
        #   shape(cnn_cache[i])       is (b=1, hidden-dim, cache_t2)
        # An empty cache tensor is passed through unsliced on the first
        # chunk (elayers == 0 / cnn_cache.size(0) == 0).
        xs, _, new_att_cache, new_cnn_cache = layer(
            xs,
            att_mask,
            pos_emb,
            att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
            cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
        # NOTE(xcsong): After layer.forward
        #   shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
        #   shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
        r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
        r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
    if self.normalize_before:
        xs = self.after_norm(xs)
    # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
    #   ? may be larger than cache_t1, it depends on required_cache_size
    r_att_cache = torch.cat(r_att_cache, dim=0)
    # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
    r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
    return (xs, r_att_cache, r_cnn_cache)
def forward_chunk_by_chunk(
    self,
    xs: torch.Tensor,
    decoding_chunk_size: int,
    num_decoding_left_chunks: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Forward input chunk by chunk with chunk_size like a streaming
        fashion

    Here we should pay special attention to computation cache in the
    streaming style forward chunk by chunk. Three things should be taken
    into account for computation in the current network:
        1. transformer/conformer encoder layers output cache
        2. convolution in conformer
        3. convolution in subsampling

    However, we don't implement subsampling cache for:
        1. We can control subsampling module to output the right result by
           overlapping input instead of cache left context, even though it
           wastes some computation, but subsampling only takes a very
           small fraction of computation in the whole model.
        2. Typically, there are several covolution layers with subsampling
           in subsampling module, it is tricky and complicated to do cache
           with different convolution layers with different subsampling
           rate.
        3. Currently, nn.Sequential is used to stack all the convolution
           layers in subsampling, we need to rewrite it to make it work
           with cache, which is not preferred.
    Args:
        xs (torch.Tensor): (1, max_len, dim)
        chunk_size (int): decoding chunk size
    """
    assert decoding_chunk_size > 0
    # The model must have been trained with static or dynamic chunks.
    assert self.static_chunk_size > 0 or self.use_dynamic_chunk
    subsampling = self.embed.subsampling_rate
    context = self.embed.right_context + 1  # Add current frame
    stride = subsampling * decoding_chunk_size
    decoding_window = (decoding_chunk_size - 1) * subsampling + context
    num_frames = xs.size(1)
    att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
    cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
    required_cache_size = decoding_chunk_size * num_decoding_left_chunks
    chunk_outputs = []
    offset = 0
    # Slide an overlapping window over the input, feeding one chunk at a
    # time and threading the attention/CNN caches through.
    for start in range(0, num_frames - context + 1, stride):
        stop = min(start + decoding_window, num_frames)
        (chunk_out, att_cache,
         cnn_cache) = self.forward_chunk(xs[:, start:stop, :], offset,
                                         required_cache_size, att_cache,
                                         cnn_cache)
        chunk_outputs.append(chunk_out)
        offset += chunk_out.size(1)
    ys = torch.cat(chunk_outputs, 1)
    # No padding in streaming mode, so the mask is all-True.
    masks = torch.ones((1, 1, ys.size(1)),
                       device=ys.device,
                       dtype=torch.bool)
    return ys, masks
class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        key_bias: bool = True,
        selfattention_layer_type: str = "selfattn",
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
    ):
        """ Construct TransformerEncoder

        See Encoder for the meaning of each parameter.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        # A single activation instance is shared by every FFN, matching
        # the registry-based construction used elsewhere in this package.
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
        attention_cls = COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type]
        layers = []
        for _ in range(num_blocks):
            # Build attention first, then FFN, then the layer, keeping the
            # same module-construction order as the rest of the codebase.
            self_attn = attention_cls(attention_heads, output_size,
                                      attention_dropout_rate, key_bias)
            feed_forward = PositionwiseFeedForward(output_size, linear_units,
                                                   dropout_rate, activation)
            layers.append(
                TransformerEncoderLayer(output_size, self_attn, feed_forward,
                                        dropout_rate, normalize_before))
        self.encoders = torch.nn.ModuleList(layers)
class ConformerEncoder(BaseEncoder):
"""Conformer encoder module."""
def __init__(
self,
input_size: int,
output_size: int = 256,
attention_heads: int = 4,
linear_units: int = 2048,
num_blocks: int = 6,
gitextract_wldclo_i/
├── .dockerignore
├── Dockerfile
├── LICENSE
├── README.md
├── api.py
├── batch_inference.py
├── compose.yaml
├── cosyvoice/
│ ├── __init__.py
│ ├── bin/
│ │ ├── inference.py
│ │ └── train.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── cosyvoice.py
│ │ ├── frontend.py
│ │ └── model.py
│ ├── dataset/
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ └── processor.py
│ ├── flow/
│ │ ├── decoder.py
│ │ ├── flow.py
│ │ ├── flow_matching.py
│ │ └── length_regulator.py
│ ├── hifigan/
│ │ ├── f0_predictor.py
│ │ └── generator.py
│ ├── llm/
│ │ └── llm.py
│ ├── transformer/
│ │ ├── __init__.py
│ │ ├── activation.py
│ │ ├── attention.py
│ │ ├── convolution.py
│ │ ├── decoder.py
│ │ ├── decoder_layer.py
│ │ ├── embedding.py
│ │ ├── encoder.py
│ │ ├── encoder_layer.py
│ │ ├── label_smoothing_loss.py
│ │ ├── positionwise_feed_forward.py
│ │ └── subsampling.py
│ └── utils/
│ ├── __init__.py
│ ├── class_utils.py
│ ├── common.py
│ ├── executor.py
│ ├── file_utils.py
│ ├── frontend_utils.py
│ ├── mask.py
│ ├── scheduler.py
│ └── train_utils.py
├── data/
│ └── batch_files.csv
├── openai_api_inference.py
├── requirements.txt
├── results/
│ └── .gitkeep
├── run_batch_inference.sh
├── run_single_inference.sh
├── single_inference.py
├── third_party/
│ └── Matcha-TTS/
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── Makefile
│ ├── README.md
│ ├── configs/
│ │ ├── __init__.py
│ │ ├── callbacks/
│ │ │ ├── default.yaml
│ │ │ ├── model_checkpoint.yaml
│ │ │ ├── model_summary.yaml
│ │ │ ├── none.yaml
│ │ │ └── rich_progress_bar.yaml
│ │ ├── data/
│ │ │ ├── hi-fi_en-US_female.yaml
│ │ │ ├── ljspeech.yaml
│ │ │ └── vctk.yaml
│ │ ├── debug/
│ │ │ ├── default.yaml
│ │ │ ├── fdr.yaml
│ │ │ ├── limit.yaml
│ │ │ ├── overfit.yaml
│ │ │ └── profiler.yaml
│ │ ├── eval.yaml
│ │ ├── experiment/
│ │ │ ├── hifi_dataset_piper_phonemizer.yaml
│ │ │ ├── ljspeech.yaml
│ │ │ ├── ljspeech_min_memory.yaml
│ │ │ └── multispeaker.yaml
│ │ ├── extras/
│ │ │ └── default.yaml
│ │ ├── hparams_search/
│ │ │ └── mnist_optuna.yaml
│ │ ├── hydra/
│ │ │ └── default.yaml
│ │ ├── local/
│ │ │ └── .gitkeep
│ │ ├── logger/
│ │ │ ├── aim.yaml
│ │ │ ├── comet.yaml
│ │ │ ├── csv.yaml
│ │ │ ├── many_loggers.yaml
│ │ │ ├── mlflow.yaml
│ │ │ ├── neptune.yaml
│ │ │ ├── tensorboard.yaml
│ │ │ └── wandb.yaml
│ │ ├── model/
│ │ │ ├── cfm/
│ │ │ │ └── default.yaml
│ │ │ ├── decoder/
│ │ │ │ └── default.yaml
│ │ │ ├── encoder/
│ │ │ │ └── default.yaml
│ │ │ ├── matcha.yaml
│ │ │ └── optimizer/
│ │ │ └── adam.yaml
│ │ ├── paths/
│ │ │ └── default.yaml
│ │ ├── train.yaml
│ │ └── trainer/
│ │ ├── cpu.yaml
│ │ ├── ddp.yaml
│ │ ├── ddp_sim.yaml
│ │ ├── default.yaml
│ │ ├── gpu.yaml
│ │ └── mps.yaml
│ ├── matcha/
│ │ ├── VERSION
│ │ ├── __init__.py
│ │ ├── app.py
│ │ ├── cli.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── components/
│ │ │ │ └── __init__.py
│ │ │ └── text_mel_datamodule.py
│ │ ├── hifigan/
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── config.py
│ │ │ ├── denoiser.py
│ │ │ ├── env.py
│ │ │ ├── meldataset.py
│ │ │ ├── models.py
│ │ │ └── xutils.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── baselightningmodule.py
│ │ │ ├── components/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── decoder.py
│ │ │ │ ├── flow_matching.py
│ │ │ │ ├── text_encoder.py
│ │ │ │ └── transformer.py
│ │ │ └── matcha_tts.py
│ │ ├── onnx/
│ │ │ ├── __init__.py
│ │ │ ├── export.py
│ │ │ └── infer.py
│ │ ├── text/
│ │ │ ├── __init__.py
│ │ │ ├── cleaners.py
│ │ │ ├── numbers.py
│ │ │ └── symbols.py
│ │ ├── train.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── audio.py
│ │ ├── generate_data_statistics.py
│ │ ├── instantiators.py
│ │ ├── logging_utils.py
│ │ ├── model.py
│ │ ├── monotonic_align/
│ │ │ ├── __init__.py
│ │ │ ├── core.c
│ │ │ ├── core.pyx
│ │ │ └── setup.py
│ │ ├── pylogger.py
│ │ ├── rich_utils.py
│ │ └── utils.py
│ ├── matcha_tts.egg-info/
│ │ ├── PKG-INFO
│ │ ├── SOURCES.txt
│ │ ├── dependency_links.txt
│ │ ├── entry_points.txt
│ │ ├── requires.txt
│ │ └── top_level.txt
│ ├── notebooks/
│ │ └── .gitkeep
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── scripts/
│ │ └── schedule.sh
│ ├── setup.py
│ └── synthesis.ipynb
└── utils/
└── word_utils.py
SYMBOL INDEX (1094 symbols across 65 files)
FILE: api.py
class Settings (line 17) | class Settings(BaseSettings):
class SpeechRequest (line 36) | class SpeechRequest(BaseModel):
function lifespan (line 47) | async def lifespan(app: FastAPI):
function get_models (line 63) | async def get_models(request: Request):
function speach_endpoint (line 78) | async def speach_endpoint(request: Request, payload: SpeechRequest):
FILE: batch_inference.py
function process_batch (line 11) | def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_fo...
function main (line 53) | def main():
FILE: cosyvoice/bin/inference.py
function get_args (line 31) | def get_args():
function main (line 54) | def main():
FILE: cosyvoice/bin/train.py
function get_args (line 38) | def get_args():
function main (line 85) | def main():
FILE: cosyvoice/cli/cosyvoice.py
class CosyVoice (line 21) | class CosyVoice:
method __init__ (line 23) | def __init__(self, model_dir):
method list_avaliable_spks (line 43) | def list_avaliable_spks(self):
method inference_sft (line 47) | def inference_sft(self, tts_text, spk_id):
method inference_zero_shot (line 55) | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
method inference_cross_lingual (line 64) | def inference_cross_lingual(self, tts_text, prompt_speech_16k):
method inference_instruct (line 74) | def inference_instruct(self, tts_text, spk_id, instruct_text):
FILE: cosyvoice/cli/frontend.py
class CosyVoiceFrontEnd (line 36) | class CosyVoiceFrontEnd:
method __init__ (line 38) | def __init__(self,
method _extract_text_token (line 72) | def _extract_text_token(self, text):
method _extract_speech_token (line 78) | def _extract_speech_token(self, speech):
method _extract_spk_embedding (line 86) | def _extract_spk_embedding(self, speech):
method _extract_speech_feat (line 96) | def _extract_speech_feat(self, speech):
method text_normalize (line 102) | def text_normalize(self, text, split=True):
method frontend_sft (line 132) | def frontend_sft(self, tts_text, spk_id):
method frontend_zero_shot (line 138) | def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
method frontend_cross_lingual (line 153) | def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
method frontend_instruct (line 162) | def frontend_instruct(self, tts_text, spk_id, instruct_text):
FILE: cosyvoice/cli/model.py
class CosyVoiceModel (line 16) | class CosyVoiceModel:
method __init__ (line 18) | def __init__(self,
method load (line 27) | def load(self, llm_model, flow_model, hift_model):
method inference (line 35) | def inference(self, text, text_len, flow_embedding, llm_embedding=torc...
FILE: cosyvoice/dataset/dataset.py
class Processor (line 27) | class Processor(IterableDataset):
method __init__ (line 29) | def __init__(self, source, f, *args, **kw):
method set_epoch (line 36) | def set_epoch(self, epoch):
method __iter__ (line 39) | def __iter__(self):
method apply (line 47) | def apply(self, f):
class DistributedSampler (line 52) | class DistributedSampler:
method __init__ (line 54) | def __init__(self, shuffle=True, partition=True):
method update (line 60) | def update(self):
method set_epoch (line 80) | def set_epoch(self, epoch):
method sample (line 83) | def sample(self, data):
class DataList (line 108) | class DataList(IterableDataset):
method __init__ (line 110) | def __init__(self, lists, shuffle=True, partition=True):
method set_epoch (line 114) | def set_epoch(self, epoch):
method __iter__ (line 117) | def __iter__(self):
function Dataset (line 126) | def Dataset(data_list_file,
FILE: cosyvoice/dataset/processor.py
function parquet_opener (line 29) | def parquet_opener(data, mode='train', tts_data={}):
function filter (line 57) | def filter(data,
function resample (line 108) | def resample(data, resample_rate=22050, min_sample_rate=16000, mode='tra...
function compute_fbank (line 136) | def compute_fbank(data,
function parse_embedding (line 159) | def parse_embedding(data, normalize, mode='train'):
function tokenize (line 177) | def tokenize(data, get_tokenizer, allowed_special, mode='train'):
function shuffle (line 196) | def shuffle(data, shuffle_size=10000, mode='train'):
function sort (line 220) | def sort(data, sort_size=500, mode='train'):
function static_batch (line 248) | def static_batch(data, batch_size=16):
function dynamic_batch (line 268) | def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
function batch (line 297) | def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=...
function padding (line 311) | def padding(data, use_spk_embedding, mode='train'):
FILE: cosyvoice/flow/decoder.py
class ConditionalDecoder (line 21) | class ConditionalDecoder(nn.Module):
method __init__ (line 22) | def __init__(
method initialize_weights (line 130) | def initialize_weights(self):
method forward (line 144) | def forward(self, x, mask, mu, t, spks=None, cond=None):
FILE: cosyvoice/flow/flow.py
class MaskedDiffWithXvec (line 24) | class MaskedDiffWithXvec(torch.nn.Module):
method __init__ (line 25) | def __init__(self,
method forward (line 55) | def forward(
method inference (line 100) | def inference(self,
FILE: cosyvoice/flow/flow_matching.py
class ConditionalCFM (line 18) | class ConditionalCFM(BASECFM):
method __init__ (line 19) | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, ...
method forward (line 34) | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, c...
method solve_euler (line 58) | def solve_euler(self, x, t_span, mu, mask, spks, cond):
method compute_loss (line 99) | def compute_loss(self, x1, mask, mu, spks=None, cond=None):
FILE: cosyvoice/flow/length_regulator.py
class InterpolateRegulator (line 20) | class InterpolateRegulator(nn.Module):
method __init__ (line 21) | def __init__(
method forward (line 43) | def forward(self, x, ylens=None):
FILE: cosyvoice/hifigan/f0_predictor.py
class ConvRNNF0Predictor (line 19) | class ConvRNNF0Predictor(nn.Module):
method __init__ (line 20) | def __init__(self,
method forward (line 52) | def forward(self, x: torch.Tensor) -> torch.Tensor:
FILE: cosyvoice/hifigan/generator.py
class ResBlock (line 41) | class ResBlock(torch.nn.Module):
method __init__ (line 43) | def __init__(
method forward (line 89) | def forward(self, x: torch.Tensor) -> torch.Tensor:
method remove_weight_norm (line 98) | def remove_weight_norm(self):
class SineGen (line 103) | class SineGen(torch.nn.Module):
method __init__ (line 119) | def __init__(self, samp_rate, harmonic_num=0,
method _f02uv (line 129) | def _f02uv(self, f0):
method forward (line 135) | def forward(self, f0):
class SourceModuleHnNSF (line 168) | class SourceModuleHnNSF(torch.nn.Module):
method __init__ (line 186) | def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine...
method forward (line 201) | def forward(self, x):
class HiFTGenerator (line 220) | class HiFTGenerator(nn.Module):
method __init__ (line 225) | def __init__(
method _f02source (line 317) | def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
method _stft (line 323) | def _stft(self, x):
method _istft (line 331) | def _istft(self, magnitude, phase):
method forward (line 338) | def forward(self, x: torch.Tensor) -> torch.Tensor:
method remove_weight_norm (line 375) | def remove_weight_norm(self):
method inference (line 390) | def inference(self, mel: torch.Tensor) -> torch.Tensor:
FILE: cosyvoice/llm/llm.py
class TransformerLM (line 24) | class TransformerLM(torch.nn.Module):
method __init__ (line 25) | def __init__(
method encode (line 66) | def encode(
method pad_unpad_sequence (line 76) | def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_...
method forward (line 84) | def forward(
method sampling_ids (line 132) | def sampling_ids(
method inference (line 148) | def inference(
FILE: cosyvoice/transformer/activation.py
class Swish (line 24) | class Swish(torch.nn.Module):
method forward (line 27) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class Snake (line 34) | class Snake(nn.Module):
method __init__ (line 50) | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha...
method forward (line 73) | def forward(self, x):
FILE: cosyvoice/transformer/attention.py
class MultiHeadedAttention (line 26) | class MultiHeadedAttention(nn.Module):
method __init__ (line 36) | def __init__(self,
method forward_qkv (line 53) | def forward_qkv(
method forward_attention (line 82) | def forward_attention(
method forward (line 129) | def forward(
class RelPositionMultiHeadedAttention (line 200) | class RelPositionMultiHeadedAttention(MultiHeadedAttention):
method __init__ (line 209) | def __init__(self,
method rel_shift (line 225) | def rel_shift(self, x):
method forward (line 245) | def forward(
FILE: cosyvoice/transformer/convolution.py
class ConvolutionModule (line 24) | class ConvolutionModule(nn.Module):
method __init__ (line 27) | def __init__(self,
method forward (line 90) | def forward(
FILE: cosyvoice/transformer/decoder.py
class TransformerDecoder (line 33) | class TransformerDecoder(torch.nn.Module):
method __init__ (line 58) | def __init__(
method forward (line 116) | def forward(
method forward_layers (line 169) | def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
method forward_layers_checkpointed (line 178) | def forward_layers_checkpointed(self, x: torch.Tensor,
method forward_one_step (line 187) | def forward_one_step(
method tie_or_clone_weights (line 230) | def tie_or_clone_weights(self, jit_mode: bool = True):
class BiTransformerDecoder (line 256) | class BiTransformerDecoder(torch.nn.Module):
method __init__ (line 276) | def __init__(
method forward (line 332) | def forward(
method forward_one_step (line 367) | def forward_one_step(
method tie_or_clone_weights (line 392) | def tie_or_clone_weights(self, jit_mode: bool = True):
FILE: cosyvoice/transformer/decoder_layer.py
class DecoderLayer (line 22) | class DecoderLayer(nn.Module):
method __init__ (line 41) | def __init__(
method forward (line 62) | def forward(
FILE: cosyvoice/transformer/embedding.py
class PositionalEncoding (line 26) | class PositionalEncoding(torch.nn.Module):
method __init__ (line 37) | def __init__(self,
method forward (line 59) | def forward(self,
method position_encoding (line 79) | def position_encoding(self,
class RelPositionalEncoding (line 120) | class RelPositionalEncoding(PositionalEncoding):
method __init__ (line 129) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5...
method forward (line 133) | def forward(self,
class WhisperPositionalEncoding (line 150) | class WhisperPositionalEncoding(PositionalEncoding):
method __init__ (line 154) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1...
class LearnablePositionalEncoding (line 167) | class LearnablePositionalEncoding(PositionalEncoding):
method __init__ (line 171) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 4...
class NoPositionalEncoding (line 178) | class NoPositionalEncoding(torch.nn.Module):
method __init__ (line 182) | def __init__(self, d_model: int, dropout_rate: float):
method forward (line 187) | def forward(self,
method position_encoding (line 196) | def position_encoding(self, offset: Union[int, torch.Tensor],
class EspnetRelPositionalEncoding (line 201) | class EspnetRelPositionalEncoding(torch.nn.Module):
method __init__ (line 215) | def __init__(self, d_model, dropout_rate, max_len=5000):
method extend_pe (line 224) | def extend_pe(self, x):
method forward (line 256) | def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
method position_encoding (line 271) | def position_encoding(self,
FILE: cosyvoice/transformer/encoder.py
class BaseEncoder (line 37) | class BaseEncoder(torch.nn.Module):
method __init__ (line 39) | def __init__(
method output_size (line 108) | def output_size(self) -> int:
method forward (line 111) | def forward(
method forward_layers (line 165) | def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
method forward_layers_checkpointed (line 173) | def forward_layers_checkpointed(self, xs: torch.Tensor,
method forward_chunk (line 183) | def forward_chunk(
method forward_chunk_by_chunk (line 273) | def forward_chunk_by_chunk(
class TransformerEncoder (line 336) | class TransformerEncoder(BaseEncoder):
method __init__ (line 339) | def __init__(
class ConformerEncoder (line 385) | class ConformerEncoder(BaseEncoder):
method __init__ (line 388) | def __init__(
FILE: cosyvoice/transformer/encoder_layer.py
class TransformerEncoderLayer (line 24) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 40) | def __init__(
method forward (line 58) | def forward(
class ConformerEncoderLayer (line 109) | class ConformerEncoderLayer(nn.Module):
method __init__ (line 129) | def __init__(
method forward (line 160) | def forward(
FILE: cosyvoice/transformer/label_smoothing_loss.py
class LabelSmoothingLoss (line 21) | class LabelSmoothingLoss(nn.Module):
method __init__ (line 54) | def __init__(self,
method forward (line 68) | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
FILE: cosyvoice/transformer/positionwise_feed_forward.py
class PositionwiseFeedForward (line 20) | class PositionwiseFeedForward(torch.nn.Module):
method __init__ (line 33) | def __init__(
method forward (line 47) | def forward(self, xs: torch.Tensor) -> torch.Tensor:
class MoEFFNLayer (line 58) | class MoEFFNLayer(torch.nn.Module):
method __init__ (line 75) | def __init__(
method forward (line 91) | def forward(self, xs: torch.Tensor) -> torch.Tensor:
FILE: cosyvoice/transformer/subsampling.py
class BaseSubsampling (line 23) | class BaseSubsampling(torch.nn.Module):
method __init__ (line 25) | def __init__(self):
method position_encoding (line 30) | def position_encoding(self, offset: Union[int, torch.Tensor],
class EmbedinigNoSubsampling (line 35) | class EmbedinigNoSubsampling(BaseSubsampling):
method __init__ (line 39) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 45) | def forward(
class LinearNoSubsampling (line 69) | class LinearNoSubsampling(BaseSubsampling):
method __init__ (line 79) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 92) | def forward(
class Conv1dSubsampling2 (line 116) | class Conv1dSubsampling2(BaseSubsampling):
method __init__ (line 128) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 145) | def forward(
class Conv2dSubsampling4 (line 173) | class Conv2dSubsampling4(BaseSubsampling):
method __init__ (line 183) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 202) | def forward(
class Conv2dSubsampling6 (line 230) | class Conv2dSubsampling6(BaseSubsampling):
method __init__ (line 239) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 256) | def forward(
class Conv2dSubsampling8 (line 282) | class Conv2dSubsampling8(BaseSubsampling):
method __init__ (line 292) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 311) | def forward(
class LegacyLinearNoSubsampling (line 338) | class LegacyLinearNoSubsampling(BaseSubsampling):
method __init__ (line 348) | def __init__(self, idim: int, odim: int, dropout_rate: float,
method forward (line 362) | def forward(
FILE: cosyvoice/utils/common.py
function pad_list (line 25) | def pad_list(xs: List[torch.Tensor], pad_value: int):
function th_accuracy (line 74) | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
function get_padding (line 96) | def get_padding(kernel_size, dilation=1):
function init_weights (line 100) | def init_weights(m, mean=0.0, std=0.01):
FILE: cosyvoice/utils/executor.py
class Executor (line 26) | class Executor:
method __init__ (line 28) | def __init__(self):
method train_one_epoc (line 34) | def train_one_epoc(self, model, optimizer, scheduler, train_data_loade...
method cv (line 83) | def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=Tr...
FILE: cosyvoice/utils/file_utils.py
function read_lists (line 20) | def read_lists(list_file):
function read_json_lists (line 27) | def read_json_lists(list_file):
function load_wav (line 35) | def load_wav(wav, target_sr):
function speed_change (line 43) | def speed_change(waveform, sample_rate, speed_factor: str):
FILE: cosyvoice/utils/frontend_utils.py
function contains_chinese (line 19) | def contains_chinese(text):
function replace_corner_mark (line 24) | def replace_corner_mark(text):
function remove_bracket (line 31) | def remove_bracket(text):
function spell_out_number (line 40) | def spell_out_number(text: str, inflect_parser):
function split_paragraph (line 63) | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, toke...
function replace_blank (line 116) | def replace_blank(text: str):
FILE: cosyvoice/utils/mask.py
function subsequent_mask (line 53) | def subsequent_mask(
function subsequent_chunk_mask (line 89) | def subsequent_chunk_mask(
function add_optional_chunk_mask (line 127) | def add_optional_chunk_mask(xs: torch.Tensor,
function make_pad_mask (line 201) | def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
FILE: cosyvoice/utils/scheduler.py
class WarmupLR (line 27) | class WarmupLR(_LRScheduler):
method __init__ (line 44) | def __init__(
method __repr__ (line 56) | def __repr__(self):
method get_lr (line 59) | def get_lr(self):
method set_step (line 70) | def set_step(self, step: int):
class WarmupPolicy (line 74) | class WarmupPolicy(_LRScheduler):
method __init__ (line 84) | def __init__(self,
method get_lr (line 110) | def get_lr(self):
method _get_warmup_lr (line 128) | def _get_warmup_lr(self, step):
method _get_lr (line 132) | def _get_lr(self, step):
class SquareRootConstantPolicy (line 137) | class SquareRootConstantPolicy(_LRScheduler):
method __init__ (line 147) | def __init__(self,
method get_lr (line 175) | def get_lr(self):
method _get_lr (line 193) | def _get_lr(self, step):
class WarmupHoldPolicy (line 198) | class WarmupHoldPolicy(WarmupPolicy):
method __init__ (line 212) | def __init__(
method get_lr (line 257) | def get_lr(self):
class WarmupAnnealHoldPolicy (line 282) | class WarmupAnnealHoldPolicy(_LRScheduler):
method __init__ (line 295) | def __init__(
method get_lr (line 340) | def get_lr(self):
method _get_warmup_lr (line 365) | def _get_warmup_lr(self, step):
method _get_constant_lr (line 369) | def _get_constant_lr(self, step):
method _get_lr (line 372) | def _get_lr(self, step):
function _squareroot_annealing (line 377) | def _squareroot_annealing(initial_lr, step, max_steps, min_lr):
function _square_annealing (line 384) | def _square_annealing(initial_lr, step, max_steps, min_lr):
function _cosine_annealing (line 391) | def _cosine_annealing(initial_lr, step, max_steps, min_lr):
function _linear_warmup_with_cosine_annealing (line 397) | def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step,
function _poly_decay (line 421) | def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle):
function _noam_hold_annealing (line 433) | def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps,
class SquareAnnealing (line 444) | class SquareAnnealing(WarmupPolicy):
method __init__ (line 446) | def __init__(self,
method _get_lr (line 459) | def _get_lr(self, step):
class SquareRootAnnealing (line 471) | class SquareRootAnnealing(WarmupPolicy):
method __init__ (line 473) | def __init__(self,
method _get_lr (line 486) | def _get_lr(self, step):
class CosineAnnealing (line 497) | class CosineAnnealing(WarmupAnnealHoldPolicy):
method __init__ (line 499) | def __init__(self,
method _get_lr (line 512) | def _get_lr(self, step):
method _get_warmup_lr (line 532) | def _get_warmup_lr(self, step):
method _get_constant_lr (line 539) | def _get_constant_lr(self, step):
method _get_linear_warmup_with_cosine_annealing_lr (line 543) | def _get_linear_warmup_with_cosine_annealing_lr(self, step):
class NoamAnnealing (line 558) | class NoamAnnealing(_LRScheduler):
method __init__ (line 560) | def __init__(self,
method get_lr (line 589) | def get_lr(self):
method _noam_annealing (line 611) | def _noam_annealing(self, initial_lr, step):
class NoamHoldAnnealing (line 624) | class NoamHoldAnnealing(WarmupHoldPolicy):
method __init__ (line 626) | def __init__(self,
method _get_lr (line 694) | def _get_lr(self, step):
method set_step (line 716) | def set_step(self, step: int):
class ConstantLR (line 720) | class ConstantLR(_LRScheduler):
method __init__ (line 727) | def __init__(
method get_lr (line 735) | def get_lr(self):
method set_step (line 738) | def set_step(self, step: int):
FILE: cosyvoice/utils/train_utils.py
function init_distributed (line 40) | def init_distributed(args):
function init_dataset_and_dataloader (line 54) | def init_dataset_and_dataloader(args, configs):
function check_modify_and_save_config (line 73) | def check_modify_and_save_config(args, configs):
function wrap_cuda_model (line 94) | def wrap_cuda_model(args, model):
function init_optimizer_and_scheduler (line 111) | def init_optimizer_and_scheduler(args, configs, model):
function init_summarywriter (line 145) | def init_summarywriter(args):
function save_model (line 153) | def save_model(model, model_name, info_dict):
function cosyvoice_join (line 175) | def cosyvoice_join(group_join, info_dict):
function batch_forward (line 196) | def batch_forward(model, batch, info_dict):
function batch_backward (line 217) | def batch_backward(model, info_dict):
function update_parameter_and_lr (line 228) | def update_parameter_and_lr(model, optimizer, scheduler, info_dict):
function log_per_step (line 245) | def log_per_step(writer, info_dict):
function log_per_save (line 274) | def log_per_save(writer, info_dict):
FILE: single_inference.py
class CustomCosyVoiceFrontEnd (line 29) | class CustomCosyVoiceFrontEnd(CosyVoiceFrontEnd):
method text_normalize_new (line 30) | def text_normalize_new(self,text, split=False):
method frontend_zero_shot (line 112) | def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
method frontend_zero_shot_dual (line 127) | def frontend_zero_shot_dual(self, tts_text, prompt_text, prompt_speech...
class CustomCosyVoiceModel (line 150) | class CustomCosyVoiceModel(CosyVoiceModel):
method __init__ (line 152) | def __init__(self,
method load (line 161) | def load(self, llm_model, flow_model, hift_model):
method inference (line 169) | def inference(self, text, text_len, flow_embedding, llm_embedding=torc...
class CustomCosyVoice (line 200) | class CustomCosyVoice:
method __init__ (line 202) | def __init__(self, model_dir):
method list_avaliable_spks (line 227) | def list_avaliable_spks(self):
method inference_sft (line 231) | def inference_sft(self, tts_text, spk_id):
method inference_zero_shot (line 239) | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
method inference_zero_shot_no_unit_condition_no_normalize (line 248) | def inference_zero_shot_no_unit_condition_no_normalize(self, tts_text,...
method inference_zero_shot_no_normalize (line 266) | def inference_zero_shot_no_normalize(self, tts_text, prompt_text, prom...
function transcribe_audio (line 279) | def transcribe_audio(audio_file):
function get_bopomofo_rare (line 294) | def get_bopomofo_rare(text, converter):
function parse_transcript (line 327) | def parse_transcript(text, end):
function single_inference (line 356) | def single_inference(speaker_prompt_audio_path, content_to_synthesize, o...
function main (line 391) | def main():
FILE: third_party/Matcha-TTS/matcha/app.py
function MATCHA_TTS_LOC (line 33) | def MATCHA_TTS_LOC(x):
function VOCODER_LOC (line 37) | def VOCODER_LOC(x):
function load_model (line 66) | def load_model(model_name, vocoder_name):
function load_model_ui (line 72) | def load_model_ui(model_type, textbox):
function process_text_gradio (line 102) | def process_text_gradio(text):
function synthesise_mel (line 108) | def synthesise_mel(text, text_length, n_timesteps, temperature, length_s...
function multispeaker_example_cacher (line 125) | def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scal...
function ljspeech_example_cacher (line 137) | def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, s...
function main (line 149) | def main():
FILE: third_party/Matcha-TTS/matcha/cli.py
function plot_spectrogram_to_numpy (line 37) | def plot_spectrogram_to_numpy(spectrogram, filename):
function process_text (line 48) | def process_text(i: int, text: str, device: torch.device):
function get_texts (line 62) | def get_texts(args):
function assert_required_models_available (line 71) | def assert_required_models_available(args):
function load_hifigan (line 84) | def load_hifigan(checkpoint_path, device):
function load_vocoder (line 93) | def load_vocoder(vocoder_name, checkpoint_path, device):
function load_matcha (line 108) | def load_matcha(model_name, checkpoint_path, device):
function to_waveform (line 117) | def to_waveform(mel, vocoder, denoiser=None):
function save_to_folder (line 125) | def save_to_folder(filename: str, output: dict, folder: str):
function validate_args (line 134) | def validate_args(args):
function validate_args_for_multispeaker_model (line 163) | def validate_args_for_multispeaker_model(args):
function validate_args_for_single_speaker_model (line 188) | def validate_args_for_single_speaker_model(args):
function cli (line 208) | def cli():
class BatchedSynthesisDataset (line 292) | class BatchedSynthesisDataset(torch.utils.data.Dataset):
method __init__ (line 293) | def __init__(self, processed_texts):
method __len__ (line 296) | def __len__(self):
method __getitem__ (line 299) | def __getitem__(self, idx):
function batched_collate_fn (line 303) | def batched_collate_fn(batch):
function batched_synthesis (line 316) | def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
function unbatched_synthesis (line 358) | def unbatched_synthesis(args, device, model, vocoder, denoiser, texts, s...
function print_config (line 397) | def print_config(args):
function get_device (line 407) | def get_device(args):
FILE: third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py
function parse_filelist (line 15) | def parse_filelist(filelist_path, split_char="|"):
class TextMelDataModule (line 21) | class TextMelDataModule(LightningDataModule):
method __init__ (line 22) | def __init__( # pylint: disable=unused-argument
method setup (line 49) | def setup(self, stage: Optional[str] = None): # pylint: disable=unuse...
method train_dataloader (line 88) | def train_dataloader(self):
method val_dataloader (line 98) | def val_dataloader(self):
method teardown (line 108) | def teardown(self, stage: Optional[str] = None):
method state_dict (line 112) | def state_dict(self): # pylint: disable=no-self-use
method load_state_dict (line 116) | def load_state_dict(self, state_dict: Dict[str, Any]):
class TextMelDataset (line 121) | class TextMelDataset(torch.utils.data.Dataset):
method __init__ (line 122) | def __init__(
method get_datapoint (line 156) | def get_datapoint(self, filepath_and_text):
method get_mel (line 172) | def get_mel(self, filepath):
method get_text (line 189) | def get_text(self, text, add_blank=True):
method __getitem__ (line 196) | def __getitem__(self, index):
method __len__ (line 200) | def __len__(self):
class TextMelBatchCollate (line 204) | class TextMelBatchCollate:
method __init__ (line 205) | def __init__(self, n_spks):
method __call__ (line 208) | def __call__(self, batch):
FILE: third_party/Matcha-TTS/matcha/hifigan/denoiser.py
class Denoiser (line 7) | class Denoiser(torch.nn.Module):
method __init__ (line 10) | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_lengt...
method forward (line 59) | def forward(self, audio, strength=0.0005):
FILE: third_party/Matcha-TTS/matcha/hifigan/env.py
class AttrDict (line 7) | class AttrDict(dict):
method __init__ (line 8) | def __init__(self, *args, **kwargs):
function build_env (line 13) | def build_env(config, config_name, path):
FILE: third_party/Matcha-TTS/matcha/hifigan/meldataset.py
function load_wav (line 17) | def load_wav(full_path):
function dynamic_range_compression (line 22) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
function dynamic_range_decompression (line 26) | def dynamic_range_decompression(x, C=1):
function dynamic_range_compression_torch (line 30) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
function dynamic_range_decompression_torch (line 34) | def dynamic_range_decompression_torch(x, C=1):
function spectral_normalize_torch (line 38) | def spectral_normalize_torch(magnitudes):
function spectral_de_normalize_torch (line 43) | def spectral_de_normalize_torch(magnitudes):
function mel_spectrogram (line 52) | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_siz...
function get_dataset_filelist (line 92) | def get_dataset_filelist(a):
class MelDataset (line 105) | class MelDataset(torch.utils.data.Dataset):
method __init__ (line 106) | def __init__(
method __getitem__ (line 146) | def __getitem__(self, index):
method __len__ (line 216) | def __len__(self):
FILE: third_party/Matcha-TTS/matcha/hifigan/models.py
class ResBlock1 (line 14) | class ResBlock1(torch.nn.Module):
method __init__ (line 15) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
method forward (line 90) | def forward(self, x):
method remove_weight_norm (line 99) | def remove_weight_norm(self):
class ResBlock2 (line 106) | class ResBlock2(torch.nn.Module):
method __init__ (line 107) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
method forward (line 136) | def forward(self, x):
method remove_weight_norm (line 143) | def remove_weight_norm(self):
class Generator (line 148) | class Generator(torch.nn.Module):
method __init__ (line 149) | def __init__(self, h):
method forward (line 181) | def forward(self, x):
method remove_weight_norm (line 199) | def remove_weight_norm(self):
class DiscriminatorP (line 209) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 210) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
method forward (line 225) | def forward(self, x):
class MultiPeriodDiscriminator (line 247) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 248) | def __init__(self):
method forward (line 260) | def forward(self, y, y_hat):
class DiscriminatorS (line 276) | class DiscriminatorS(torch.nn.Module):
method __init__ (line 277) | def __init__(self, use_spectral_norm=False):
method forward (line 293) | def forward(self, x):
class MultiScaleDiscriminator (line 306) | class MultiScaleDiscriminator(torch.nn.Module):
method __init__ (line 307) | def __init__(self):
method forward (line 318) | def forward(self, y, y_hat):
function feature_loss (line 337) | def feature_loss(fmap_r, fmap_g):
function discriminator_loss (line 346) | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
function generator_loss (line 360) | def generator_loss(disc_outputs):
FILE: third_party/Matcha-TTS/matcha/hifigan/xutils.py
function plot_spectrogram (line 14) | def plot_spectrogram(spectrogram):
function init_weights (line 25) | def init_weights(m, mean=0.0, std=0.01):
function apply_weight_norm (line 31) | def apply_weight_norm(m):
function get_padding (line 37) | def get_padding(kernel_size, dilation=1):
function load_checkpoint (line 41) | def load_checkpoint(filepath, device):
function save_checkpoint (line 49) | def save_checkpoint(filepath, obj):
function scan_checkpoint (line 55) | def scan_checkpoint(cp_dir, prefix):
FILE: third_party/Matcha-TTS/matcha/models/baselightningmodule.py
class BaseLightningClass (line 19) | class BaseLightningClass(LightningModule, ABC):
method update_data_statistics (line 20) | def update_data_statistics(self, data_statistics):
method configure_optimizers (line 30) | def configure_optimizers(self) -> Any:
method get_losses (line 56) | def get_losses(self, batch):
method on_load_checkpoint (line 75) | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
method training_step (line 78) | def training_step(self, batch: Any, batch_idx: int):
method validation_step (line 127) | def validation_step(self, batch: Any, batch_idx: int):
method on_validation_end (line 167) | def on_validation_end(self) -> None:
method on_before_optimizer_step (line 208) | def on_before_optimizer_step(self, optimizer):
FILE: third_party/Matcha-TTS/matcha/models/components/decoder.py
class SinusoidalPosEmb (line 14) | class SinusoidalPosEmb(torch.nn.Module):
method __init__ (line 15) | def __init__(self, dim):
method forward (line 20) | def forward(self, x, scale=1000):
class Block1D (line 32) | class Block1D(torch.nn.Module):
method __init__ (line 33) | def __init__(self, dim, dim_out, groups=8):
method forward (line 41) | def forward(self, x, mask):
class ResnetBlock1D (line 46) | class ResnetBlock1D(torch.nn.Module):
method __init__ (line 47) | def __init__(self, dim, dim_out, time_emb_dim, groups=8):
method forward (line 56) | def forward(self, x, mask, time_emb):
class Downsample1D (line 64) | class Downsample1D(nn.Module):
method __init__ (line 65) | def __init__(self, dim):
method forward (line 69) | def forward(self, x):
class TimestepEmbedding (line 73) | class TimestepEmbedding(nn.Module):
method __init__ (line 74) | def __init__(
method forward (line 105) | def forward(self, sample, condition=None):
class Upsample1D (line 120) | class Upsample1D(nn.Module):
method __init__ (line 134) | def __init__(self, channels, use_conv=False, use_conv_transpose=True, ...
method forward (line 148) | def forward(self, inputs):
class ConformerWrapper (line 161) | class ConformerWrapper(ConformerBlock):
method __init__ (line 162) | def __init__( # pylint: disable=useless-super-delegation
method forward (line 189) | def forward(
class Decoder (line 200) | class Decoder(nn.Module):
method __init__ (line 201) | def __init__(
method get_block (line 319) | def get_block(block_type, dim, attention_head_dim, num_heads, dropout,...
method initialize_weights (line 345) | def initialize_weights(self):
method forward (line 363) | def forward(self, x, mask, mu, t, spks=None, cond=None):
FILE: third_party/Matcha-TTS/matcha/models/components/flow_matching.py
class BASECFM (line 12) | class BASECFM(torch.nn.Module, ABC):
method __init__ (line 13) | def __init__(
method forward (line 33) | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, c...
method solve_euler (line 55) | def solve_euler(self, x, t_span, mu, mask, spks, cond):
method compute_loss (line 87) | def compute_loss(self, x1, mask, mu, spks=None, cond=None):
class CFM (line 121) | class CFM(BASECFM):
method __init__ (line 122) | def __init__(self, in_channels, out_channel, cfm_params, decoder_param...
FILE: third_party/Matcha-TTS/matcha/models/components/text_encoder.py
class LayerNorm (line 15) | class LayerNorm(nn.Module):
method __init__ (line 16) | def __init__(self, channels, eps=1e-4):
method forward (line 24) | def forward(self, x):
class ConvReluNorm (line 36) | class ConvReluNorm(nn.Module):
method __init__ (line 37) | def __init__(self, in_channels, hidden_channels, out_channels, kernel_...
method forward (line 60) | def forward(self, x, x_mask):
class DurationPredictor (line 70) | class DurationPredictor(nn.Module):
method __init__ (line 71) | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
method forward (line 84) | def forward(self, x, x_mask):
class RotaryPositionalEmbeddings (line 97) | class RotaryPositionalEmbeddings(nn.Module):
method __init__ (line 107) | def __init__(self, d: int, base: int = 10_000):
method _build_cache (line 119) | def _build_cache(self, x: torch.Tensor):
method _neg_half (line 147) | def _neg_half(self, x: torch.Tensor):
method forward (line 154) | def forward(self, x: torch.Tensor):
class MultiHeadAttention (line 175) | class MultiHeadAttention(nn.Module):
method __init__ (line 176) | def __init__(
method forward (line 216) | def forward(self, x, c, attn_mask=None):
method attention (line 226) | def attention(self, query, key, value, mask=None):
method _attention_bias_proximal (line 249) | def _attention_bias_proximal(length):
class FFN (line 255) | class FFN(nn.Module):
method __init__ (line 256) | def __init__(self, in_channels, out_channels, filter_channels, kernel_...
method forward (line 268) | def forward(self, x, x_mask):
class Encoder (line 276) | class Encoder(nn.Module):
method __init__ (line 277) | def __init__(
method forward (line 314) | def forward(self, x, x_mask):
class TextEncoder (line 328) | class TextEncoder(nn.Module):
method __init__ (line 329) | def __init__(
method forward (line 378) | def forward(self, x, x_lengths, spks=None):
FILE: third_party/Matcha-TTS/matcha/models/components/transformer.py
class SnakeBeta (line 17) | class SnakeBeta(nn.Module):
method __init__ (line 35) | def __init__(self, in_features, out_features, alpha=1.0, alpha_trainab...
method forward (line 64) | def forward(self, x):
class FeedForward (line 83) | class FeedForward(nn.Module):
method __init__ (line 96) | def __init__(
method forward (line 131) | def forward(self, hidden_states):
class BasicTransformerBlock (line 138) | class BasicTransformerBlock(nn.Module):
method __init__ (line 159) | def __init__(
method set_chunk_feed_forward (line 238) | def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
method forward (line 243) | def forward(
FILE: third_party/Matcha-TTS/matcha/models/matcha_tts.py
class MatchaTTS (line 23) | class MatchaTTS(BaseLightningClass): # 🍵
method __init__ (line 24) | def __init__(
method synthesise (line 74) | def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, spks=...
method forward (line 150) | def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None...
FILE: third_party/Matcha-TTS/matcha/onnx/export.py
class MatchaWithVocoder (line 22) | class MatchaWithVocoder(LightningModule):
method __init__ (line 23) | def __init__(self, matcha, vocoder):
method forward (line 28) | def forward(self, x, x_lengths, scales, spks=None):
function get_exportable_module (line 35) | def get_exportable_module(matcha, vocoder, n_timesteps):
function get_inputs (line 63) | def get_inputs(is_multi_speaker):
function main (line 91) | def main():
FILE: third_party/Matcha-TTS/matcha/onnx/infer.py
function validate_args (line 15) | def validate_args(args):
function write_wavs (line 24) | def write_wavs(model, inputs, output_dir, external_vocoder=None):
function write_mels (line 66) | def write_mels(model, inputs, output_dir):
function main (line 85) | def main():
FILE: third_party/Matcha-TTS/matcha/text/__init__.py
function text_to_sequence (line 10) | def text_to_sequence(text, cleaner_names):
function cleaned_text_to_sequence (line 27) | def cleaned_text_to_sequence(cleaned_text):
function sequence_to_text (line 38) | def sequence_to_text(sequence):
function _clean_text (line 47) | def _clean_text(text, cleaner_names):
FILE: third_party/Matcha-TTS/matcha/text/cleaners.py
function expand_abbreviations (line 66) | def expand_abbreviations(text):
function lowercase (line 72) | def lowercase(text):
function collapse_whitespace (line 76) | def collapse_whitespace(text):
function convert_to_ascii (line 80) | def convert_to_ascii(text):
function basic_cleaners (line 84) | def basic_cleaners(text):
function transliteration_cleaners (line 91) | def transliteration_cleaners(text):
function english_cleaners2 (line 99) | def english_cleaners2(text):
function english_cleaners_piper (line 109) | def english_cleaners_piper(text):
FILE: third_party/Matcha-TTS/matcha/text/numbers.py
function _remove_commas (line 16) | def _remove_commas(m):
function _expand_decimal_point (line 20) | def _expand_decimal_point(m):
function _expand_dollars (line 24) | def _expand_dollars(m):
function _expand_ordinal (line 45) | def _expand_ordinal(m):
function _expand_number (line 49) | def _expand_number(m):
function normalize_numbers (line 64) | def normalize_numbers(text):
FILE: third_party/Matcha-TTS/matcha/train.py
function train (line 35) | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
function main (line 101) | def main(cfg: DictConfig) -> Optional[float]:
FILE: third_party/Matcha-TTS/matcha/utils/audio.py
function load_wav (line 10) | def load_wav(full_path):
function dynamic_range_compression (line 15) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
function dynamic_range_decompression (line 19) | def dynamic_range_decompression(x, C=1):
function dynamic_range_compression_torch (line 23) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
function dynamic_range_decompression_torch (line 27) | def dynamic_range_decompression_torch(x, C=1):
function spectral_normalize_torch (line 31) | def spectral_normalize_torch(magnitudes):
function spectral_de_normalize_torch (line 36) | def spectral_de_normalize_torch(magnitudes):
function mel_spectrogram (line 45) | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_siz...
FILE: third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py
function compute_data_statistics (line 25) | def compute_data_statistics(data_loader: torch.utils.data.DataLoader, ou...
function main (line 50) | def main():
FILE: third_party/Matcha-TTS/matcha/utils/instantiators.py
function instantiate_callbacks (line 13) | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:
function instantiate_loggers (line 36) | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]:
FILE: third_party/Matcha-TTS/matcha/utils/logging_utils.py
function log_hyperparameters (line 12) | def log_hyperparameters(object_dict: Dict[str, Any]) -> None:
FILE: third_party/Matcha-TTS/matcha/utils/model.py
function sequence_mask (line 7) | def sequence_mask(length, max_length=None):
function fix_len_compatibility (line 14) | def fix_len_compatibility(length, num_downsamplings_in_unet=2):
function convert_pad_shape (line 23) | def convert_pad_shape(pad_shape):
function generate_path (line 29) | def generate_path(duration, mask):
function duration_loss (line 44) | def duration_loss(logw, logw_, lengths):
function normalize (line 49) | def normalize(data, mu, std):
function denormalize (line 71) | def denormalize(data, mu, std):
FILE: third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py
function maximum_path (line 7) | def maximum_path(value, mask):
FILE: third_party/Matcha-TTS/matcha/utils/monotonic_align/core.c
function CYTHON_INLINE (line 399) | static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int k, int l,...
type PyObject (line 484) | typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *co...
type PyObject (line 485) | typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, ...
type Py_tss_t (line 526) | typedef int Py_tss_t;
function CYTHON_INLINE (line 527) | static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
function CYTHON_INLINE (line 531) | static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
function CYTHON_INLINE (line 536) | static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
function CYTHON_INLINE (line 539) | static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
function CYTHON_INLINE (line 542) | static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
function CYTHON_INLINE (line 546) | static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
function CYTHON_INLINE (line 549) | static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
type Py_hash_t (line 694) | typedef long Py_hash_t;
type __Pyx_PyAsyncMethodsStruct (line 717) | typedef struct {
function CYTHON_INLINE (line 733) | static CYTHON_INLINE float __PYX_NAN() {
type __Pyx_StringTabEntry (line 782) | typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const c...
function CYTHON_INLINE (line 803) | static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t l...
function CYTHON_INLINE (line 852) | static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) {
function __Pyx_init_sys_getdefaultencoding_params (line 885) | static int __Pyx_init_sys_getdefaultencoding_params(void) {
function __Pyx_init_sys_getdefaultencoding_params (line 935) | static int __Pyx_init_sys_getdefaultencoding_params(void) {
function CYTHON_INLINE (line 967) | static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void...
type __pyx_memoryview_obj (line 1018) | struct __pyx_memoryview_obj
type __Pyx_memviewslice (line 1019) | typedef struct {
type __pyx_atomic_int_type (line 1060) | typedef volatile __pyx_atomic_int_type __pyx_atomic_int;
type __Pyx_StructField_ (line 1080) | struct __Pyx_StructField_
type __Pyx_TypeInfo (line 1082) | typedef struct {
type __Pyx_StructField (line 1092) | typedef struct __Pyx_StructField_ {
type __Pyx_BufFmt_StackElem (line 1097) | typedef struct {
type __Pyx_BufFmt_Context (line 1101) | typedef struct {
type npy_int8 (line 1122) | typedef npy_int8 __pyx_t_5numpy_int8_t;
type npy_int16 (line 1131) | typedef npy_int16 __pyx_t_5numpy_int16_t;
type npy_int32 (line 1140) | typedef npy_int32 __pyx_t_5numpy_int32_t;
type npy_int64 (line 1149) | typedef npy_int64 __pyx_t_5numpy_int64_t;
type npy_uint8 (line 1158) | typedef npy_uint8 __pyx_t_5numpy_uint8_t;
type npy_uint16 (line 1167) | typedef npy_uint16 __pyx_t_5numpy_uint16_t;
type npy_uint32 (line 1176) | typedef npy_uint32 __pyx_t_5numpy_uint32_t;
type npy_uint64 (line 1185) | typedef npy_uint64 __pyx_t_5numpy_uint64_t;
type npy_float32 (line 1194) | typedef npy_float32 __pyx_t_5numpy_float32_t;
type npy_float64 (line 1203) | typedef npy_float64 __pyx_t_5numpy_float64_t;
type npy_long (line 1212) | typedef npy_long __pyx_t_5numpy_int_t;
type npy_longlong (line 1221) | typedef npy_longlong __pyx_t_5numpy_long_t;
type npy_longlong (line 1230) | typedef npy_longlong __pyx_t_5numpy_longlong_t;
type npy_ulong (line 1239) | typedef npy_ulong __pyx_t_5numpy_uint_t;
type npy_ulonglong (line 1248) | typedef npy_ulonglong __pyx_t_5numpy_ulong_t;
type npy_ulonglong (line 1257) | typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
type npy_intp (line 1266) | typedef npy_intp __pyx_t_5numpy_intp_t;
type npy_uintp (line 1275) | typedef npy_uintp __pyx_t_5numpy_uintp_t;
type npy_double (line 1284) | typedef npy_double __pyx_t_5numpy_float_t;
type npy_double (line 1293) | typedef npy_double __pyx_t_5numpy_double_t;
type npy_longdouble (line 1302) | typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
type std (line 1306) | typedef ::std::complex< float > __pyx_t_float_complex;
type __pyx_t_float_complex (line 1308) | typedef float _Complex __pyx_t_float_complex;
type __pyx_t_float_complex (line 1311) | typedef struct { float real, imag; } __pyx_t_float_complex;
type std (line 1318) | typedef ::std::complex< double > __pyx_t_double_complex;
type __pyx_t_double_complex (line 1320) | typedef double _Complex __pyx_t_double_complex;
type __pyx_t_double_complex (line 1323) | typedef struct { double real, imag; } __pyx_t_double_complex;
type __pyx_array_obj (line 1329) | struct __pyx_array_obj
type __pyx_MemviewEnum_obj (line 1330) | struct __pyx_MemviewEnum_obj
type __pyx_memoryview_obj (line 1331) | struct __pyx_memoryview_obj
type __pyx_memoryviewslice_obj (line 1332) | struct __pyx_memoryviewslice_obj
type npy_cfloat (line 1341) | typedef npy_cfloat __pyx_t_5numpy_cfloat_t;
type npy_cdouble (line 1350) | typedef npy_cdouble __pyx_t_5numpy_cdouble_t;
type npy_clongdouble (line 1359) | typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
type npy_cdouble (line 1368) | typedef npy_cdouble __pyx_t_5numpy_complex_t;
type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 1369) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c
type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 1378) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_pat...
type __pyx_array_obj (line 1390) | struct __pyx_array_obj {
type __pyx_MemviewEnum_obj (line 1415) | struct __pyx_MemviewEnum_obj {
type __pyx_memoryview_obj (line 1428) | struct __pyx_memoryview_obj {
type __pyx_memoryviewslice_obj (line 1451) | struct __pyx_memoryviewslice_obj {
type __pyx_vtabstruct_array (line 1469) | struct __pyx_vtabstruct_array {
type __pyx_vtabstruct_array (line 1472) | struct __pyx_vtabstruct_array
type __pyx_vtabstruct_memoryview (line 1483) | struct __pyx_vtabstruct_memoryview {
type __pyx_vtabstruct_memoryview (line 1492) | struct __pyx_vtabstruct_memoryview
type __pyx_vtabstruct__memoryviewslice (line 1503) | struct __pyx_vtabstruct__memoryviewslice {
type __pyx_vtabstruct__memoryviewslice (line 1506) | struct __pyx_vtabstruct__memoryviewslice
type __Pyx_RefNannyAPIStruct (line 1514) | typedef struct {
type __pyx_memoryview_obj (line 1593) | struct __pyx_memoryview_obj
type __pyx_array_obj (line 1780) | struct __pyx_array_obj
function CYTHON_INLINE (line 1814) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16(const char *s...
function CYTHON_INLINE (line 1818) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16LE(const char ...
function CYTHON_INLINE (line 1822) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16BE(const char ...
function CYTHON_INLINE (line 1922) | static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject*...
function CYTHON_INLINE (line 1946) | static CYTHON_INLINE int __Pyx_PyList_Extend(PyObject* L, PyObject* v) {
function CYTHON_INLINE (line 1960) | static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) {
function CYTHON_INLINE (line 1979) | static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyO...
type __Pyx_ImportType_CheckSize_0_29_35 (line 2024) | enum __Pyx_ImportType_CheckSize_0_29_35 {
type __Pyx_ImportType_CheckSize_0_29_35 (line 2029) | enum __Pyx_ImportType_CheckSize_0_29_35
type __Pyx_CodeObjectCacheEntry (line 2040) | typedef struct {
type __Pyx_CodeObjectCache (line 2044) | struct __Pyx_CodeObjectCache {
type __Pyx_CodeObjectCache (line 2049) | struct __Pyx_CodeObjectCache
type __Pyx_Buf_DimInfo (line 2068) | typedef struct {
type __Pyx_Buffer (line 2071) | typedef struct {
type __Pyx_LocalBuf_ND (line 2075) | typedef struct {
type __pyx_array_obj (line 2255) | struct __pyx_array_obj
type __pyx_memoryview_obj (line 2256) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2257) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2258) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2259) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2259) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2260) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2261) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2262) | struct __pyx_memoryview_obj
type __pyx_memoryviewslice_obj (line 2263) | struct __pyx_memoryviewslice_obj
type __pyx_memoryviewslice_obj (line 2264) | struct __pyx_memoryviewslice_obj
type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 2321) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c
type __pyx_array_obj (line 2322) | struct __pyx_array_obj
type __pyx_memoryview_obj (line 2328) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2328) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2333) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2334) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2335) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2336) | struct __pyx_memoryview_obj
type __pyx_MemviewEnum_obj (line 2354) | struct __pyx_MemviewEnum_obj
type __pyx_array_obj (line 2562) | struct __pyx_array_obj
type __pyx_array_obj (line 2563) | struct __pyx_array_obj
type __pyx_array_obj (line 2564) | struct __pyx_array_obj
type __pyx_array_obj (line 2565) | struct __pyx_array_obj
type __pyx_array_obj (line 2566) | struct __pyx_array_obj
type __pyx_array_obj (line 2567) | struct __pyx_array_obj
type __pyx_array_obj (line 2568) | struct __pyx_array_obj
type __pyx_array_obj (line 2569) | struct __pyx_array_obj
type __pyx_MemviewEnum_obj (line 2572) | struct __pyx_MemviewEnum_obj
type __pyx_MemviewEnum_obj (line 2573) | struct __pyx_MemviewEnum_obj
type __pyx_MemviewEnum_obj (line 2574) | struct __pyx_MemviewEnum_obj
type __pyx_MemviewEnum_obj (line 2575) | struct __pyx_MemviewEnum_obj
type __pyx_memoryview_obj (line 2576) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2577) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2578) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2579) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2580) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2581) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2582) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2583) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2584) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2585) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2586) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2587) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2588) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2589) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2590) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2591) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2592) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2593) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2594) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2595) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 2596) | struct __pyx_memoryview_obj
type __pyx_memoryviewslice_obj (line 2599) | struct __pyx_memoryviewslice_obj
type __pyx_memoryviewslice_obj (line 2600) | struct __pyx_memoryviewslice_obj
function __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_each (line 2653) | static void __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_...
function __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 2951) | static void __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_...
function PyObject (line 3105) | static PyObject *__pyx_pw_6matcha_5utils_15monotonic_align_4core_1maximu...
function PyObject (line 3207) | static PyObject *__pyx_pf_6matcha_5utils_15monotonic_align_4core_maximum...
function CYTHON_INLINE (line 3253) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyOb...
function CYTHON_INLINE (line 3303) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyOb...
function CYTHON_INLINE (line 3353) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyOb...
function CYTHON_INLINE (line 3403) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyOb...
function CYTHON_INLINE (line 3453) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyOb...
function CYTHON_INLINE (line 3503) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_D...
function CYTHON_INLINE (line 3577) | static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *_...
function CYTHON_INLINE (line 3619) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObje...
function CYTHON_INLINE (line 3700) | static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) {
function CYTHON_INLINE (line 3832) | static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) {
function CYTHON_INLINE (line 3964) | static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) {
function CYTHON_INLINE (line 4096) | static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *...
function CYTHON_INLINE (line 4133) | static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *_...
function CYTHON_INLINE (line 4170) | static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(Py...
function CYTHON_INLINE (line 4204) | static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(...
function CYTHON_INLINE (line 4238) | static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit...
function __pyx_array___cinit__ (line 4272) | static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx...
function __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__ (line 4400) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(s...
function CYTHON_UNUSED (line 5023) | static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, P...
function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__ (line 5034) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffe...
function __pyx_array___dealloc__ (line 5330) | static void __pyx_array___dealloc__(PyObject *__pyx_v_self) {
function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__ (line 5339) | static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc...
function PyObject (line 5461) | static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__...
function PyObject (line 5472) | static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(...
function PyObject (line 5522) | static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v...
function Py_ssize_t (line 5604) | static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self) {
function Py_ssize_t (line 5615) | static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__l...
function PyObject (line 5654) | static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObjec...
function PyObject (line 5665) | static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__ge...
function PyObject (line 5722) | static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObjec...
function PyObject (line 5733) | static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__g...
function __pyx_array___setitem__ (line 5790) | static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__p...
function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__ (line 5801) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem...
function PyObject (line 5850) | static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx...
function PyObject (line 5861) | static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED st...
function PyObject (line 5907) | static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__p...
function PyObject (line 5918) | static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED...
type __pyx_array_obj (line 5963) | struct __pyx_array_obj
type __pyx_array_obj (line 5964) | struct __pyx_array_obj
type __pyx_array_obj (line 5965) | struct __pyx_array_obj
type __pyx_array_obj (line 6017) | struct __pyx_array_obj
type __pyx_array_obj (line 6081) | struct __pyx_array_obj
function __pyx_MemviewEnum___init__ (line 6140) | static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *...
function __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__ (line 6191) | static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init...
function PyObject (line 6233) | static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self) {
function PyObject (line 6244) | static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_...
function PyObject (line 6284) | static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject ...
function PyObject (line 6295) | static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __p...
function PyObject (line 6519) | static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObjec...
function PyObject (line 6530) | static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct ...
function __pyx_memoryview___cinit__ (line 6662) | static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *...
function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__ (line 6742) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
function __pyx_memoryview___dealloc__ (line 7060) | static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self) {
function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__ (line 7069) | static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview...
type __pyx_memoryview_obj (line 7289) | struct __pyx_memoryview_obj
function PyObject (line 7429) | static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, Py...
function PyObject (line 7440) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function __pyx_memoryview___setitem__ (line 7618) | static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject...
function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__ (line 7629) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
function PyObject (line 7844) | static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *...
function PyObject (line 8054) | static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_...
function PyObject (line 8144) | static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __p...
function PyObject (line 8434) | static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryvie...
function PyObject (line 8495) | static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_me...
function PyObject (line 8772) | static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_m...
function CYTHON_UNUSED (line 9013) | static CYTHON_UNUSED int __pyx_memoryview_getbuffer(PyObject *__pyx_v_se...
function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__ (line 9024) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
function PyObject (line 9357) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__...
function PyObject (line 9368) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(...
function PyObject (line 9443) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__ge...
function PyObject (line 9454) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get...
function PyObject (line 9496) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__g...
function PyObject (line 9507) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___ge...
function PyObject (line 9577) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1_...
function PyObject (line 9588) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___...
function PyObject (line 9691) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffset...
function PyObject (line 9702) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffset...
function PyObject (line 9809) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__ge...
function PyObject (line 9820) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get...
function PyObject (line 9872) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1...
function PyObject (line 9883) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize__...
function PyObject (line 9935) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__...
function PyObject (line 9946) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___g...
function PyObject (line 10008) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__ge...
function PyObject (line 10019) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get...
function Py_ssize_t (line 10149) | static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self) {
function Py_ssize_t (line 10160) | static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memo...
function PyObject (line 10229) | static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self) {
function PyObject (line 10240) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10331) | static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self) {
function PyObject (line 10342) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10410) | static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, CY...
function PyObject (line 10421) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10486) | static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, CY...
function PyObject (line 10497) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10562) | static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, CYTHON_UN...
function PyObject (line 10573) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10656) | static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, C...
function PyObject (line 10667) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
function PyObject (line 10749) | static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *...
function PyObject (line 10760) | static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUS...
function PyObject (line 10806) | static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject...
function PyObject (line 10817) | static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_U...
function PyObject (line 10862) | static PyObject *__pyx_memoryview_new(PyObject *__pyx_v_o, int __pyx_v_f...
function CYTHON_INLINE (line 10953) | static CYTHON_INLINE int __pyx_memoryview_check(PyObject *__pyx_v_o) {
function PyObject (line 10992) | static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) {
function PyObject (line 11449) | static PyObject *assert_direct_dimensions(Py_ssize_t *__pyx_v_suboffsets...
type __pyx_memoryview_obj (line 11537) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 11537) | struct __pyx_memoryview_obj
type __pyx_memoryviewslice_obj (line 11544) | struct __pyx_memoryviewslice_obj
type __pyx_memoryview_obj (line 11554) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 11559) | struct __pyx_memoryview_obj
type __pyx_memoryviewslice_obj (line 11629) | struct __pyx_memoryviewslice_obj
type __pyx_memoryview_obj (line 12041) | struct __pyx_memoryview_obj
type __pyx_memoryview_obj (line 12082) | struct __pyx_memoryview_obj
function __pyx_memoryview_slice_memviewslice (line 12117) | static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *__pyx...
function __pyx_memslice_transpose (line 13210) | static int __pyx_memslice_transpose(__Pyx_memviewslice *__pyx_v_memslice) {
function __pyx_memoryviewslice___dealloc__ (line 13386) | static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self) {
function __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__ (line 13395) | static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memo...
function PyObject (line 13428) | static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __p...
function PyObject (line 13514) | static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __...
function PyObject (line 13599) | static PyObject *__pyx_pw_15View_dot_MemoryView_16_memoryviewslice_4base...
function PyObject (line 13610) | static PyObject *__pyx_pf_15View_dot_MemoryView_16_memoryviewslice_4base...
function PyObject (line 13650) | static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObj...
function PyObject (line 13661) | static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON...
function PyObject (line 13707) | static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyO...
function PyObject (line 13718) | static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYT...
function PyObject (line 13763) | static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice __pyx_v_m...
function __Pyx_memviewslice (line 14149) | static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(st...
function __pyx_memoryview_slice_copy (line 14252) | static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *__p...
function PyObject (line 14378) | static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_ob...
function PyObject (line 14438) | static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_me...
function Py_ssize_t (line 14564) | static Py_ssize_t abs_py_ssize_t(Py_ssize_t __pyx_v_arg) {
function __pyx_get_best_slice_order (line 14630) | static char __pyx_get_best_slice_order(__Pyx_memviewslice *__pyx_v_mslic...
function _copy_strided_to_strided (line 14820) | static void _copy_strided_to_strided(char *__pyx_v_src_data, Py_ssize_t ...
function copy_strided_to_strided (line 15057) | static void copy_strided_to_strided(__Pyx_memviewslice *__pyx_v_src, __P...
function Py_ssize_t (line 15087) | static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *__...
function Py_ssize_t (line 15159) | static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *__pyx_v_sh...
type __pyx_memoryview_obj (line 15290) | struct __pyx_memoryview_obj
function __pyx_memoryview_err_extents (line 15536) | static int __pyx_memoryview_err_extents(int __pyx_v_i, Py_ssize_t __pyx_...
function __pyx_memoryview_err_dim (line 15624) | static int __pyx_memoryview_err_dim(PyObject *__pyx_v_error, char *__pyx...
function __pyx_memoryview_err (line 15708) | static int __pyx_memoryview_err(PyObject *__pyx_v_error, char *__pyx_v_m...
function __pyx_memoryview_copy_contents (line 15818) | static int __pyx_memoryview_copy_contents(__Pyx_memviewslice __pyx_v_src...
function __pyx_memoryview_broadcast_leading (line 16397) | static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *__pyx...
function __pyx_memoryview_refcount_copying (line 16510) | static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *__pyx_...
function __pyx_memoryview_refcount_objects_in_slice_with_gil (line 16560) | static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *__...
function __pyx_memoryview_refcount_objects_in_slice (line 16599) | static void __pyx_memoryview_refcount_objects_in_slice(char *__pyx_v_dat...
function __pyx_memoryview_slice_assign_scalar (line 16731) | static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *__p...
function __pyx_memoryview__slice_assign_scalar (line 16779) | static void __pyx_memoryview__slice_assign_scalar(char *__pyx_v_data, Py...
function PyObject (line 16911) | static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyO...
function PyObject (line 16984) | static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTH...
function PyObject (line 17179) | static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum...
type __pyx_vtabstruct_array (line 17302) | struct __pyx_vtabstruct_array
function PyObject (line 17304) | static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObje...
function __pyx_tp_dealloc_array (line 17324) | static void __pyx_tp_dealloc_array(PyObject *o) {
function PyObject (line 17343) | static PyObject *__pyx_sq_item_array(PyObject *o, Py_ssize_t i) {
function __pyx_mp_ass_subscript_array (line 17351) | static int __pyx_mp_ass_subscript_array(PyObject *o, PyObject *i, PyObje...
function PyObject (line 17362) | static PyObject *__pyx_tp_getattro_array(PyObject *o, PyObject *n) {
function PyObject (line 17371) | static PyObject *__pyx_getprop___pyx_array_memview(PyObject *o, CYTHON_U...
type PyGetSetDef (line 17382) | struct PyGetSetDef
type __pyx_array_obj (line 17426) | struct __pyx_array_obj
function PyObject (line 17495) | static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, CYTHON_UNUSED PyObje...
function __pyx_tp_dealloc_Enum (line 17509) | static void __pyx_tp_dealloc_Enum(PyObject *o) {
function __pyx_tp_traverse_Enum (line 17521) | static int __pyx_tp_traverse_Enum(PyObject *o, visitproc v, void *a) {
function __pyx_tp_clear_Enum (line 17530) | static int __pyx_tp_clear_Enum(PyObject *o) {
type __pyx_MemviewEnum_obj (line 17548) | struct __pyx_MemviewEnum_obj
type __pyx_vtabstruct_memoryview (line 17616) | struct __pyx_vtabstruct_memoryview
function PyObject (line 17618) | static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, P...
function __pyx_tp_dealloc_memoryview (line 17640) | static void __pyx_tp_dealloc_memoryview(PyObject *o) {
function __pyx_tp_traverse_memoryview (line 17662) | static int __pyx_tp_traverse_memoryview(PyObject *o, visitproc v, void *...
function __pyx_tp_clear_memoryview (line 17680) | static int __pyx_tp_clear_memoryview(PyObject *o) {
function PyObject (line 17695) | static PyObject *__pyx_sq_item_memoryview(PyObject *o, Py_ssize_t i) {
function __pyx_mp_ass_subscript_memoryview (line 17703) | static int __pyx_mp_ass_subscript_memoryview(PyObject *o, PyObject *i, P...
function PyObject (line 17714) | static PyObject *__pyx_getprop___pyx_memoryview_T(PyObject *o, CYTHON_UN...
function PyObject (line 17718) | static PyObject *__pyx_getprop___pyx_memoryview_base(PyObject *o, CYTHON...
function PyObject (line 17722) | static PyObject *__pyx_getprop___pyx_memoryview_shape(PyObject *o, CYTHO...
function PyObject (line 17726) | static PyObject *__pyx_getprop___pyx_memoryview_strides(PyObject *o, CYT...
function PyObject (line 17730) | static PyObject *__pyx_getprop___pyx_memoryview_suboffsets(PyObject *o, ...
function PyObject (line 17734) | static PyObject *__pyx_getprop___pyx_memoryview_ndim(PyObject *o, CYTHON...
function PyObject (line 17738) | static PyObject *__pyx_getprop___pyx_memoryview_itemsize(PyObject *o, CY...
function PyObject (line 17742) | static PyObject *__pyx_getprop___pyx_memoryview_nbytes(PyObject *o, CYTH...
function PyObject (line 17746) | static PyObject *__pyx_getprop___pyx_memoryview_size(PyObject *o, CYTHON...
type PyGetSetDef (line 17760) | struct PyGetSetDef
type __pyx_memoryview_obj (line 17812) | struct __pyx_memoryview_obj
type __pyx_vtabstruct__memoryviewslice (line 17880) | struct __pyx_vtabstruct__memoryviewslice
function PyObject (line 17882) | static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject...
function __pyx_tp_dealloc__memoryviewslice (line 17893) | static void __pyx_tp_dealloc__memoryviewslice(PyObject *o) {
function __pyx_tp_traverse__memoryviewslice (line 17914) | static int __pyx_tp_traverse__memoryviewslice(PyObject *o, visitproc v, ...
function __pyx_tp_clear__memoryviewslice (line 17924) | static int __pyx_tp_clear__memoryviewslice(PyObject *o) {
function PyObject (line 17935) | static PyObject *__pyx_getprop___pyx_memoryviewslice_base(PyObject *o, C...
type PyGetSetDef (line 17945) | struct PyGetSetDef
type __pyx_memoryviewslice_obj (line 17953) | struct __pyx_memoryviewslice_obj
type PyModuleDef (line 18046) | struct PyModuleDef
function CYTHON_SMALL_CODE (line 18175) | static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) {
function CYTHON_SMALL_CODE (line 18190) | static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) {
function CYTHON_SMALL_CODE (line 18482) | static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) {
function __Pyx_modinit_global_init_code (line 18510) | static int __Pyx_modinit_global_init_code(void) {
function __Pyx_modinit_variable_export_code (line 18523) | static int __Pyx_modinit_variable_export_code(void) {
function __Pyx_modinit_function_export_code (line 18531) | static int __Pyx_modinit_function_export_code(void) {
function __Pyx_modinit_type_init_code (line 18539) | static int __Pyx_modinit_type_init_code(void) {
function __Pyx_modinit_type_import_code (line 18604) | static int __Pyx_modinit_type_import_code(void) {
function __Pyx_modinit_variable_import_code (line 18648) | static int __Pyx_modinit_variable_import_code(void) {
function __Pyx_modinit_function_import_code (line 18656) | static int __Pyx_modinit_function_import_code(void) {
function __Pyx_PyMODINIT_FUNC (line 18687) | __Pyx_PyMODINIT_FUNC PyInit_core(void)
function CYTHON_SMALL_CODE (line 18692) | static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) {
function CYTHON_SMALL_CODE (line 18715) | static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, P...
function CYTHON_SMALL_CODE (line 18730) | static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CY...
function __Pyx_RefNannyAPIStruct (line 19075) | static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modn...
function CYTHON_INLINE (line 19092) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, ...
function PyObject (line 19105) | static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
function __Pyx_init_memviewslice (line 19119) | static int
function __pyx_fatalerror (line 19171) | static void __pyx_fatalerror(const char *fmt, ...) Py_NO_RETURN {
function CYTHON_INLINE (line 19183) | static CYTHON_INLINE int
function CYTHON_INLINE (line 19193) | static CYTHON_INLINE int
function CYTHON_INLINE (line 19203) | static CYTHON_INLINE void
function CYTHON_INLINE (line 19224) | static CYTHON_INLINE void __Pyx_XDEC_MEMVIEW(__Pyx_memviewslice *memslice,
function __Pyx_RaiseArgtupleInvalid (line 19251) | static void __Pyx_RaiseArgtupleInvalid(
function __Pyx_RaiseDoubleKeywordsError (line 19277) | static void __Pyx_RaiseDoubleKeywordsError(
function __Pyx_ParseOptionalKeywords (line 19291) | static int __Pyx_ParseOptionalKeywords(
function CYTHON_INLINE (line 19393) | static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varna...
function _PyErr_StackItem (line 19399) | static _PyErr_StackItem *
function CYTHON_INLINE (line 19414) | static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, Py...
function CYTHON_INLINE (line 19429) | static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, P...
function __Pyx_PyErr_ExceptionMatchesTuple (line 19455) | static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObjec...
function CYTHON_INLINE (line 19468) | static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadSta...
function __Pyx_GetException (line 19482) | static int __Pyx_GetException(PyObject **type, PyObject **value, PyObjec...
function CYTHON_INLINE (line 19554) | static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObj...
function CYTHON_INLINE (line 19574) | static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate,...
function CYTHON_INLINE (line 19586) | static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, P...
function __Pyx_Raise (line 19598) | static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb,
function __Pyx_Raise (line 19649) | static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, P...
function __Pyx__ArgTypeTest (line 19756) | static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const c...
function CYTHON_INLINE (line 19778) | static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *fun...
function PyObject (line 19801) | static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObjec...
function CYTHON_UNUSED (line 19919) | static CYTHON_UNUSED PyObject* __Pyx_PyObject_Call2Args(PyObject* functi...
function CYTHON_INLINE (line 19949) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, ...
function PyObject (line 19969) | static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *ar...
function CYTHON_INLINE (line 19979) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func,...
function CYTHON_INLINE (line 19997) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func,...
function CYTHON_INLINE (line 20008) | static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2...
function CYTHON_INLINE (line 20055) | static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* ...
function CYTHON_INLINE (line 20157) | static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t a, Py_ss...
function CYTHON_INLINE (line 20165) | static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) {
function PyObject (line 20178) | static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
function CYTHON_INLINE (line 20185) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, P...
function CYTHON_INLINE (line 20203) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, ...
function CYTHON_INLINE (line 20221) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssi...
function PyObject (line 20266) | static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject* index) {
function PyObject (line 20284) | static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key) {
function CYTHON_INLINE (line 20294) | static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
function PyObject (line 20327) | static PyObject *__Pyx_GetAttr3Default(PyObject *d) {
function CYTHON_INLINE (line 20336) | static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, ...
function CYTHON_INLINE (line 20343) | static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) {
function CYTHON_INLINE (line 20347) | static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject ...
function CYTHON_INLINE (line 20359) | static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj...
function CYTHON_INLINE (line 20371) | static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name)
function CYTHON_INLINE (line 20403) | static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expec...
function CYTHON_INLINE (line 20409) | static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t inde...
function CYTHON_INLINE (line 20416) | static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) {
function CYTHON_INLINE (line 20421) | static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *typ...
function CYTHON_INLINE (line 20435) | static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, Py...
function CYTHON_INLINE (line 20458) | static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject ...
function PyObject (line 20469) | static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int l...
function __Pyx_InBases (line 20535) | static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {
function CYTHON_INLINE (line 20543) | static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *...
function __Pyx_inner_PyErr_GivenExceptionMatches2 (line 20559) | static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObj...
function CYTHON_INLINE (line 20581) | static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObje...
function __Pyx_PyErr_GivenExceptionMatchesTuple (line 20589) | static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, Py...
function CYTHON_INLINE (line 20610) | static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err...
function CYTHON_INLINE (line 20622) | static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *er...
function PyObject (line 20635) | static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHO...
function __Pyx_div_long (line 20758) | static CYTHON_INLINE long __Pyx_div_long(long a, long b) {
function PyObject (line 20766) | static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) {
function CYTHON_INLINE (line 20780) | static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) {
function PyObject (line 20799) | static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, P...
function CYTHON_INLINE (line 20810) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObj...
function PyObject (line 20839) | static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* ...
function __Pyx_SetVtable (line 20848) | static int __Pyx_SetVtable(PyObject *dict, void *vtable) {
function __Pyx_PyObject_GetAttrStr_ClearAttributeError (line 20866) | static void __Pyx_PyObject_GetAttrStr_ClearAttributeError(void) {
function CYTHON_INLINE (line 20872) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject...
function __Pyx_setup_reduce_is_named (line 20888) | static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) {
function __Pyx_setup_reduce (line 20904) | static int __Pyx_setup_reduce(PyObject* type_obj) {
function PyTypeObject (line 20994) | static PyTypeObject *__Pyx_ImportType_0_29_35(PyObject *module, const ch...
function __Pyx_CLineForTraceback (line 21072) | static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, ...
function __pyx_bisect_code_objects (line 21113) | static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries...
function PyCodeObject (line 21134) | static PyCodeObject *__pyx_find_code_object(int code_line) {
function __pyx_insert_code_object (line 21148) | static void __pyx_insert_code_object(int code_line, PyCodeObject* code_o...
function PyCodeObject (line 21202) | static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
function __Pyx_AddTraceback (line 21260) | static void __Pyx_AddTraceback(const char *funcname, int c_line,
function __Pyx_GetBuffer (line 21300) | static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) {
function __Pyx_ReleaseBuffer (line 21307) | static void __Pyx_ReleaseBuffer(Py_buffer *view) {
function __pyx_memviewslice_is_contig (line 21322) | static int
function __pyx_get_array_memory_extents (line 21344) | static void
function __pyx_slices_overlap (line 21368) | static int
function CYTHON_INLINE (line 21380) | static CYTHON_INLINE PyObject *
function CYTHON_INLINE (line 21393) | static CYTHON_INLINE int __Pyx_Is_Little_Endian(void)
function __Pyx_BufFmt_Init (line 21404) | static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
function __Pyx_BufFmt_ParseNumber (line 21431) | static int __Pyx_BufFmt_ParseNumber(const char** ts) {
function __Pyx_BufFmt_ExpectNumber (line 21446) | static int __Pyx_BufFmt_ExpectNumber(const char **ts) {
function __Pyx_BufFmt_RaiseUnexpectedChar (line 21453) | static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) {
function __Pyx_BufFmt_TypeCharToStandardSize (line 21482) | static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_comple...
function __Pyx_BufFmt_TypeCharToNativeSize (line 21500) | static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) {
type __Pyx_st_short (line 21519) | typedef struct { char c; short x; } __Pyx_st_short;
type __Pyx_st_int (line 21520) | typedef struct { char c; int x; } __Pyx_st_int;
type __Pyx_st_long (line 21521) | typedef struct { char c; long x; } __Pyx_st_long;
type __Pyx_st_float (line 21522) | typedef struct { char c; float x; } __Pyx_st_float;
type __Pyx_st_double (line 21523) | typedef struct { char c; double x; } __Pyx_st_double;
type __Pyx_st_longdouble (line 21524) | typedef struct { char c; long double x; } __Pyx_st_longdouble;
type __Pyx_st_void_p (line 21525) | typedef struct { char c; void *x; } __Pyx_st_void_p;
type __Pyx_st_longlong (line 21527) | typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
function __Pyx_BufFmt_TypeCharToAlignment (line 21529) | static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED in...
type __Pyx_pad_short (line 21551) | typedef struct { short x; char c; } __Pyx_pad_short;
type __Pyx_pad_int (line 21552) | typedef struct { int x; char c; } __Pyx_pad_int;
type __Pyx_pad_long (line 21553) | typedef struct { long x; char c; } __Pyx_pad_long;
type __Pyx_pad_float (line 21554) | typedef struct { float x; char c; } __Pyx_pad_float;
type __Pyx_pad_double (line 21555) | typedef struct { double x; char c; } __Pyx_pad_double;
type __Pyx_pad_longdouble (line 21556) | typedef struct { long double x; char c; } __Pyx_pad_longdouble;
type __Pyx_pad_void_p (line 21557) | typedef struct { void *x; char c; } __Pyx_pad_void_p;
type __Pyx_pad_longlong (line 21559) | typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
function __Pyx_BufFmt_TypeCharToPadding (line 21561) | static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int ...
function __Pyx_BufFmt_TypeCharToGroup (line 21579) | static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) {
function __Pyx_BufFmt_RaiseExpected (line 21600) | static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
function __Pyx_BufFmt_ProcessTypeChunk (line 21624) | static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) {
function PyObject (line 21726) | static PyObject *
function __pyx_typeinfo_cmp (line 21906) | static int
function __pyx_check_strides (line 21947) | static int
function __pyx_check_suboffsets (line 22000) | static int
function __pyx_verify_contig (line 22023) | static int
function __Pyx_ValidateAndInit_memviewslice (line 22052) | static int __Pyx_ValidateAndInit_memviewslice(
function CYTHON_INLINE (line 22128) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
function CYTHON_INLINE (line 22151) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
function CYTHON_INLINE (line 22174) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
function CYTHON_INLINE (line 22221) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
function CYTHON_INLINE (line 22225) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
function CYTHON_INLINE (line 22230) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
function CYTHON_INLINE (line 22241) | static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx...
function CYTHON_INLINE (line 22244) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_flo...
function CYTHON_INLINE (line 22250) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_fl...
function CYTHON_INLINE (line 22256) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_fl...
function CYTHON_INLINE (line 22263) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_fl...
function CYTHON_INLINE (line 22283) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_fl...
function CYTHON_INLINE (line 22294) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_flo...
function CYTHON_INLINE (line 22300) | static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) {
function CYTHON_INLINE (line 22303) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_fl...
function CYTHON_INLINE (line 22310) | static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) {
function CYTHON_INLINE (line 22317) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_flo...
function CYTHON_INLINE (line 22375) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
function CYTHON_INLINE (line 22379) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
function CYTHON_INLINE (line 22384) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
function CYTHON_INLINE (line 22395) | static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __p...
function CYTHON_INLINE (line 22398) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_d...
function CYTHON_INLINE (line 22404) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_...
function CYTHON_INLINE (line 22410) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_...
function CYTHON_INLINE (line 22417) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_...
function CYTHON_INLINE (line 22437) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_...
function CYTHON_INLINE (line 22448) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_d...
function CYTHON_INLINE (line 22454) | static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) {
function CYTHON_INLINE (line 22457) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_...
function CYTHON_INLINE (line 22464) | static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) {
function CYTHON_INLINE (line 22471) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_d...
function __Pyx_memviewslice (line 22527) | static __Pyx_memviewslice
function CYTHON_INLINE (line 22594) | static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) {
function CYTHON_INLINE (line 22828) | static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
function __Pyx_check_binary_version (line 23258) | static int __Pyx_check_binary_version(void) {
function __Pyx_InitStrings (line 23296) | static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
function CYTHON_INLINE (line 23328) | static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_...
function CYTHON_INLINE (line 23331) | static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {
function CYTHON_INLINE (line 23358) | static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObjec...
function CYTHON_INLINE (line 23400) | static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
function CYTHON_INLINE (line 23405) | static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) {
function PyObject (line 23412) | static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* resul...
function CYTHON_INLINE (line 23481) | static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {
function CYTHON_INLINE (line 23543) | static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) {
function CYTHON_INLINE (line 23560) | static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {
function CYTHON_INLINE (line 23563) | static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {
FILE: third_party/Matcha-TTS/matcha/utils/pylogger.py
function get_pylogger (line 6) | def get_pylogger(name: str = __name__) -> logging.Logger:
FILE: third_party/Matcha-TTS/matcha/utils/rich_utils.py
function print_config_tree (line 18) | def print_config_tree(
function enforce_tags (line 80) | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None:
FILE: third_party/Matcha-TTS/matcha/utils/utils.py
function extras (line 20) | def extras(cfg: DictConfig) -> None:
function task_wrapper (line 51) | def task_wrapper(task_func: Callable) -> Callable:
function get_metric_value (line 106) | def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> f...
function intersperse (line 130) | def intersperse(lst, item):
function save_figure_to_numpy (line 137) | def save_figure_to_numpy(fig):
function plot_tensor (line 143) | def plot_tensor(tensor):
function save_plot (line 155) | def save_plot(tensor, savepath):
function to_numpy (line 166) | def to_numpy(tensor):
function get_user_data_dir (line 177) | def get_user_data_dir(appname="matcha_tts"):
function assert_model_downloaded (line 208) | def assert_model_downloaded(checkpoint_path, url, use_wget=True):
Condensed preview — 158 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (4,368K chars).
[
{
"path": ".dockerignore",
"chars": 991,
"preview": "# Git\n.git\n.gitignore\n.gitattributes\n\n\n# CI\n.codeclimate.yml\n.travis.yml\n.taskcluster.yml\n\n# Docker\ndocker-compose.yml\nD"
},
{
"path": "Dockerfile",
"chars": 659,
"preview": "FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04\nWORKDIR /breezyvoice\n\nENV UV_LINK_MODE=copy\nENV PATH=\"/root/.local/bi"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 7982,
"preview": "# BreezyVoice\n\nBreezyVoice is a voice-cloning text-to-speech system specifically adapted for Taiwanese Mandarin, highlig"
},
{
"path": "api.py",
"chars": 3655,
"preview": "# OpenAI API Spec. Reference: https://platform.openai.com/docs/api-reference/audio/createSpeech\n\nfrom contextlib import "
},
{
"path": "batch_inference.py",
"chars": 3421,
"preview": "import os\r\nimport time\r\nimport subprocess\r\nimport argparse\r\nimport pandas as pd\r\nfrom datasets import Dataset\r\nfrom sing"
},
{
"path": "compose.yaml",
"chars": 358,
"preview": "services:\n app:\n image: breezyvoice:latest\n build: .\n ports:\n - \"8080:8080\"\n volumes:\n - hf-cache"
},
{
"path": "cosyvoice/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "cosyvoice/bin/inference.py",
"chars": 5386,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/bin/train.py",
"chars": 5212,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/cli/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "cosyvoice/cli/cosyvoice.py",
"chars": 4209,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/cli/frontend.py",
"chars": 8999,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/cli/model.py",
"chars": 3660,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/dataset/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "cosyvoice/dataset/dataset.py",
"chars": 5233,
"preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n# 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licen"
},
{
"path": "cosyvoice/dataset/processor.py",
"chars": 13008,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
},
{
"path": "cosyvoice/flow/decoder.py",
"chars": 8878,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/flow/flow.py",
"chars": 5941,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/flow/flow_matching.py",
"chars": 5882,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/flow/length_regulator.py",
"chars": 1841,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/hifigan/f0_predictor.py",
"chars": 1976,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "cosyvoice/hifigan/generator.py",
"chars": 14736,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "cosyvoice/llm/llm.py",
"chars": 9184,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/transformer/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "cosyvoice/transformer/activation.py",
"chars": 3087,
"preview": "# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)\n# 2020 Northwestern Polytechnical Universi"
},
{
"path": "cosyvoice/transformer/attention.py",
"chars": 14196,
"preview": "# Copyright (c) 2019 Shigeki Karita\n# 2020 Mobvoi Inc (Binbin Zhang)\n# 2022 Xingchen Song (s"
},
{
"path": "cosyvoice/transformer/convolution.py",
"chars": 5230,
"preview": "# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n# 2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
},
{
"path": "cosyvoice/transformer/decoder.py",
"chars": 16591,
"preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n# 2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
},
{
"path": "cosyvoice/transformer/decoder_layer.py",
"chars": 4807,
"preview": "# Copyright (c) 2019 Shigeki Karita\n# 2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
},
{
"path": "cosyvoice/transformer/embedding.py",
"chars": 11316,
"preview": "# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n# 2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
},
{
"path": "cosyvoice/transformer/encoder.py",
"chars": 21401,
"preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)\n#"
},
{
"path": "cosyvoice/transformer/encoder_layer.py",
"chars": 9589,
"preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n# 2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)\n#"
},
{
"path": "cosyvoice/transformer/label_smoothing_loss.py",
"chars": 3459,
"preview": "# Copyright (c) 2019 Shigeki Karita\n# 2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
},
{
"path": "cosyvoice/transformer/positionwise_feed_forward.py",
"chars": 4219,
"preview": "# Copyright (c) 2019 Shigeki Karita\n# 2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
},
{
"path": "cosyvoice/transformer/subsampling.py",
"chars": 12666,
"preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n# 2024 Alibaba Inc (Xiang Lyu)\n#\n# Licensed under th"
},
{
"path": "cosyvoice/utils/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "cosyvoice/utils/class_utils.py",
"chars": 2582,
"preview": "# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>\n# 2024 Alibaba Inc (authors: Xiang Lyu)"
},
{
"path": "cosyvoice/utils/common.py",
"chars": 3414,
"preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n# 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under "
},
{
"path": "cosyvoice/utils/executor.py",
"chars": 5114,
"preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n# 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under "
},
{
"path": "cosyvoice/utils/file_utils.py",
"chars": 1839,
"preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n# 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licen"
},
{
"path": "cosyvoice/utils/frontend_utils.py",
"chars": 4000,
"preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
},
{
"path": "cosyvoice/utils/mask.py",
"chars": 8351,
"preview": "# Copyright (c) 2019 Shigeki Karita\n# 2020 Mobvoi Inc (Binbin Zhang)\n# 2024 Alibaba Inc (aut"
},
{
"path": "cosyvoice/utils/scheduler.py",
"chars": 24940,
"preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n# 2022 Ximalaya Inc (Yuguang Yang)\n# 2024 Ali"
},
{
"path": "cosyvoice/utils/train_utils.py",
"chars": 11863,
"preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n# 2023 Horizon Inc. (authors: Xingchen Song)\n# "
},
{
"path": "data/batch_files.csv",
"chars": 815,
"preview": "speaker_prompt_audio_filename,speaker,speaker_prompt_text_transcription,content_to_synthesize,output_audio_filename\nexam"
},
{
"path": "openai_api_inference.py",
"chars": 387,
"preview": "from pathlib import Path\n\nimport openai\n\nclient = openai.Client(base_url=\"http://localhost:8080\", api_key=\"sk-template\")"
},
{
"path": "requirements.txt",
"chars": 908,
"preview": "--extra-index-url https://download.pytorch.org/whl/cu118\nconformer==0.3.2\ndeepspeed==0.14.2; sys_platform == 'linux'\ndif"
},
{
"path": "results/.gitkeep",
"chars": 0,
"preview": ""
},
{
"path": "run_batch_inference.sh",
"chars": 352,
"preview": "#!/bin/bash\n\n# Default parameters\nCSV_FILE=\"data/batch_files.csv\"\nSPEAKER_PROMPT_AUDIO_FOLDER=\"data\"\nOUTPUT_AUDIO_FOLDER"
},
{
"path": "run_single_inference.sh",
"chars": 269,
"preview": "python3 single_inference.py --speaker_prompt_audio_path \"data/example.wav\" --speaker_prompt_text_transcription \"在密碼學中,加密"
},
{
"path": "single_inference.py",
"chars": 20953,
"preview": "import argparse\r\nimport os\r\nimport sys\r\nimport re\r\nfrom functools import partial\r\nimport time\r\n\r\nimport torch\r\ntorch.set"
},
{
"path": "third_party/Matcha-TTS/LICENSE",
"chars": 1069,
"preview": "MIT License\n\nCopyright (c) 2023 Shivam Mehta\n\nPermission is hereby granted, free of charge, to any person obtaining a co"
},
{
"path": "third_party/Matcha-TTS/MANIFEST.in",
"chars": 352,
"preview": "include README.md\ninclude LICENSE.txt\ninclude requirements.*.txt\ninclude *.cff\ninclude requirements.txt\ninclude matcha/V"
},
{
"path": "third_party/Matcha-TTS/Makefile",
"chars": 1155,
"preview": "\nhelp: ## Show help\n\t@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = \":.*?## \"}; {printf \"\\033["
},
{
"path": "third_party/Matcha-TTS/README.md",
"chars": 8647,
"preview": "<div align=\"center\">\n\n# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching\n\n### [Shivam Mehta](https:/"
},
{
"path": "third_party/Matcha-TTS/configs/__init__.py",
"chars": 81,
"preview": "# this file is needed here to include configs when building project as a package\n"
},
{
"path": "third_party/Matcha-TTS/configs/callbacks/default.yaml",
"chars": 97,
"preview": "defaults:\n - model_checkpoint.yaml\n - model_summary.yaml\n - rich_progress_bar.yaml\n - _self_\n"
},
{
"path": "third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml",
"chars": 1199,
"preview": "# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html\n\nmodel_checkpoint:\n _ta"
},
{
"path": "third_party/Matcha-TTS/configs/callbacks/model_summary.yaml",
"chars": 252,
"preview": "# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html\n\nmodel_summary:\n _targ"
},
{
"path": "third_party/Matcha-TTS/configs/callbacks/none.yaml",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml",
"chars": 172,
"preview": "# https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html\n\nrich_progress_bar:\n _t"
},
{
"path": "third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml",
"chars": 472,
"preview": "defaults:\n - ljspeech\n - _self_\n\n# Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/\n_target_: matc"
},
{
"path": "third_party/Matcha-TTS/configs/data/ljspeech.yaml",
"chars": 520,
"preview": "_target_: matcha.data.text_mel_datamodule.TextMelDataModule\nname: ljspeech\ntrain_filelist_path: data/filelists/ljs_audio"
},
{
"path": "third_party/Matcha-TTS/configs/data/vctk.yaml",
"chars": 385,
"preview": "defaults:\n - ljspeech\n - _self_\n\n_target_: matcha.data.text_mel_datamodule.TextMelDataModule\nname: vctk\ntrain_filelist"
},
{
"path": "third_party/Matcha-TTS/configs/debug/default.yaml",
"chars": 903,
"preview": "# @package _global_\n\n# default debugging setup, runs 1 full epoch\n# other debugging configs can inherit from this one\n\n#"
},
{
"path": "third_party/Matcha-TTS/configs/debug/fdr.yaml",
"chars": 120,
"preview": "# @package _global_\n\n# runs 1 train, 1 validation and 1 test step\n\ndefaults:\n - default\n\ntrainer:\n fast_dev_run: true\n"
},
{
"path": "third_party/Matcha-TTS/configs/debug/limit.yaml",
"chars": 218,
"preview": "# @package _global_\n\n# uses only 1% of the training data and 5% of validation/test data\n\ndefaults:\n - default\n\ntrainer:"
},
{
"path": "third_party/Matcha-TTS/configs/debug/overfit.yaml",
"chars": 204,
"preview": "# @package _global_\n\n# overfits to 3 batches\n\ndefaults:\n - default\n\ntrainer:\n max_epochs: 20\n overfit_batches: 3\n\n# m"
},
{
"path": "third_party/Matcha-TTS/configs/debug/profiler.yaml",
"chars": 225,
"preview": "# @package _global_\n\n# runs with execution time profiling\n\ndefaults:\n - default\n\ntrainer:\n max_epochs: 1\n # profiler:"
},
{
"path": "third_party/Matcha-TTS/configs/eval.yaml",
"chars": 335,
"preview": "# @package _global_\n\ndefaults:\n - _self_\n - data: mnist # choose datamodule with `test_dataloader()` for evaluation\n "
},
{
"path": "third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml",
"chars": 423,
"preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n - override"
},
{
"path": "third_party/Matcha-TTS/configs/experiment/ljspeech.yaml",
"chars": 332,
"preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n - override"
},
{
"path": "third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml",
"chars": 361,
"preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n - override"
},
{
"path": "third_party/Matcha-TTS/configs/experiment/multispeaker.yaml",
"chars": 336,
"preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n - override"
},
{
"path": "third_party/Matcha-TTS/configs/extras/default.yaml",
"chars": 232,
"preview": "# disable python warnings if they annoy you\nignore_warnings: False\n\n# ask user for tags if none are provided in the conf"
},
{
"path": "third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml",
"chars": 1818,
"preview": "# @package _global_\n\n# example hyperparameter optimization of some experiment with Optuna:\n# python train.py -m hparams_"
},
{
"path": "third_party/Matcha-TTS/configs/hydra/default.yaml",
"chars": 608,
"preview": "# https://hydra.cc/docs/configure_hydra/intro/\n\n# enable color logging\ndefaults:\n - override hydra_logging: colorlog\n "
},
{
"path": "third_party/Matcha-TTS/configs/local/.gitkeep",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/configs/logger/aim.yaml",
"chars": 1267,
"preview": "# https://aimstack.io/\n\n# example usage in lightning module:\n# https://github.com/aimhubio/aim/blob/main/examples/pytorc"
},
{
"path": "third_party/Matcha-TTS/configs/logger/comet.yaml",
"chars": 372,
"preview": "# https://www.comet.ml\n\ncomet:\n _target_: lightning.pytorch.loggers.comet.CometLogger\n api_key: ${oc.env:COMET_API_TOK"
},
{
"path": "third_party/Matcha-TTS/configs/logger/csv.yaml",
"chars": 157,
"preview": "# csv logger built in lightning\n\ncsv:\n _target_: lightning.pytorch.loggers.csv_logs.CSVLogger\n save_dir: \"${paths.outp"
},
{
"path": "third_party/Matcha-TTS/configs/logger/many_loggers.yaml",
"chars": 118,
"preview": "# train with many loggers at once\n\ndefaults:\n # - comet\n - csv\n # - mlflow\n # - neptune\n - tensorboard\n - wandb\n"
},
{
"path": "third_party/Matcha-TTS/configs/logger/mlflow.yaml",
"chars": 339,
"preview": "# https://mlflow.org\n\nmlflow:\n _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger\n # experiment_name: \"\"\n # run_"
},
{
"path": "third_party/Matcha-TTS/configs/logger/neptune.yaml",
"chars": 277,
"preview": "# https://neptune.ai\n\nneptune:\n _target_: lightning.pytorch.loggers.neptune.NeptuneLogger\n api_key: ${oc.env:NEPTUNE_A"
},
{
"path": "third_party/Matcha-TTS/configs/logger/tensorboard.yaml",
"chars": 258,
"preview": "# https://www.tensorflow.org/tensorboard/\n\ntensorboard:\n _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLog"
},
{
"path": "third_party/Matcha-TTS/configs/logger/wandb.yaml",
"chars": 522,
"preview": "# https://wandb.ai\n\nwandb:\n _target_: lightning.pytorch.loggers.wandb.WandbLogger\n # name: \"\" # name of the run (norma"
},
{
"path": "third_party/Matcha-TTS/configs/model/cfm/default.yaml",
"chars": 40,
"preview": "name: CFM\nsolver: euler\nsigma_min: 1e-4\n"
},
{
"path": "third_party/Matcha-TTS/configs/model/decoder/default.yaml",
"chars": 119,
"preview": "channels: [256, 256]\ndropout: 0.05\nattention_head_dim: 64\nn_blocks: 1\nnum_mid_blocks: 2\nnum_heads: 2\nact_fn: snakebeta\n"
},
{
"path": "third_party/Matcha-TTS/configs/model/encoder/default.yaml",
"chars": 417,
"preview": "encoder_type: RoPE Encoder\nencoder_params:\n n_feats: ${model.n_feats}\n n_channels: 192\n filter_channels: 768\n filter"
},
{
"path": "third_party/Matcha-TTS/configs/model/matcha.yaml",
"chars": 328,
"preview": "defaults:\n - _self_\n - encoder: default.yaml\n - decoder: default.yaml\n - cfm: default.yaml\n - optimizer: adam.yaml\n"
},
{
"path": "third_party/Matcha-TTS/configs/model/optimizer/adam.yaml",
"chars": 70,
"preview": "_target_: torch.optim.Adam\n_partial_: true\nlr: 1e-4\nweight_decay: 0.0\n"
},
{
"path": "third_party/Matcha-TTS/configs/paths/default.yaml",
"chars": 632,
"preview": "# path to root directory\n# this requires PROJECT_ROOT environment variable to exist\n# you can replace it with \".\" if you"
},
{
"path": "third_party/Matcha-TTS/configs/train.yaml",
"chars": 1557,
"preview": "# @package _global_\n\n# specify here default configuration\n# order of defaults determines the order in which configs over"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/cpu.yaml",
"chars": 51,
"preview": "defaults:\n - default\n\naccelerator: cpu\ndevices: 1\n"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/ddp.yaml",
"chars": 104,
"preview": "defaults:\n - default\n\nstrategy: ddp\n\naccelerator: gpu\ndevices: [0,1]\nnum_nodes: 1\nsync_batchnorm: True\n"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml",
"chars": 115,
"preview": "defaults:\n - default\n\n# simulate DDP on CPU, useful for debugging\naccelerator: cpu\ndevices: 2\nstrategy: ddp_spawn\n"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/default.yaml",
"chars": 439,
"preview": "_target_: lightning.pytorch.trainer.Trainer\n\ndefault_root_dir: ${paths.output_dir}\n\nmax_epochs: -1\n\naccelerator: gpu\ndev"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/gpu.yaml",
"chars": 51,
"preview": "defaults:\n - default\n\naccelerator: gpu\ndevices: 1\n"
},
{
"path": "third_party/Matcha-TTS/configs/trainer/mps.yaml",
"chars": 51,
"preview": "defaults:\n - default\n\naccelerator: mps\ndevices: 1\n"
},
{
"path": "third_party/Matcha-TTS/matcha/VERSION",
"chars": 8,
"preview": "0.0.5.1\n"
},
{
"path": "third_party/Matcha-TTS/matcha/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/app.py",
"chars": 13981,
"preview": "import tempfile\nfrom argparse import Namespace\nfrom pathlib import Path\n\nimport gradio as gr\nimport soundfile as sf\nimpo"
},
{
"path": "third_party/Matcha-TTS/matcha/cli.py",
"chars": 15467,
"preview": "import argparse\nimport datetime as dt\nimport os\nimport warnings\nfrom pathlib import Path\n\nimport matplotlib.pyplot as pl"
},
{
"path": "third_party/Matcha-TTS/matcha/data/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/data/components/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py",
"chars": 7555,
"preview": "import random\nfrom typing import Any, Dict, Optional\n\nimport torch\nimport torchaudio as ta\nfrom lightning import Lightni"
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/LICENSE",
"chars": 1068,
"preview": "MIT License\n\nCopyright (c) 2020 Jungil Kong\n\nPermission is hereby granted, free of charge, to any person obtaining a cop"
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/README.md",
"chars": 5570,
"preview": "# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis\n\n### Jungil Kong, Jaehyeon "
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/config.py",
"chars": 779,
"preview": "v1 = {\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 16,\n \"learning_rate\": 0.0004,\n \"adam_b1\": 0.8,\n "
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/denoiser.py",
"chars": 2644,
"preview": "# Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea"
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/env.py",
"chars": 429,
"preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport os\nimport shutil\n\n\nclass AttrDict(dict):\n def __init__(self, "
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/meldataset.py",
"chars": 6786,
"preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport math\nimport os\nimport random\n\nimport numpy as np\nimport torch\nim"
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/models.py",
"chars": 11668,
"preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom"
},
{
"path": "third_party/Matcha-TTS/matcha/hifigan/xutils.py",
"chars": 1396,
"preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport glob\nimport os\n\nimport matplotlib\nimport torch\nfrom torch.nn.uti"
},
{
"path": "third_party/Matcha-TTS/matcha/models/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/models/baselightningmodule.py",
"chars": 7003,
"preview": "\"\"\"\nThis is a base lightning module that can be used to train a model.\nThe benefit of this abstraction is that all the l"
},
{
"path": "third_party/Matcha-TTS/matcha/models/components/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/models/components/decoder.py",
"chars": 14459,
"preview": "import math\nfrom typing import Optional\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom conform"
},
{
"path": "third_party/Matcha-TTS/matcha/models/components/flow_matching.py",
"chars": 4657,
"preview": "from abc import ABC\n\nimport torch\nimport torch.nn.functional as F\n\nfrom matcha.models.components.decoder import Decoder\n"
},
{
"path": "third_party/Matcha-TTS/matcha/models/components/text_encoder.py",
"chars": 14845,
"preview": "\"\"\" from https://github.com/jaywalnut310/glow-tts \"\"\"\n\nimport math\n\nimport torch\nimport torch.nn as nn\nfrom einops impor"
},
{
"path": "third_party/Matcha-TTS/matcha/models/components/transformer.py",
"chars": 13235,
"preview": "from typing import Any, Dict, Optional\n\nimport torch\nimport torch.nn as nn\nfrom diffusers.models.attention import (\n "
},
{
"path": "third_party/Matcha-TTS/matcha/models/matcha_tts.py",
"chars": 10056,
"preview": "import datetime as dt\nimport math\nimport random\n\nimport torch\n\nimport matcha.utils.monotonic_align as monotonic_align\nfr"
},
{
"path": "third_party/Matcha-TTS/matcha/onnx/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/matcha/onnx/export.py",
"chars": 5377,
"preview": "import argparse\nimport random\nfrom pathlib import Path\n\nimport numpy as np\nimport torch\nfrom lightning import LightningM"
},
{
"path": "third_party/Matcha-TTS/matcha/onnx/infer.py",
"chars": 6287,
"preview": "import argparse\nimport os\nimport warnings\nfrom pathlib import Path\nfrom time import perf_counter\n\nimport numpy as np\nimp"
},
{
"path": "third_party/Matcha-TTS/matcha/text/__init__.py",
"chars": 1696,
"preview": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\nfrom matcha.text import cleaners\nfrom matcha.text.symbols import symbo"
},
{
"path": "third_party/Matcha-TTS/matcha/text/cleaners.py",
"chars": 3560,
"preview": "\"\"\" from https://github.com/keithito/tacotron\n\nCleaners are transformations that run over the input text at both trainin"
},
{
"path": "third_party/Matcha-TTS/matcha/text/numbers.py",
"chars": 2248,
"preview": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\n\nimport re\n\nimport inflect\n\n_inflect = inflect.engine()\n_comma_number_"
},
{
"path": "third_party/Matcha-TTS/matcha/text/symbols.py",
"chars": 509,
"preview": "\"\"\" from https://github.com/keithito/tacotron\n\nDefines the set of symbols used in text input to the model.\n\"\"\"\n_pad = \"_"
},
{
"path": "third_party/Matcha-TTS/matcha/train.py",
"chars": 4613,
"preview": "from typing import Any, Dict, List, Optional, Tuple\n\nimport hydra\nimport lightning as L\nimport rootutils\nfrom lightning "
},
{
"path": "third_party/Matcha-TTS/matcha/utils/__init__.py",
"chars": 326,
"preview": "from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers\nfrom matcha.utils.logging_utils import"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/audio.py",
"chars": 2282,
"preview": "import numpy as np\nimport torch\nimport torch.utils.data\nfrom librosa.filters import mel as librosa_mel_fn\nfrom scipy.io."
},
{
"path": "third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py",
"chars": 3269,
"preview": "r\"\"\"\nThe file creates a pickle file where the values needed for loading of dataset is stored and the model can load it\nw"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/instantiators.py",
"chars": 1828,
"preview": "from typing import List\n\nimport hydra\nfrom lightning import Callback\nfrom lightning.pytorch.loggers import Logger\nfrom o"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/logging_utils.py",
"chars": 1711,
"preview": "from typing import Any, Dict\n\nfrom lightning.pytorch.utilities import rank_zero_only\nfrom omegaconf import OmegaConf\n\nfr"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/model.py",
"chars": 2935,
"preview": "\"\"\" from https://github.com/jaywalnut310/glow-tts \"\"\"\n\nimport numpy as np\nimport torch\n\n\ndef sequence_mask(length, max_l"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py",
"chars": 646,
"preview": "import numpy as np\nimport torch\n\nfrom matcha.utils.monotonic_align.core import maximum_path_c\n\n\ndef maximum_path(value, "
},
{
"path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/core.c",
"chars": 867828,
"preview": "/* Generated by Cython 0.29.35 */\n\n/* BEGIN: Cython Metadata\n{\n \"distutils\": {\n \"depends\": [],\n \"name\":"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx",
"chars": 1236,
"preview": "import numpy as np\n\ncimport cython\ncimport numpy as np\n\nfrom cython.parallel import prange\n\n\n@cython.boundscheck(False)\n"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py",
"chars": 207,
"preview": "# from distutils.core import setup\n# from Cython.Build import cythonize\n# import numpy\n\n# setup(name='monotonic_align',\n"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/pylogger.py",
"chars": 720,
"preview": "import logging\n\nfrom lightning.pytorch.utilities import rank_zero_only\n\n\ndef get_pylogger(name: str = __name__) -> loggi"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/rich_utils.py",
"chars": 3279,
"preview": "from pathlib import Path\nfrom typing import Sequence\n\nimport rich\nimport rich.syntax\nimport rich.tree\nfrom hydra.core.hy"
},
{
"path": "third_party/Matcha-TTS/matcha/utils/utils.py",
"chars": 7159,
"preview": "import os\nimport sys\nimport warnings\nfrom importlib.util import find_spec\nfrom pathlib import Path\nfrom typing import An"
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/PKG-INFO",
"chars": 9844,
"preview": "Metadata-Version: 2.1\nName: matcha-tts\nVersion: 0.0.5.1\nSummary: 🍵 Matcha-TTS: A fast TTS architecture with conditional "
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/SOURCES.txt",
"chars": 1518,
"preview": "LICENSE\nMANIFEST.in\nREADME.md\npyproject.toml\nrequirements.txt\nsetup.py\nconfigs/__init__.py\nmatcha/VERSION\nmatcha/__init_"
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/dependency_links.txt",
"chars": 1,
"preview": "\n"
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/entry_points.txt",
"chars": 142,
"preview": "[console_scripts]\nmatcha-data-stats = matcha.utils.generate_data_statistics:main\nmatcha-tts = matcha.cli:cli\nmatcha-tts-"
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/requires.txt",
"chars": 381,
"preview": "torch>=2.0.0\ntorchvision>=0.15.0\nlightning>=2.0.0\ntorchmetrics>=0.11.4\nhydra-core==1.3.2\nhydra-colorlog==1.2.0\nhydra-opt"
},
{
"path": "third_party/Matcha-TTS/matcha_tts.egg-info/top_level.txt",
"chars": 15,
"preview": "configs\nmatcha\n"
},
{
"path": "third_party/Matcha-TTS/notebooks/.gitkeep",
"chars": 0,
"preview": ""
},
{
"path": "third_party/Matcha-TTS/pyproject.toml",
"chars": 982,
"preview": "[build-system]\nrequires = [\"setuptools\", \"wheel\", \"cython==0.29.35\", \"numpy==1.24.3\", \"packaging\"]\n\n[tool.black]\nline-le"
},
{
"path": "third_party/Matcha-TTS/requirements.txt",
"chars": 904,
"preview": "# --------- pytorch --------- #\ntorch>=2.0.0\ntorchvision>=0.15.0\nlightning>=2.0.0\ntorchmetrics>=0.11.4\n\n# --------- hydr"
},
{
"path": "third_party/Matcha-TTS/scripts/schedule.sh",
"chars": 207,
"preview": "#!/bin/bash\n# Schedule execution of many runs\n# Run from root folder with: bash scripts/schedule.sh\n\npython src/train.py"
},
{
"path": "third_party/Matcha-TTS/setup.py",
"chars": 1527,
"preview": "#!/usr/bin/env python\nimport os\n\nimport numpy\nfrom Cython.Build import cythonize\nfrom setuptools import Extension, find_"
},
{
"path": "third_party/Matcha-TTS/synthesis.ipynb",
"chars": 590014,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"id\": \"f37f4e3b-f764-4502-a6a2-6417bd9bfab9\",\n \"metadata\": {},\n \"so"
},
{
"path": "utils/word_utils.py",
"chars": 1624178,
"preview": "from collections import Counter, defaultdict\n\nalways_augment_chars = {\"長\"}\n\nchar2phn = {\n \"〇\": [\n \"ㄌㄧㄥ2\",\n "
}
]
About this extraction
This page contains the full source code of the mtkresearch/BreezyVoice GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 158 files (3.5 MB), approximately 913.5k tokens, and a symbol index with 1094 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repository-to-text converter for AI. Built by Nikandr Surkov.