Full Code of mtkresearch/BreezyVoice for AI

Repository: mtkresearch/BreezyVoice
Branch: main
Commit: d592c9d3e892
Files: 158
Total size: 3.5 MB

Directory structure:
gitextract_wldclo_i/

├── .dockerignore
├── Dockerfile
├── LICENSE
├── README.md
├── api.py
├── batch_inference.py
├── compose.yaml
├── cosyvoice/
│   ├── __init__.py
│   ├── bin/
│   │   ├── inference.py
│   │   └── train.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── cosyvoice.py
│   │   ├── frontend.py
│   │   └── model.py
│   ├── dataset/
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── processor.py
│   ├── flow/
│   │   ├── decoder.py
│   │   ├── flow.py
│   │   ├── flow_matching.py
│   │   └── length_regulator.py
│   ├── hifigan/
│   │   ├── f0_predictor.py
│   │   └── generator.py
│   ├── llm/
│   │   └── llm.py
│   ├── transformer/
│   │   ├── __init__.py
│   │   ├── activation.py
│   │   ├── attention.py
│   │   ├── convolution.py
│   │   ├── decoder.py
│   │   ├── decoder_layer.py
│   │   ├── embedding.py
│   │   ├── encoder.py
│   │   ├── encoder_layer.py
│   │   ├── label_smoothing_loss.py
│   │   ├── positionwise_feed_forward.py
│   │   └── subsampling.py
│   └── utils/
│       ├── __init__.py
│       ├── class_utils.py
│       ├── common.py
│       ├── executor.py
│       ├── file_utils.py
│       ├── frontend_utils.py
│       ├── mask.py
│       ├── scheduler.py
│       └── train_utils.py
├── data/
│   └── batch_files.csv
├── openai_api_inference.py
├── requirements.txt
├── results/
│   └── .gitkeep
├── run_batch_inference.sh
├── run_single_inference.sh
├── single_inference.py
├── third_party/
│   └── Matcha-TTS/
│       ├── LICENSE
│       ├── MANIFEST.in
│       ├── Makefile
│       ├── README.md
│       ├── configs/
│       │   ├── __init__.py
│       │   ├── callbacks/
│       │   │   ├── default.yaml
│       │   │   ├── model_checkpoint.yaml
│       │   │   ├── model_summary.yaml
│       │   │   ├── none.yaml
│       │   │   └── rich_progress_bar.yaml
│       │   ├── data/
│       │   │   ├── hi-fi_en-US_female.yaml
│       │   │   ├── ljspeech.yaml
│       │   │   └── vctk.yaml
│       │   ├── debug/
│       │   │   ├── default.yaml
│       │   │   ├── fdr.yaml
│       │   │   ├── limit.yaml
│       │   │   ├── overfit.yaml
│       │   │   └── profiler.yaml
│       │   ├── eval.yaml
│       │   ├── experiment/
│       │   │   ├── hifi_dataset_piper_phonemizer.yaml
│       │   │   ├── ljspeech.yaml
│       │   │   ├── ljspeech_min_memory.yaml
│       │   │   └── multispeaker.yaml
│       │   ├── extras/
│       │   │   └── default.yaml
│       │   ├── hparams_search/
│       │   │   └── mnist_optuna.yaml
│       │   ├── hydra/
│       │   │   └── default.yaml
│       │   ├── local/
│       │   │   └── .gitkeep
│       │   ├── logger/
│       │   │   ├── aim.yaml
│       │   │   ├── comet.yaml
│       │   │   ├── csv.yaml
│       │   │   ├── many_loggers.yaml
│       │   │   ├── mlflow.yaml
│       │   │   ├── neptune.yaml
│       │   │   ├── tensorboard.yaml
│       │   │   └── wandb.yaml
│       │   ├── model/
│       │   │   ├── cfm/
│       │   │   │   └── default.yaml
│       │   │   ├── decoder/
│       │   │   │   └── default.yaml
│       │   │   ├── encoder/
│       │   │   │   └── default.yaml
│       │   │   ├── matcha.yaml
│       │   │   └── optimizer/
│       │   │       └── adam.yaml
│       │   ├── paths/
│       │   │   └── default.yaml
│       │   ├── train.yaml
│       │   └── trainer/
│       │       ├── cpu.yaml
│       │       ├── ddp.yaml
│       │       ├── ddp_sim.yaml
│       │       ├── default.yaml
│       │       ├── gpu.yaml
│       │       └── mps.yaml
│       ├── matcha/
│       │   ├── VERSION
│       │   ├── __init__.py
│       │   ├── app.py
│       │   ├── cli.py
│       │   ├── data/
│       │   │   ├── __init__.py
│       │   │   ├── components/
│       │   │   │   └── __init__.py
│       │   │   └── text_mel_datamodule.py
│       │   ├── hifigan/
│       │   │   ├── LICENSE
│       │   │   ├── README.md
│       │   │   ├── __init__.py
│       │   │   ├── config.py
│       │   │   ├── denoiser.py
│       │   │   ├── env.py
│       │   │   ├── meldataset.py
│       │   │   ├── models.py
│       │   │   └── xutils.py
│       │   ├── models/
│       │   │   ├── __init__.py
│       │   │   ├── baselightningmodule.py
│       │   │   ├── components/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── decoder.py
│       │   │   │   ├── flow_matching.py
│       │   │   │   ├── text_encoder.py
│       │   │   │   └── transformer.py
│       │   │   └── matcha_tts.py
│       │   ├── onnx/
│       │   │   ├── __init__.py
│       │   │   ├── export.py
│       │   │   └── infer.py
│       │   ├── text/
│       │   │   ├── __init__.py
│       │   │   ├── cleaners.py
│       │   │   ├── numbers.py
│       │   │   └── symbols.py
│       │   ├── train.py
│       │   └── utils/
│       │       ├── __init__.py
│       │       ├── audio.py
│       │       ├── generate_data_statistics.py
│       │       ├── instantiators.py
│       │       ├── logging_utils.py
│       │       ├── model.py
│       │       ├── monotonic_align/
│       │       │   ├── __init__.py
│       │       │   ├── core.c
│       │       │   ├── core.pyx
│       │       │   └── setup.py
│       │       ├── pylogger.py
│       │       ├── rich_utils.py
│       │       └── utils.py
│       ├── matcha_tts.egg-info/
│       │   ├── PKG-INFO
│       │   ├── SOURCES.txt
│       │   ├── dependency_links.txt
│       │   ├── entry_points.txt
│       │   ├── requires.txt
│       │   └── top_level.txt
│       ├── notebooks/
│       │   └── .gitkeep
│       ├── pyproject.toml
│       ├── requirements.txt
│       ├── scripts/
│       │   └── schedule.sh
│       ├── setup.py
│       └── synthesis.ipynb
└── utils/
    └── word_utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
.gitattributes


# CI
.codeclimate.yml
.travis.yml
.taskcluster.yml

# Docker
docker-compose.yml
Dockerfile
.docker
.dockerignore

# Byte-compiled / optimized / DLL files
**/__pycache__/
**/*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Virtual environment
.env
.venv/
venv/

# PyCharm
.idea

# Python mode for VIM
.ropeproject
**/.ropeproject

# Vim swap files
**/*.swp

# VS Code
.vscode/


================================================
FILE: Dockerfile
================================================
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
WORKDIR /breezyvoice

ENV UV_LINK_MODE=copy
ENV PATH="/root/.local/bin/:$PATH"

ADD https://astral.sh/uv/install.sh /uv-installer.sh

RUN apt-get update && \
    apt-get install -y --no-install-recommends curl ca-certificates ffmpeg && \
    sh /uv-installer.sh && rm /uv-installer.sh && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    uv venv -p 3.10

COPY requirements.txt /breezyvoice/requirements.txt

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -r requirements.txt --index-strategy unsafe-best-match

COPY . .

EXPOSE 8080

ENTRYPOINT ["/breezyvoice/.venv/bin/python"]


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# BreezyVoice

BreezyVoice is a voice-cloning text-to-speech system specifically adapted for Taiwanese Mandarin, highlighting phonetic control abilities via auxiliary 注音 (bopomofo) inputs. It is partially derived from [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) and is part of the [Breeze2 family](https://huggingface.co/collections/MediaTek-Research/breeze2-family-67863158443a06a72dd29900).

<img src="https://raw.githubusercontent.com/mtkresearch/BreezyVoice/main/images/flowchart.png" alt="flowchart" width="700"/>

🚀 **Try out our interactive [UI playground](https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground) now!** 🚀 

🚀 **[立即體驗 BreezyVoice 語音合成](https://huggingface.co/spaces/Splend1dchan/BreezyVoice-Playground) !** 🚀 

Or visit one of these resources:  
- [Playground (CLI Inference)](https://www.kaggle.com/code/a24998667/breezyvoice-playground)  
- [Model](https://huggingface.co/MediaTek-Research/BreezyVoice/tree/main)  
- [Paper](https://arxiv.org/abs/2501.17790) 


Repo Main Contributors: Chia-Chun Lin, Chan-Jan Hsu

## Features
🔥 BreezyVoice outperforms competing commercial services in terms of naturalness.



<img src="https://raw.githubusercontent.com/mtkresearch/BreezyVoice/main/images/comparisons.png" alt="comparisons" width="350"/>

🔥 BreezyVoice is highly competitive in code-switching scenarios.

| Code-Switching Term Category        | **BreezyVoice**  | Z | Y | U | M |
|-------------|--------------|---|---|---|---|
| **General Words** | **8**            | 5 | **8** | **8** | 7 |
| **Entities**| **9**         | 6 | 4 | 7 | 4 |
| **Abbreviations**   | **9**            | 8 | 6 | 6 | 7 |
| **Toponyms**| 3            | 3 | **7** | 3 | 4 |
| **Full Sentences**| 7           | 7 | **8** | 5 | 3 |

🔥 BreezyVoice supports automatic 注音 annotation, as well as manual 注音 correction (See Inference).


## Install

**Clone and install**

- Clone the repo
``` sh
git clone https://github.com/mtkresearch/BreezyVoice.git
# If the submodule fails to clone due to network errors, rerun the command until it succeeds
cd BreezyVoice
```

- Install requirements (requires Python 3.10)
```
pip uninstall onnxruntime # use onnxruntime-gpu instead of onnxruntime
pip install -r requirements.txt
```
(The model can also run on CPU; if you do not have a GPU, change `onnxruntime-gpu` to `onnxruntime` in `requirements.txt`.)

Depending on your CUDA version, you may also need to install cuDNN:
```
sudo apt-get -y install cudnn9-cuda-11
```
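
To confirm that the GPU build of ONNX Runtime is actually being picked up, a quick sanity check (not part of the repository) is to list the available execution providers:

```python
import onnxruntime

# On a working GPU setup this list should include "CUDAExecutionProvider";
# a CPU-only install reports only "CPUExecutionProvider".
print(onnxruntime.get_available_providers())
```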
## Inference

UTF-8 encoding is required:

``` sh
export PYTHONUTF8=1
```
---
**Run single_inference.py with the following arguments:**

- `--content_to_synthesize`:
    - **Description**: Specifies the content that will be synthesized into speech. Phonetic symbols can optionally be included but should be used sparingly, as shown in the examples below:
    - Simple text: `"今天天氣真好"`
    - Text with phonetic symbols: `"今天天氣真好[:ㄏㄠ3]"`

- `--speaker_prompt_audio_path`:
  - **Description**: Specifies the path to the prompt speech audio file for setting the style of the speaker. Use your custom audio file or our example file:
    - Example audio: `./data/tc_speaker.wav`

- `--speaker_prompt_text_transcription` (optional):
  - **Description**: Specifies the transcription of the speaker prompt audio. Providing this input is highly recommended for better accuracy. If not provided, the system will automatically transcribe the audio using Whisper.
  - Example text for the audio file: `"在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程才能將密文還原為正常可讀的內容。"`

- `--output_path` (optional):
  - **Description**: Specifies the name and path for the output `.wav` file. If not provided, the default path is used.
  - **Default Value**: `results/output.wav`
  - Example: `[your_file_name].wav`

- `--model_path` (optional):
  - **Description**: Specifies the pre-trained model used for speech synthesis.
  - **Default Value**: `MediaTek-Research/BreezyVoice`

**Example Usage:**

``` bash
bash run_single_inference.sh
```

``` bash
# python single_inference.py --content_to_synthesize [text to be converted into audio] --speaker_prompt_text_transcription [transcription of the prompt audio] --speaker_prompt_audio_path [reference audio file]
python single_inference.py --content_to_synthesize "今天天氣真好" --speaker_prompt_text_transcription "在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程才能將密文還原為正常可讀的內容。" --speaker_prompt_audio_path "./data/example.wav"
```

``` bash
# python single_inference.py --content_to_synthesize [text to be converted into audio, optionally with phonetic symbols] --speaker_prompt_audio_path [reference audio file]
python single_inference.py --content_to_synthesize "今天天氣真好[:ㄏㄠ3]" --speaker_prompt_audio_path "./data/example.wav"
```

---

**Run `batch_inference.py` with the following arguments:**

- `--csv_file`:
  - **Description**: Path to the CSV file that contains the input data for batch processing.
  - **Example**: `./data/batch_files.csv`

- `--speaker_prompt_audio_folder`:
  - **Description**: Path to the folder containing the speaker prompt audio files. The files in this folder are used to set the style of the speaker for each synthesis task.
  - **Example**: `./data`

- `--output_audio_folder`:
  - **Description**: Path to the folder where the output audio files will be saved. Each processed row in the CSV will result in a synthesized audio file stored in this folder.
  - **Example**: `./results`

**CSV File Structure:**

The CSV file should contain the following columns:

- **`speaker_prompt_audio_filename`**:
  - **Description**: The filename (without extension) of the speaker prompt audio file that will be used to guide the style of the generated speech.
  - **Example**: `example`

- **`speaker_prompt_text_transcription`**:
  - **Description**: The transcription of the speaker prompt audio. This field is optional but highly recommended, since automatic transcription may introduce errors. If not provided, the system will attempt to transcribe the audio using Whisper.
  - **Example**: `"在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。"`

- **`content_to_synthesize`**:
  - **Description**: The content that will be synthesized into speech. You can include phonetic symbols if needed, though they should be used sparingly.
  - **Example**: `"今天天氣真好"`

- **`output_audio_filename`**:
  - **Description**: The filename (without extension) for the generated output audio. The audio will be saved as a `.wav` file in the output folder.
  - **Example**: `output`
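
Putting the columns together, one row of such a CSV might look like the following. This is only an illustration built from the example values above; the bundled `./data/batch_files.csv` may differ:

```csv
speaker_prompt_audio_filename,speaker_prompt_text_transcription,content_to_synthesize,output_audio_filename
example,"在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。",今天天氣真好,output
```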

**Example Usage:**

``` bash
bash run_batch_inference.sh
```
```bash
python batch_inference.py \
  --csv_file ./data/batch_files.csv \
  --speaker_prompt_audio_folder ./data \
  --output_audio_folder ./results
```
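
Internally, `batch_inference.py` loads the model objects once and then calls the helpers exported by `single_inference.py` for every CSV row. A minimal programmatic sketch along the same lines (argument order mirrors the call inside `batch_inference.py`; treat this as an illustration, not a guaranteed API):

```python
from g2pw import G2PWConverter

from single_inference import CustomCosyVoice, single_inference

# Load the synthesis model and the bopomofo (注音) converter once, then reuse them.
cosyvoice = CustomCosyVoice("MediaTek-Research/BreezyVoice")
bopomofo_converter = G2PWConverter()

# Arguments, in order: speaker prompt wav, content to synthesize, output wav path,
# model, converter, and the prompt transcription (recommended).
single_inference(
    "./data/example.wav",
    "今天天氣真好",
    "./results/output.wav",
    cosyvoice,
    bopomofo_converter,
    "在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。",
)
```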

### Docker and OpenAI Compatible API

``` bash
$ docker compose up -d --build
# after the container is up
$ pip install openai
$ python openai_api_inference.py
```
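
`openai_api_inference.py` talks to the container through the `openai` client installed above. The sketch below issues the same kind of request with plain `requests` instead; it assumes the service is reachable on `localhost:8080` (as in `compose.yaml`) and that the `/v1` prefix declared via `root_path` in `api.py` is part of the public URL (drop it if your deployment serves the routes without the prefix):

```python
import requests

# POST a synthesis request to the OpenAI-style speech endpoint served by api.py.
# Only "input" is required; the speaker prompt is configured server-side (see Settings in api.py).
response = requests.post(
    "http://localhost:8080/v1/audio/speech",
    json={"input": "今天天氣真好"},
    timeout=300,
)
response.raise_for_status()

# The endpoint returns a WAV stream; write it to disk.
with open("output.wav", "wb") as f:
    f.write(response.content)
```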

---

If you like our work, please cite:

```
@article{hsu2025breezyvoice,
  title={BreezyVoice: Adapting TTS for Taiwanese Mandarin with Enhanced Polyphone Disambiguation--Challenges and Insights},
  author={Hsu, Chan-Jan and Lin, Yi-Cheng and Lin, Chia-Chun and Chen, Wei-Chih and Chung, Ho Lam and Li, Chen-An and Chen, Yi-Chang and Yu, Chien-Yu and Lee, Ming-Ji and Chen, Chien-Cheng and others},
  journal={arXiv preprint arXiv:2501.17790},
  year={2025}
}
@article{hsu2025breeze,
  title={The Breeze 2 Herd of Models: Traditional Chinese LLMs Based on Llama with Vision-Aware and Function-Calling Capabilities},
  author={Hsu, Chan-Jan and Liu, Chia-Sheng and Chen, Meng-Hsi and Chen, Muxi and Hsu, Po-Chun and Chen, Yi-Chang and Shiu, Da-Shan},
  journal={arXiv preprint arXiv:2501.13921},
  year={2025}
}
@article{du2024cosyvoice,
  title={Cosyvoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens},
  author={Du, Zhihao and Chen, Qian and Zhang, Shiliang and Hu, Kai and Lu, Heng and Yang, Yexin and Hu, Hangrui and Zheng, Siqi and Gu, Yue and Ma, Ziyang and others},
  journal={arXiv preprint arXiv:2407.05407},
  year={2024}
}
```


================================================
FILE: api.py
================================================
# OpenAI API Spec. Reference: https://platform.openai.com/docs/api-reference/audio/createSpeech

from contextlib import asynccontextmanager
from io import BytesIO

import torchaudio
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from g2pw import G2PWConverter
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings

from cosyvoice.utils.file_utils import load_wav
from single_inference import CustomCosyVoice, get_bopomofo_rare


class Settings(BaseSettings):
    api_key: str = Field(
        default="", description="Specifies the API key used to authenticate the user."
    )

    model_path: str = Field(
        default="MediaTek-Research/BreezyVoice",
        description="Specifies the model used for speech synthesis.",
    )
    speaker_prompt_audio_path: str = Field(
        default="./data/example.wav",
        description="Specifies the path to the prompt speech audio file of the speaker.",
    )
    speaker_prompt_text_transcription: str = Field(
        default="在密碼學中,加密是將明文資訊改變為難以讀取的密文內容,使之不可讀的方法。只有擁有解密方法的對象,經由解密過程,才能將密文還原為正常可讀的內容。",
        description="Specifies the transcription of the speaker prompt audio.",
    )


class SpeechRequest(BaseModel):
    model: str = ""
    input: str = Field(
        description="The content that will be synthesized into speech. You can include phonetic symbols if needed, though they should be used sparingly.",
        examples=["今天天氣真好"],
    )
    response_format: str = ""
    speed: float = 1.0


@asynccontextmanager
async def lifespan(app: FastAPI):
    app.state.settings = Settings()
    app.state.cosyvoice = CustomCosyVoice(app.state.settings.model_path)
    app.state.bopomofo_converter = G2PWConverter()
    app.state.prompt_speech_16k = load_wav(
        app.state.settings.speaker_prompt_audio_path, 16000
    )
    yield
    del app.state.cosyvoice
    del app.state.bopomofo_converter


app = FastAPI(lifespan=lifespan, root_path="/v1")


@app.get("/models")
async def get_models(request: Request):
    return {
        "object": "list",
        "data": [
            {
                "id": request.app.state.settings.model_path,
                "object": "model",
                "created": 0,
                "owned_by": "local",
            }
        ],
    }


@app.post("/audio/speech")
async def speach_endpoint(request: Request, payload: SpeechRequest):
    # normalization
    speaker_prompt_text_transcription = (
        request.app.state.cosyvoice.frontend.text_normalize_new(
            request.app.state.settings.speaker_prompt_text_transcription, split=False
        )
    )
    content_to_synthesize = request.app.state.cosyvoice.frontend.text_normalize_new(
        payload.input, split=False
    )
    speaker_prompt_text_transcription_bopomo = get_bopomofo_rare(
        speaker_prompt_text_transcription, request.app.state.bopomofo_converter
    )

    content_to_synthesize_bopomo = get_bopomofo_rare(
        content_to_synthesize, request.app.state.bopomofo_converter
    )
    output = request.app.state.cosyvoice.inference_zero_shot_no_normalize(
        content_to_synthesize_bopomo,
        speaker_prompt_text_transcription_bopomo,
        request.app.state.prompt_speech_16k,
    )
    audio_buffer = BytesIO()
    torchaudio.save(audio_buffer, output["tts_speech"], 22050, format="wav")
    audio_buffer.seek(0)
    return StreamingResponse(
        audio_buffer,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=output.wav"},
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("api:app", host="0.0.0.0", port=8080)


================================================
FILE: batch_inference.py
================================================
import os
import time
import subprocess
import argparse
import pandas as pd
from datasets import Dataset
from single_inference import single_inference, CustomCosyVoice
from g2pw import G2PWConverter


def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_folder, model):
    # Load CSV with pandas
    data = pd.read_csv(csv_file)

    # Transform pandas DataFrame to HuggingFace Dataset
    dataset = Dataset.from_pandas(data)
    dataset = dataset.shuffle(seed = int(time.time()*1000))

    cosyvoice, bopomofo_converter = model

    def gen_audio(row):
        speaker_prompt_audio_path = os.path.join(speaker_prompt_audio_folder, f"{row['speaker_prompt_audio_filename']}.wav")
        speaker_prompt_text_transcription = row['speaker_prompt_text_transcription']
        content_to_synthesize = row['content_to_synthesize']
        output_audio_path = os.path.join(output_audio_folder, f"{row['output_audio_filename']}.wav")

        if not os.path.exists(speaker_prompt_audio_path):
            print(f"File {speaker_prompt_audio_path} does not exist")
            return row #{"status": "failed", "reason": "file not found"}
        if not os.path.exists(output_audio_path):
            single_inference(speaker_prompt_audio_path, content_to_synthesize, output_audio_path, cosyvoice, bopomofo_converter, speaker_prompt_text_transcription)
        else:
            pass
        # command = [
        #     "python", "single_inference.py",
        #     "--speaker_prompt_audio_path", speaker_prompt_audio_path,
        #     "--speaker_prompt_text_transcription", speaker_prompt_text_transcription,
        #     "--content_to_synthesize", content_to_synthesize,
        #     "--output_path", output_audio_path
        # ]

        # try:
        #     print(f"Processing: {speaker_prompt_audio_path}")
        #     subprocess.run(command, check=True)
        #     print(f"Generated: {output_audio_path}")
        #     return row #{"status": "success", "output": gen_voice_file_name}
        # except subprocess.CalledProcessError as e:
        #     print(f"Failed to generate {speaker_prompt_audio_path}, error: {e}")
        #     return row #{"status": "failed", "reason": str(e)}

    dataset = dataset.map(gen_audio, num_proc = 1)

def main():
    parser = argparse.ArgumentParser(description="Batch process audio generation.")
    parser.add_argument("--csv_file", required=True, help="Path to the CSV file containing input data.")
    parser.add_argument("--speaker_prompt_audio_folder", required=True, help="Path to the folder containing speaker prompt audio files.")
    parser.add_argument("--output_audio_folder", required=True, help="Path to the folder where results will be stored.")
    parser.add_argument("--model_path", type=str, required=False, default = "MediaTek-Research/BreezyVoice-300M",help="Specifies the model used for speech synthesis.")

    args = parser.parse_args()

    cosyvoice = CustomCosyVoice(args.model_path)
    bopomofo_converter = G2PWConverter()

    os.makedirs(args.output_audio_folder, exist_ok=True)

    process_batch(
        csv_file=args.csv_file,
        speaker_prompt_audio_folder=args.speaker_prompt_audio_folder,
        output_audio_folder=args.output_audio_folder,
        model = (cosyvoice, bopomofo_converter),

    )

if __name__ == "__main__":
    main()



================================================
FILE: compose.yaml
================================================
services:
  app:
    image: breezyvoice:latest
    build: .
    ports:
      - "8080:8080"
    volumes:
      - hf-cache:/root/.cache/huggingface/
    command: api.py
    init: true
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
volumes:
  hf-cache:

================================================
FILE: cosyvoice/__init__.py
================================================


================================================
FILE: cosyvoice/bin/inference.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import argparse
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
import os

import torch
from torch.utils.data import DataLoader
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from tqdm import tqdm
from cosyvoice.cli.model import CosyVoiceModel

from cosyvoice.dataset.dataset import Dataset

def get_args():
    parser = argparse.ArgumentParser(description='inference with your model')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--prompt_data', required=True, help='prompt data file')
    parser.add_argument('--prompt_utt2data', required=True, help='prompt data file')
    parser.add_argument('--tts_text', required=True, help='tts input file')
    parser.add_argument('--llm_model', required=True, help='llm model file')
    parser.add_argument('--flow_model', required=True, help='flow model file')
    parser.add_argument('--hifigan_model', required=True, help='hifigan model file')
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='gpu id for this rank, -1 for cpu')
    parser.add_argument('--mode',
                        default='sft',
                        choices=['sft', 'zero_shot'],
                        help='inference mode')
    parser.add_argument('--result_dir', required=True, help='asr result file')
    args = parser.parse_args()
    print(args)
    return args


def main():
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # Init cosyvoice models from configs
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    with open(args.config, 'r') as f:
        configs = load_hyperpyyaml(f)

    model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
    model.load(args.llm_model, args.flow_model, args.hifigan_model)

    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=False, partition=False, tts_file=args.tts_text, prompt_utt2data=args.prompt_utt2data)
    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)

    del configs
    os.makedirs(args.result_dir, exist_ok=True)
    fn = os.path.join(args.result_dir, 'wav.scp')
    f = open(fn, 'w')
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(test_data_loader)):
            utts = batch["utts"]
            assert len(utts) == 1, "inference mode only support batchsize 1"
            text = batch["text"]
            text_token = batch["text_token"].to(device)
            text_token_len = batch["text_token_len"].to(device)
            tts_text = batch["tts_text"]
            tts_index = batch["tts_index"]
            tts_text_token = batch["tts_text_token"].to(device)
            tts_text_token_len = batch["tts_text_token_len"].to(device)
            speech_token = batch["speech_token"].to(device)
            speech_token_len = batch["speech_token_len"].to(device)
            speech_feat = batch["speech_feat"].to(device)
            speech_feat_len = batch["speech_feat_len"].to(device)
            utt_embedding = batch["utt_embedding"].to(device)
            spk_embedding = batch["spk_embedding"].to(device)
            if args.mode == 'sft':
                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                               'llm_embedding': spk_embedding, 'flow_embedding': spk_embedding}
            else:
                model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                               'prompt_text': text_token, 'prompt_text_len': text_token_len,
                               'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                               'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                               'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                               'llm_embedding': utt_embedding, 'flow_embedding': utt_embedding}
            model_output = model.inference(**model_input)
            tts_key = '{}_{}'.format(utts[0], tts_index[0])
            tts_fn = os.path.join(args.result_dir, '{}.wav'.format(tts_key))
            torchaudio.save(tts_fn, model_output['tts_speech'], sample_rate=22050)
            f.write('{} {}\n'.format(tts_key, tts_fn))
            f.flush()
    f.close()
    logging.info('Result wav.scp saved in {}'.format(fn))


if __name__ == '__main__':
    main()


================================================
FILE: cosyvoice/bin/train.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import argparse
import datetime
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from copy import deepcopy
import torch
import torch.distributed as dist
import deepspeed

from hyperpyyaml import load_hyperpyyaml

from torch.distributed.elastic.multiprocessing.errors import record

from cosyvoice.utils.executor import Executor
from cosyvoice.utils.train_utils import (
    init_distributed,
    init_dataset_and_dataloader,
    init_optimizer_and_scheduler,
    init_summarywriter, save_model,
    wrap_cuda_model, check_modify_and_save_config)


def get_args():
    parser = argparse.ArgumentParser(description='training your network')
    parser.add_argument('--train_engine',
                        default='torch_ddp',
                        choices=['torch_ddp', 'deepspeed'],
                        help='Engine for paralleled training')
    parser.add_argument('--model', required=True, help='model which will be trained')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--train_data', required=True, help='train data file')
    parser.add_argument('--cv_data', required=True, help='cv data file')
    parser.add_argument('--checkpoint', help='checkpoint model')
    parser.add_argument('--model_dir', required=True, help='save model dir')
    parser.add_argument('--tensorboard_dir',
                        default='tensorboard',
                        help='tensorboard log dir')
    parser.add_argument('--ddp.dist_backend',
                        dest='dist_backend',
                        default='nccl',
                        choices=['nccl', 'gloo'],
                        help='distributed backend')
    parser.add_argument('--num_workers',
                        default=0,
                        type=int,
                        help='num of subprocess workers for reading')
    parser.add_argument('--prefetch',
                        default=100,
                        type=int,
                        help='prefetch number')
    parser.add_argument('--pin_memory',
                        action='store_true',
                        default=False,
                        help='Use pinned memory buffers used for reading')
    parser.add_argument('--deepspeed.save_states',
                        dest='save_states',
                        default='model_only',
                        choices=['model_only', 'model+optimizer'],
                        help='save model/optimizer states')
    parser.add_argument('--timeout',
                        default=30,
                        type=int,
                        help='timeout (in seconds) of cosyvoice_join.')
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args()
    return args


@record
def main():
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
    with open(args.config, 'r') as f:
        configs = load_hyperpyyaml(f, overrides=override_dict)
    configs['train_conf'].update(vars(args))

    # Init env for ddp
    init_distributed(args)

    # Get dataset & dataloader
    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
        init_dataset_and_dataloader(args, configs)

    # Do some sanity checks and save config to args.model_dir
    configs = check_modify_and_save_config(args, configs)

    # Tensorboard summary
    writer = init_summarywriter(args)

    # load checkpoint
    model = configs[args.model]
    if args.checkpoint is not None:
        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))

    # Dispatch model from cpu to gpu
    model = wrap_cuda_model(args, model)

    # Get optimizer & scheduler
    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)

    # Save init checkpoints
    info_dict = deepcopy(configs['train_conf'])
    save_model(model, 'init', info_dict)

    # Get executor
    executor = Executor()

    # Start training loop
    for epoch in range(info_dict['max_epoch']):
        executor.epoch = epoch
        train_dataset.set_epoch(epoch)
        dist.barrier()
        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
        dist.destroy_process_group(group_join)

if __name__ == '__main__':
    main()


================================================
FILE: cosyvoice/cli/__init__.py
================================================


================================================
FILE: cosyvoice/cli/cosyvoice.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from hyperpyyaml import load_hyperpyyaml
from huggingface_hub import snapshot_download
from cosyvoice.cli.frontend import CosyVoiceFrontEnd
from cosyvoice.cli.model import CosyVoiceModel

class CosyVoice:

    def __init__(self, model_dir):
        instruct = True if '-Instruct' in model_dir else False
        self.model_dir = model_dir
        if not os.path.exists(model_dir):
            model_dir = snapshot_download(model_dir)
        with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
            configs = load_hyperpyyaml(f)
        self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                          configs['feat_extractor'],
                                          '{}/campplus.onnx'.format(model_dir),
                                          '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                          '{}/spk2info.pt'.format(model_dir),
                                          instruct,
                                          configs['allowed_special'])
        self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
        self.model.load('{}/llm.pt'.format(model_dir),
                        '{}/flow.pt'.format(model_dir),
                        '{}/hift.pt'.format(model_dir))
        del configs

    def list_avaliable_spks(self):
        spks = list(self.frontend.spk2info.keys())
        return spks

    def inference_sft(self, tts_text, spk_id):
        tts_speeches = []
        for i in self.frontend.text_normalize(tts_text, split=True):
            model_input = self.frontend.frontend_sft(i, spk_id)
            model_output = self.model.inference(**model_input)
            tts_speeches.append(model_output['tts_speech'])
        return {'tts_speech': torch.concat(tts_speeches, dim=1)}

    def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
        prompt_text = self.frontend.text_normalize(prompt_text, split=False)
        tts_speeches = []
        for i in self.frontend.text_normalize(tts_text, split=True):
            model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_speech_16k)
            model_output = self.model.inference(**model_input)
            tts_speeches.append(model_output['tts_speech'])
        return {'tts_speech': torch.concat(tts_speeches, dim=1)}

    def inference_cross_lingual(self, tts_text, prompt_speech_16k):
        if self.frontend.instruct is True:
            raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
        tts_speeches = []
        for i in self.frontend.text_normalize(tts_text, split=True):
            model_input = self.frontend.frontend_cross_lingual(i, prompt_speech_16k)
            model_output = self.model.inference(**model_input)
            tts_speeches.append(model_output['tts_speech'])
        return {'tts_speech': torch.concat(tts_speeches, dim=1)}

    def inference_instruct(self, tts_text, spk_id, instruct_text):
        if self.frontend.instruct is False:
            raise ValueError('{} do not support instruct inference'.format(self.model_dir))
        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
        tts_speeches = []
        for i in self.frontend.text_normalize(tts_text, split=True):
            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
            model_output = self.model.inference(**model_input)
            tts_speeches.append(model_output['tts_speech'])
        return {'tts_speech': torch.concat(tts_speeches, dim=1)}


================================================
FILE: cosyvoice/cli/frontend.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import onnxruntime
import torch
import numpy as np
import whisper
from typing import Callable
import torchaudio.compliance.kaldi as kaldi
import torchaudio
import os
import re
import inflect
try:
    import ttsfrd
    use_ttsfrd = True
except ImportError:
    print("failed to import ttsfrd, use WeTextProcessing instead")
    from tn.chinese.normalizer import Normalizer as ZhNormalizer
    from tn.english.normalizer import Normalizer as EnNormalizer
    use_ttsfrd = False
from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph


class CosyVoiceFrontEnd:

    def __init__(self,
                 get_tokenizer: Callable,
                 feat_extractor: Callable,
                 model_dir: str,
                 campplus_model: str,
                 speech_tokenizer_model: str,
                 spk2info: str = '',
                 instruct: bool = False,
                 allowed_special: str = 'all'):
        self.tokenizer = get_tokenizer()
        self.feat_extractor = feat_extractor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        option = onnxruntime.SessionOptions()
        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        option.intra_op_num_threads = 1
        self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
        self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, providers=["CUDAExecutionProvider"if torch.cuda.is_available() else "CPUExecutionProvider"])
        if os.path.exists(spk2info):
            self.spk2info = torch.load(spk2info, map_location=self.device)
        self.instruct = instruct
        self.allowed_special = allowed_special
        self.inflect_parser = inflect.engine()
        self.use_ttsfrd = use_ttsfrd
        if self.use_ttsfrd:
            self.frd = ttsfrd.TtsFrontendEngine()
            ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
            assert self.frd.initialize('{}/CosyVoice-ttsfrd/resource'.format(model_dir)) is True, 'failed to initialize ttsfrd resource'
            self.frd.set_lang_type('pinyin')
            self.frd.enable_pinyin_mix(True)
            self.frd.set_breakmodel_index(1)
        else:
            self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
            self.en_tn_model = EnNormalizer()

    def _extract_text_token(self, text):
        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
        return text_token, text_token_len

    def _extract_speech_token(self, speech):
        feat = whisper.log_mel_spectrogram(speech, n_mels=128)
        speech_token = self.speech_tokenizer_session.run(None, {self.speech_tokenizer_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
                                                                self.speech_tokenizer_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
        speech_token = torch.tensor([speech_token], dtype=torch.int32).to(self.device)
        speech_token_len = torch.tensor([speech_token.shape[1]], dtype=torch.int32).to(self.device)
        return speech_token, speech_token_len

    def _extract_spk_embedding(self, speech):
        feat = kaldi.fbank(speech,
                           num_mel_bins=80,
                           dither=0,
                           sample_frequency=16000)
        feat = feat - feat.mean(dim=0, keepdim=True)
        embedding = self.campplus_session.run(None, {self.campplus_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
        embedding = torch.tensor([embedding]).to(self.device)
        return embedding

    def _extract_speech_feat(self, speech):
        speech_feat = self.feat_extractor(speech).squeeze(dim=0).transpose(0, 1).to(self.device)
        speech_feat = speech_feat.unsqueeze(dim=0)
        speech_feat_len = torch.tensor([speech_feat.shape[1]], dtype=torch.int32).to(self.device)
        return speech_feat, speech_feat_len

    def text_normalize(self, text, split=True):
        text = text.strip()
        if contains_chinese(text):
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.zh_tn_model.normalize(text)
            text = text.replace("\n", "")
            text = replace_blank(text)
            text = replace_corner_mark(text)
            text = text.replace(".", "、")
            text = text.replace(" - ", ",")
            text = remove_bracket(text)
            text = re.sub(r'[,,]+$', '。', text)
            texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
                                                token_min_n=60, merge_len=20,
                                                comma_split=False)]
        else:
            if self.use_ttsfrd:
                text = self.frd.get_frd_extra_info(text, 'input')
            else:
                text = self.en_tn_model.normalize(text)
            text = spell_out_number(text, self.inflect_parser)
            texts = [i for i in split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
                                                token_min_n=60, merge_len=20,
                                                comma_split=False)]
        if split is False:
            return text
        return texts

    def frontend_sft(self, tts_text, spk_id):
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        embedding = self.spk2info[spk_id]['embedding']
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len, 'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
        tts_text_token, tts_text_token_len = self._extract_text_token(tts_text)
        prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
        prompt_speech_22050 = torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050)(prompt_speech_16k)
        speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_22050)
        speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
        embedding = self._extract_spk_embedding(prompt_speech_16k)
        model_input = {'text': tts_text_token, 'text_len': tts_text_token_len,
                       'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
                       'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
                       'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
                       'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
                       'llm_embedding': embedding, 'flow_embedding': embedding}
        return model_input

    def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
        model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k)
        # in cross lingual mode, we remove prompt in llm
        del model_input['prompt_text']
        del model_input['prompt_text_len']
        del model_input['llm_prompt_speech_token']
        del model_input['llm_prompt_speech_token_len']
        return model_input

    def frontend_instruct(self, tts_text, spk_id, instruct_text):
        model_input = self.frontend_sft(tts_text, spk_id)
        # in instruct mode, we remove spk_embedding in llm due to information leakage
        del model_input['llm_embedding']
        instruct_text_token, instruct_text_token_len = self._extract_text_token(instruct_text + '<endofprompt>')
        model_input['prompt_text'] = instruct_text_token
        model_input['prompt_text_len'] = instruct_text_token_len
        return model_input


================================================
FILE: cosyvoice/cli/model.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

class CosyVoiceModel:

    def __init__(self,
                 llm: torch.nn.Module,
                 flow: torch.nn.Module,
                 hift: torch.nn.Module):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.llm = llm
        self.flow = flow
        self.hift = hift

    def load(self, llm_model, flow_model, hift_model):
        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
        self.llm.to(self.device).eval()
        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
        self.flow.to(self.device).eval()
        self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
        self.hift.to(self.device).eval()

    def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
                  prompt_text=torch.zeros(1, 0, dtype=torch.int32), prompt_text_len=torch.zeros(1, dtype=torch.int32),
                  llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                  flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                  prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
        tts_speech_token = self.llm.inference(text=text.to(self.device),
                                              text_len=text_len.to(self.device),
                                              prompt_text=prompt_text.to(self.device),
                                              prompt_text_len=prompt_text_len.to(self.device),
                                              prompt_speech_token=llm_prompt_speech_token.to(self.device),
                                              prompt_speech_token_len=llm_prompt_speech_token_len.to(self.device),
                                              embedding=llm_embedding.to(self.device),
                                              beam_size=1,
                                              sampling=25,
                                              max_token_text_ratio=30,
                                              min_token_text_ratio=3)
        tts_mel = self.flow.inference(token=tts_speech_token,
                                      token_len=torch.tensor([tts_speech_token.size(1)], dtype=torch.int32).to(self.device),
                                      prompt_token=flow_prompt_speech_token.to(self.device),
                                      prompt_token_len=flow_prompt_speech_token_len.to(self.device),
                                      prompt_feat=prompt_speech_feat.to(self.device),
                                      prompt_feat_len=prompt_speech_feat_len.to(self.device),
                                      embedding=flow_embedding.to(self.device))
        tts_speech = self.hift.inference(mel=tts_mel).cpu()
        torch.cuda.empty_cache()
        return {'tts_speech': tts_speech}
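
# A minimal usage sketch (not part of the repository): the frontend methods in
# cosyvoice/cli/frontend.py return a dict whose keys line up with
# CosyVoiceModel.inference's keyword arguments, so the dict can be splatted
# straight into the call. The construction of `frontend` and `model`, the prompt
# wav path, and the use of torchaudio here are illustrative assumptions.
#
#     prompt_speech_16k, _ = torchaudio.load('prompt.wav')          # 16 kHz mono prompt
#     model_input = frontend.frontend_zero_shot('你好', '提示文字', prompt_speech_16k)
#     output = model.inference(**model_input)
#     torchaudio.save('out.wav', output['tts_speech'], 22050)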


================================================
FILE: cosyvoice/dataset/__init__.py
================================================


================================================
FILE: cosyvoice/dataset/dataset.py
================================================
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#               2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import json
import math
from functools import partial

import torch
import torch.distributed as dist
from torch.utils.data import IterableDataset
from cosyvoice.utils.file_utils import read_lists, read_json_lists


class Processor(IterableDataset):

    def __init__(self, source, f, *args, **kw):
        assert callable(f)
        self.source = source
        self.f = f
        self.args = args
        self.kw = kw

    def set_epoch(self, epoch):
        self.source.set_epoch(epoch)

    def __iter__(self):
        """ Return an iterator over the source dataset processed by the
            given processor.
        """
        assert self.source is not None
        assert callable(self.f)
        return self.f(iter(self.source), *self.args, **self.kw)

    def apply(self, f):
        assert callable(f)
        return Processor(self, f, *self.args, **self.kw)


class DistributedSampler:

    def __init__(self, shuffle=True, partition=True):
        self.epoch = -1
        self.update()
        self.shuffle = shuffle
        self.partition = partition

    def update(self):
        assert dist.is_available()
        if dist.is_initialized():
            self.rank = dist.get_rank()
            self.world_size = dist.get_world_size()
        else:
            self.rank = 0
            self.world_size = 1
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            self.worker_id = 0
            self.num_workers = 1
        else:
            self.worker_id = worker_info.id
            self.num_workers = worker_info.num_workers
        return dict(rank=self.rank,
                    world_size=self.world_size,
                    worker_id=self.worker_id,
                    num_workers=self.num_workers)

    def set_epoch(self, epoch):
        self.epoch = epoch

    def sample(self, data):
        """ Sample data according to rank/world_size/num_workers

            Args:
                data(List): input data list

            Returns:
                List: data list after sampling
        """
        data = list(range(len(data)))
        # pad the data list so it can be split evenly across ranks
        if self.partition:
            if self.shuffle:
                random.Random(self.epoch).shuffle(data)
            if len(data) < self.world_size:
                data = data * math.ceil(self.world_size / len(data))
                data = data[:self.world_size]
            data = data[self.rank::self.world_size]
        if len(data) < self.num_workers:
            data = data * math.ceil(self.num_workers / len(data))
            data = data[:self.num_workers]
        data = data[self.worker_id::self.num_workers]
        return data
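
# A standalone sketch (not part of the repository) of the two-level slicing that
# sample() performs: each rank first takes a strided slice of the (shuffled)
# index list, then each dataloader worker takes a strided slice of its rank's
# share. The numbers below are illustrative assumptions.
#
#     def _partition(indexes, rank, world_size, worker_id, num_workers):
#         indexes = indexes[rank::world_size]          # per-rank share
#         return indexes[worker_id::num_workers]       # per-worker share
#
#     indexes = list(range(8))
#     print(_partition(indexes, rank=0, world_size=2, worker_id=1, num_workers=2))  # [2, 6]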


class DataList(IterableDataset):

    def __init__(self, lists, shuffle=True, partition=True):
        self.lists = lists
        self.sampler = DistributedSampler(shuffle, partition)

    def set_epoch(self, epoch):
        self.sampler.set_epoch(epoch)

    def __iter__(self):
        sampler_info = self.sampler.update()
        indexes = self.sampler.sample(self.lists)
        for index in indexes:
            data = dict(src=self.lists[index])
            data.update(sampler_info)
            yield data


def Dataset(data_list_file,
            data_pipeline,
            mode='train',
            shuffle=True,
            partition=True,
            tts_file='',
            prompt_utt2data=''):
    """ Construct dataset from arguments

        We have two shuffle stages in the Dataset. The first is a global
        shuffle at the shard tar/raw file level. The second is a shuffle
        at the training-sample level.

        Args:
            data_list_file(str): file listing the parquet/shard files to load
            data_pipeline(List[Callable]): processor functions applied in order
            mode(str): train/inference
            shuffle(bool): whether to shuffle the data list
            partition(bool): whether to do data partition in terms of rank
            tts_file(str): json file with tts texts (inference mode only)
            prompt_utt2data(str): utt-to-data-file mapping (inference mode only)
    """
    assert mode in ['train', 'inference']
    lists = read_lists(data_list_file)
    if mode == 'inference':
        with open(tts_file) as f:
            tts_data = json.load(f)
        utt2lists = read_json_lists(prompt_utt2data)
        # filter unnecessary file in inference mode
        lists = list(set([utt2lists[utt] for utt in tts_data.keys() if utt2lists[utt] in lists]))
    dataset = DataList(lists,
                       shuffle=shuffle,
                       partition=partition)
    if mode == 'inference':
        # map partial arg tts_data in inference mode
        data_pipeline[0] = partial(data_pipeline[0], tts_data=tts_data)
    for func in data_pipeline:
        dataset = Processor(dataset, func, mode=mode)
    return dataset
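
# A toy sketch (not part of the repository) of how the data_pipeline is composed:
# each pipeline entry is a function that takes an iterator (plus kwargs) and
# returns a generator, and Dataset() wraps them one after another with the
# Processor class above so samples stream lazily through every stage. The two
# stand-in functions below are illustrative, not repository processors.
#
#     def double(data, mode='train'):
#         for x in data:
#             yield x * 2
#
#     def keep_small(data, mode='train'):
#         for x in data:
#             if x < 10:
#                 yield x
#
#     stage = Processor(range(8), double, mode='train')
#     stage = Processor(stage, keep_small, mode='train')
#     print(list(stage))    # [0, 2, 4, 6, 8]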


================================================
FILE: cosyvoice/dataset/processor.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random

import pyarrow.parquet as pq
from io import BytesIO
import torch
import torchaudio
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

torchaudio.set_audio_backend('soundfile')

AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])


def parquet_opener(data, mode='train', tts_data={}):
    """ Give url or local file, return file descriptor
        Inplace operation.

        Args:
            data(Iterable[str]): url or local file list

        Returns:
            Iterable[{src, stream}]
    """
    for sample in data:
        assert 'src' in sample
        url = sample['src']
        try:
            df = pq.read_table(url).to_pandas()
            for i in range(len(df)):
                if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
                    continue
                sample.update(dict(df.loc[i]))
                if mode == 'train':
                    # NOTE do not return sample directly, must initialize a new dict
                    yield {**sample}
                else:
                    for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
                        yield {**sample, 'tts_index': index, 'tts_text': text}
        except Exception as ex:
            logging.warning('Failed to open {}, ex info {}'.format(url, ex))

def filter(data,
           max_length=10240,
           min_length=10,
           token_max_length=200,
           token_min_length=1,
           min_output_input_ratio=0.0005,
           max_output_input_ratio=1,
           mode='train'):
    """ Filter sample according to feature and label length
        Inplace operation.

        Args::
            data: Iterable[{key, wav, label, sample_rate}]
            max_length: drop utterance which is greater than max_length(10ms)
            min_length: drop utterance which is less than min_length(10ms)
            token_max_length: drop utterance which is greater than
                token_max_length, especially when use char unit for
                english modeling
            token_min_length: drop utterance which is
                less than token_max_length
            min_output_input_ratio: minimal ration of
                token_length / feats_length(10ms)
            max_output_input_ratio: maximum ration of
                token_length / feats_length(10ms)

        Returns:
            Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        sample['speech'], sample['sample_rate'] = torchaudio.load(BytesIO(sample['audio_data']))
        del sample['audio_data']
        # sample['speech'] is a torch.Tensor; length is counted in 10 ms frames (100 per second)
        num_frames = sample['speech'].size(1) / sample['sample_rate'] * 100
        if num_frames < min_length:
            continue
        if num_frames > max_length:
            continue
        if len(sample['text_token']) < token_min_length:
            continue
        if len(sample['text_token']) > token_max_length:
            continue
        if len(sample['speech_token']) == 0:
            continue
        if num_frames != 0:
            if len(sample['text_token']) / num_frames < min_output_input_ratio:
                continue
            if len(sample['text_token']) / num_frames > max_output_input_ratio:
                continue
        yield sample


def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
    """ Resample data.
        Inplace operation.

        Args:
            data: Iterable[{key, wav, label, sample_rate}]
            resample_rate: target resample rate

        Returns:
            Iterable[{key, wav, label, sample_rate}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        sample_rate = sample['sample_rate']
        waveform = sample['speech']
        if sample_rate != resample_rate:
            if sample_rate < min_sample_rate:
                continue
            sample['sample_rate'] = resample_rate
            sample['speech'] = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=resample_rate)(waveform)
        max_val = sample['speech'].abs().max()
        if max_val > 1:
            sample['speech'] /= max_val
        yield sample


def compute_fbank(data,
                  feat_extractor,
                  mode='train'):
    """ Extract fbank

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        assert 'sample_rate' in sample
        assert 'speech' in sample
        assert 'utt' in sample
        assert 'text_token' in sample
        waveform = sample['speech']
        mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
        sample['speech_feat'] = mat
        del sample['speech']
        yield sample


def parse_embedding(data, normalize, mode='train'):
    """ Parse utt_embedding/spk_embedding

        Args:
            data: Iterable[{key, wav, label, sample_rate}]

        Returns:
            Iterable[{key, feat, label}]
    """
    for sample in data:
        sample['utt_embedding'] = torch.tensor(sample['utt_embedding'], dtype=torch.float32)
        sample['spk_embedding'] = torch.tensor(sample['spk_embedding'], dtype=torch.float32)
        if normalize:
            sample['utt_embedding'] = F.normalize(sample['utt_embedding'], dim=0)
            sample['spk_embedding'] = F.normalize(sample['spk_embedding'], dim=0)
        yield sample


def tokenize(data, get_tokenizer, allowed_special, mode='train'):
    """ Decode text to chars or BPE
        Inplace operation

        Args:
            data: Iterable[{key, wav, txt, sample_rate}]

        Returns:
            Iterable[{key, wav, txt, tokens, label, sample_rate}]
    """
    tokenizer = get_tokenizer()
    for sample in data:
        assert 'text' in sample
        sample['text_token'] = tokenizer.encode(sample['text'], allowed_special=allowed_special)
        if mode == 'inference':
            sample['tts_text_token'] = tokenizer.encode(sample['tts_text'], allowed_special=allowed_special)
        yield sample


def shuffle(data, shuffle_size=10000, mode='train'):
    """ Local shuffle the data

        Args:
            data: Iterable[{key, feat, label}]
            shuffle_size: buffer size for shuffle

        Returns:
            Iterable[{key, feat, label}]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= shuffle_size:
            random.shuffle(buf)
            for x in buf:
                yield x
            buf = []
    # The sample left over
    random.shuffle(buf)
    for x in buf:
        yield x


def sort(data, sort_size=500, mode='train'):
    """ Sort the data by feature length.
        Sort is used after shuffle and before batch, so we can group
        utts with similar lengths into a batch, and `sort_size` should
        be less than `shuffle_size`

        Args:
            data: Iterable[{key, feat, label}]
            sort_size: buffer size for sort

        Returns:
            Iterable[{key, feat, label}]
    """

    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= sort_size:
            buf.sort(key=lambda x: x['speech_feat'].size(0))
            for x in buf:
                yield x
            buf = []
    # The sample left over
    buf.sort(key=lambda x: x['speech_feat'].size(0))
    for x in buf:
        yield x


def static_batch(data, batch_size=16):
    """ Static batch the data by `batch_size`

        Args:
            data: Iterable[{key, feat, label}]
            batch_size: batch size

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    buf = []
    for sample in data:
        buf.append(sample)
        if len(buf) >= batch_size:
            yield buf
            buf = []
    if len(buf) > 0:
        yield buf


def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
    """ Dynamic batch the data until the total frames in batch
        reach `max_frames_in_batch`

        Args:
            data: Iterable[{key, feat, label}]
            max_frames_in_batch: max_frames in one batch

        Returns:
            Iterable[List[{key, feat, label}]]
    """
    buf = []
    longest_frames = 0
    for sample in data:
        assert 'speech_feat' in sample
        assert isinstance(sample['speech_feat'], torch.Tensor)
        new_sample_frames = sample['speech_feat'].size(0)
        longest_frames = max(longest_frames, new_sample_frames)
        frames_after_padding = longest_frames * (len(buf) + 1)
        if frames_after_padding > max_frames_in_batch:
            yield buf
            buf = [sample]
            longest_frames = new_sample_frames
        else:
            buf.append(sample)
    if len(buf) > 0:
        yield buf
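
# A worked sketch (not part of the repository) of the dynamic batching rule above:
# a batch is emitted as soon as padding every sample in it to the longest one
# would exceed max_frames_in_batch. With feature lengths 300, 500, 900, 200 and
# max_frames_in_batch=1500 this yields batches of sizes 2, 1, 1:
#
#     samples = [{'speech_feat': torch.zeros(n, 80)} for n in (300, 500, 900, 200)]
#     for b in dynamic_batch(samples, max_frames_in_batch=1500):
#         print([s['speech_feat'].size(0) for s in b])
#     # [300, 500]
#     # [900]
#     # [200]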


def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000, mode='train'):
    """ Wrapper for static/dynamic batch
    """
    if mode == 'inference':
        return static_batch(data, 1)
    else:
        if batch_type == 'static':
            return static_batch(data, batch_size)
        elif batch_type == 'dynamic':
            return dynamic_batch(data, max_frames_in_batch)
        else:
            logging.fatal('Unsupported batch type {}'.format(batch_type))


def padding(data, use_spk_embedding, mode='train'):
    """ Padding the data into training data

        Args:
            data: Iterable[List[{key, feat, label}]]

        Returns:
            Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
    """
    for sample in data:
        assert isinstance(sample, list)
        speech_feat_len = torch.tensor([x['speech_feat'].size(1) for x in sample],
                                       dtype=torch.int32)
        order = torch.argsort(speech_feat_len, descending=True)

        utts = [sample[i]['utt'] for i in order]
        speech_token = [torch.tensor(sample[i]['speech_token']) for i in order]
        speech_token_len = torch.tensor([i.size(0) for i in speech_token], dtype=torch.int32)
        speech_token = pad_sequence(speech_token,
                                    batch_first=True,
                                    padding_value=0)
        speech_feat = [sample[i]['speech_feat'] for i in order]
        speech_feat_len = torch.tensor([i.size(0) for i in speech_feat], dtype=torch.int32)
        speech_feat = pad_sequence(speech_feat,
                                   batch_first=True,
                                   padding_value=0)
        text = [sample[i]['text'] for i in order]
        text_token = [torch.tensor(sample[i]['text_token']) for i in order]
        text_token_len = torch.tensor([i.size(0) for i in text_token], dtype=torch.int32)
        text_token = pad_sequence(text_token, batch_first=True, padding_value=0)
        utt_embedding = torch.stack([sample[i]['utt_embedding'] for i in order], dim=0)
        spk_embedding = torch.stack([sample[i]['spk_embedding'] for i in order], dim=0)
        batch = {
            "utts": utts,
            "speech_token": speech_token,
            "speech_token_len": speech_token_len,
            "speech_feat": speech_feat,
            "speech_feat_len": speech_feat_len,
            "text": text,
            "text_token": text_token,
            "text_token_len": text_token_len,
            "utt_embedding": utt_embedding,
            "spk_embedding": spk_embedding,
        }
        if mode == 'inference':
            tts_text = [sample[i]['tts_text'] for i in order]
            tts_index = [sample[i]['tts_index'] for i in order]
            tts_text_token = [torch.tensor(sample[i]['tts_text_token']) for i in order]
            tts_text_token_len = torch.tensor([i.size(0) for i in tts_text_token], dtype=torch.int32)
            tts_text_token = pad_sequence(tts_text_token, batch_first=True, padding_value=-1)
            batch.update({'tts_text': tts_text,
                          'tts_index': tts_index,
                          'tts_text_token': tts_text_token,
                          'tts_text_token_len': tts_text_token_len})
        if use_spk_embedding is True:
            batch["embedding"] = batch["spk_embedding"]
        else:
            batch["embedding"] = batch["utt_embedding"]
        yield batch


================================================
FILE: cosyvoice/flow/decoder.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from einops import pack, rearrange, repeat
from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
from matcha.models.components.transformer import BasicTransformerBlock


class ConditionalDecoder(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        channels=(256, 256),
        dropout=0.05,
        attention_head_dim=64,
        n_blocks=1,
        num_mid_blocks=2,
        num_heads=4,
        act_fn="snake",
    ):
        """
        This decoder requires an input with the same shape as the target. So, if your text content
        is shorter or longer than the output, please resample it before feeding it to the decoder.
        """
        super().__init__()
        channels = tuple(channels)
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.time_embeddings = SinusoidalPosEmb(in_channels)
        time_embed_dim = channels[0] * 4
        self.time_mlp = TimestepEmbedding(
            in_channels=in_channels,
            time_embed_dim=time_embed_dim,
            act_fn="silu",
        )
        self.down_blocks = nn.ModuleList([])
        self.mid_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        output_channel = in_channels
        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
            input_channel = output_channel
            output_channel = channels[i]
            is_last = i == len(channels) - 1
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            downsample = (
                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

        for i in range(num_mid_blocks):
            input_channel = channels[-1]
            out_channels = channels[-1]
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)

            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )

            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

        channels = channels[::-1] + (channels[0],)
        for i in range(len(channels) - 1):
            input_channel = channels[i] * 2
            output_channel = channels[i + 1]
            is_last = i == len(channels) - 2
            resnet = ResnetBlock1D(
                dim=input_channel,
                dim_out=output_channel,
                time_emb_dim=time_embed_dim,
            )
            transformer_blocks = nn.ModuleList(
                [
                    BasicTransformerBlock(
                        dim=output_channel,
                        num_attention_heads=num_heads,
                        attention_head_dim=attention_head_dim,
                        dropout=dropout,
                        activation_fn=act_fn,
                    )
                    for _ in range(n_blocks)
                ]
            )
            upsample = (
                Upsample1D(output_channel, use_conv_transpose=True)
                if not is_last
                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
            )
            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
        self.final_block = Block1D(channels[-1], channels[-1])
        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
        self.initialize_weights()


    def initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.GroupNorm):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x, mask, mu, t, spks=None, cond=None):
        """Forward pass of the UNet1DConditional model.

        Args:
            x (torch.Tensor): shape (batch_size, in_channels, time)
            mask (torch.Tensor): shape (batch_size, 1, time)
            mu (torch.Tensor): conditioning features packed with x along channels, same time length
            t (torch.Tensor): shape (batch_size)
            spks (torch.Tensor, optional): shape (batch_size, condition_channels). Defaults to None.
            cond (torch.Tensor, optional): prompt mel-spectrogram conditioning. Defaults to None.

        Returns:
            torch.Tensor: shape (batch_size, out_channels, time)
        """

        t = self.time_embeddings(t)
        t = self.time_mlp(t)

        x = pack([x, mu], "b * t")[0]

        if spks is not None:
            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
            x = pack([x, spks], "b * t")[0]
        if cond is not None:
            x = pack([x, cond], "b * t")[0]

        hiddens = []
        masks = [mask]
        for resnet, transformer_blocks, downsample in self.down_blocks:
            mask_down = masks[-1]
            x = resnet(x, mask_down, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            hiddens.append(x)  # Save hidden states for skip connections
            x = downsample(x * mask_down)
            masks.append(mask_down[:, :, ::2])
        masks = masks[:-1]
        mask_mid = masks[-1]

        for resnet, transformer_blocks in self.mid_blocks:
            x = resnet(x, mask_mid, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()

        for resnet, transformer_blocks, upsample in self.up_blocks:
            mask_up = masks.pop()
            skip = hiddens.pop()
            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
            x = resnet(x, mask_up, t)
            x = rearrange(x, "b c t -> b t c").contiguous()
            attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
            for transformer_block in transformer_blocks:
                x = transformer_block(
                    hidden_states=x,
                    attention_mask=attn_mask,
                    timestep=t,
                )
            x = rearrange(x, "b t c -> b c t").contiguous()
            x = upsample(x * mask_up)
        x = self.final_block(x, mask_up)
        output = self.final_proj(x * mask_up)
        return output * mask
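
# A small sketch (not part of the repository) of the attention-mask construction
# used in the down/mid/up blocks above: the (B, 1, T) frame mask is multiplied
# with its own transpose to give a (B, T, T) matrix that is 1 only where both
# the query and the key positions are valid frames.
#
#     mask = torch.tensor([[[1., 1., 1., 0.]]])              # last frame is padding
#     attn_mask = torch.matmul(mask.transpose(1, 2), mask)   # shape (1, 4, 4)
#     print(attn_mask[0])
#     # tensor([[1., 1., 1., 0.],
#     #         [1., 1., 1., 0.],
#     #         [1., 1., 1., 0.],
#     #         [0., 0., 0., 0.]])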


================================================
FILE: cosyvoice/flow/flow.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
from typing import Dict, Optional
import torch
import torch.nn as nn
from torch.nn import functional as F
from omegaconf import DictConfig
from cosyvoice.utils.mask import make_pad_mask


class MaskedDiffWithXvec(torch.nn.Module):
    def __init__(self,
                 input_size: int = 512,
                 output_size: int = 80,
                 spk_embed_dim: int = 192,
                 output_type: str = "mel",
                 vocab_size: int = 4096,
                 input_frame_rate: int = 50,
                 only_mask_loss: bool = True,
                 encoder: torch.nn.Module = None,
                 length_regulator: torch.nn.Module = None,
                 decoder: torch.nn.Module = None,
                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80, 'spk_emb_dim': 80, 'n_spks': 1, 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine', 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}), 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64, 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 80, 'sampling_rate': 22050, 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 8000}):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.decoder_conf = decoder_conf
        self.mel_feat_conf = mel_feat_conf
        self.vocab_size = vocab_size
        self.output_type = output_type
        self.input_frame_rate = input_frame_rate
        logging.info(f"input frame rate={self.input_frame_rate}")
        self.input_embedding = nn.Embedding(vocab_size, input_size)
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)
        self.encoder = encoder
        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
        self.decoder = decoder
        self.length_regulator = length_regulator
        self.only_mask_loss = only_mask_loss

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        token = batch['speech_token'].to(device)
        token_len = batch['speech_token_len'].to(device)
        feat = batch['speech_feat'].to(device)
        feat_len = batch['speech_feat_len'].to(device)
        embedding = batch['embedding'].to(device)

        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(device)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions
        conds = torch.zeros(feat.shape, device=token.device)
        for i, j in enumerate(feat_len):
            if random.random() < 0.5:
                continue
            index = random.randint(0, int(0.3 * j))
            conds[i, :index] = feat[i, :index]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        feat = F.interpolate(feat.unsqueeze(dim=1), size=h.shape[1:], mode="nearest").squeeze(dim=1)
        loss, _ = self.decoder.compute_loss(
            feat.transpose(1, 2).contiguous(),
            mask.unsqueeze(1),
            h.transpose(1, 2).contiguous(),
            embedding,
            cond=conds
        )
        return {'loss': loss}

    @torch.inference_mode()
    def inference(self,
                  token,
                  token_len,
                  prompt_token,
                  prompt_token_len,
                  prompt_feat,
                  prompt_feat_len,
                  embedding):
        assert token.shape[0] == 1
        # xvec projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)

        # concat text and prompt_text
        token, token_len = torch.concat([prompt_token, token], dim=1), prompt_token_len + token_len
        mask = (~make_pad_mask(token_len)).float().unsqueeze(-1).to(embedding)
        token = self.input_embedding(torch.clamp(token, min=0)) * mask

        # text encode
        h, h_lengths = self.encoder(token, token_len)
        h = self.encoder_proj(h)
        feat_len = (token_len / 50 * 22050 / 256).int()
        h, h_lengths = self.length_regulator(h, feat_len)

        # get conditions
        conds = torch.zeros([1, feat_len.max().item(), self.output_size], device=token.device)
        if prompt_feat.shape[1] != 0:
            for i, j in enumerate(prompt_feat_len):
                conds[i, :j] = prompt_feat[i]
        conds = conds.transpose(1, 2)

        mask = (~make_pad_mask(feat_len)).to(h)
        feat = self.decoder(
            mu=h.transpose(1, 2).contiguous(),
            mask=mask.unsqueeze(1),
            spks=embedding,
            cond=conds,
            n_timesteps=10
        )
        if prompt_feat.shape[1] != 0:
            feat = feat[:, :, prompt_feat.shape[1]:]
        return feat
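
# A quick arithmetic sketch (not part of the repository) behind the
# feat_len = (token_len / 50 * 22050 / 256).int() line above: speech tokens
# arrive at 50 tokens per second, while mel frames at 22050 Hz with a hop of
# 256 come at about 86.13 frames per second, i.e. roughly 1.72 mel frames per token.
#
#     tokens_per_second = 50
#     frames_per_second = 22050 / 256                  # ≈ 86.13
#     print(frames_per_second / tokens_per_second)     # ≈ 1.7227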


================================================
FILE: cosyvoice/flow/flow_matching.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn.functional as F
from matcha.models.components.flow_matching import BASECFM

class ConditionalCFM(BASECFM):
    def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: torch.nn.Module = None):
        super().__init__(
            n_feats=in_channels,
            cfm_params=cfm_params,
            n_spks=n_spks,
            spk_emb_dim=spk_emb_dim,
        )
        self.t_scheduler = cfm_params.t_scheduler
        self.training_cfg_rate = cfm_params.training_cfg_rate
        self.inference_cfg_rate = cfm_params.inference_cfg_rate
        in_channels = in_channels + (spk_emb_dim if n_spks > 0 else 0)
        # Just change the architecture of the estimator here
        self.estimator = estimator

    @torch.inference_mode()
    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: prompt mel conditioning, passed through to the estimator

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_feats, mel_timesteps)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        if self.t_scheduler == 'cosine':
            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)

    def solve_euler(self, x, t_span, mu, mask, spks, cond):
        """
        Fixed-step Euler solver for the flow-matching ODE.
        Args:
            x (torch.Tensor): random noise
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)
            cond: prompt mel conditioning, passed through to the estimator
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]

        # Intermediate solutions are stored so they can be inspected (e.g. plotted
        # from a debugger); a return_all_steps flag may be added in the future.
        sol = []

        for step in range(1, len(t_span)):
            dphi_dt = self.estimator(x, mask, mu, t, spks, cond)
            # Classifier-Free Guidance inference introduced in VoiceBox
            if self.inference_cfg_rate > 0:
                cfg_dphi_dt = self.estimator(
                    x, mask,
                    torch.zeros_like(mu), t,
                    torch.zeros_like(spks) if spks is not None else None,
                    torch.zeros_like(cond)
                )
                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
                           self.inference_cfg_rate * cfg_dphi_dt)
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        return sol[-1]
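
# A one-line numeric sketch (not part of the repository) of the classifier-free
# guidance mixing above: with inference_cfg_rate w, the conditional and
# unconditional estimates are combined as (1 + w) * cond - w * uncond,
# extrapolating away from the unconditional prediction.
#
#     w = 0.7
#     cond, uncond = 1.0, 0.2
#     print((1 + w) * cond - w * uncond)   # 1.56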

    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
        """Computes diffusion loss

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
                shape: (batch_size, spk_emb_dim)

        Returns:
            loss: conditional flow matching loss
            y: conditional flow
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b, _, t = mu.shape

        # random timestep
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        if self.t_scheduler == 'cosine':
            t = 1 - torch.cos(t * 0.5 * torch.pi)
        # sample noise p(x_0)
        z = torch.randn_like(x1)

        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
        if self.training_cfg_rate > 0:
            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
            mu = mu * cfg_mask.view(-1, 1, 1)
            spks = spks * cfg_mask.view(-1, 1)
            cond = cond * cfg_mask.view(-1, 1, 1)

        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
        return loss, y
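
# A standalone check (not part of the repository) of the flow-matching target
# used above: y interpolates from noise z at t=0 to the data x1 at t=1 (up to
# sigma_min), and the regression target u = x1 - (1 - sigma_min) * z is exactly
# the constant time-derivative of y.
#
#     sigma_min = 1e-6
#     x1, z = torch.randn(2, 80, 10), torch.randn(2, 80, 10)
#     t0, t1 = 0.3, 0.7
#     y0 = (1 - (1 - sigma_min) * t0) * z + t0 * x1
#     y1 = (1 - (1 - sigma_min) * t1) * z + t1 * x1
#     u = x1 - (1 - sigma_min) * z
#     assert torch.allclose((y1 - y0) / (t1 - t0), u, atol=1e-5)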


================================================
FILE: cosyvoice/flow/length_regulator.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import torch.nn as nn
from torch.nn import functional as F
from cosyvoice.utils.mask import make_pad_mask


class InterpolateRegulator(nn.Module):
    def __init__(
            self,
            channels: int,
            sampling_ratios: Tuple,
            out_channels: int = None,
            groups: int = 1,
    ):
        super().__init__()
        self.sampling_ratios = sampling_ratios
        out_channels = out_channels or channels
        model = nn.ModuleList([])
        if len(sampling_ratios) > 0:
            for _ in sampling_ratios:
                module = nn.Conv1d(channels, channels, 3, 1, 1)
                norm = nn.GroupNorm(groups, channels)
                act = nn.Mish()
                model.extend([module, norm, act])
        model.append(
            nn.Conv1d(channels, out_channels, 1, 1)
        )
        self.model = nn.Sequential(*model)

    def forward(self, x, ylens=None):
        # x in (B, T, D)
        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='nearest')
        out = self.model(x).transpose(1, 2).contiguous()
        olens = ylens
        return out * mask, olens
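
# A shape sketch (not part of the repository) for the regulator above: token-level
# features (B, T_token, D) are nearest-neighbour interpolated along time to the
# requested mel lengths, so the output matches the mel frame count.
#
#     import torch
#     reg = InterpolateRegulator(channels=8, sampling_ratios=[1])
#     x = torch.randn(2, 20, 8)              # (B, T_token, D)
#     ylens = torch.tensor([50, 40])         # target mel lengths per utterance
#     out, olens = reg(x, ylens)
#     print(out.shape)                       # torch.Size([2, 50, 8])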


================================================
FILE: cosyvoice/hifigan/f0_predictor.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


class ConvRNNF0Predictor(nn.Module):
    def __init__(self,
                 num_class: int = 1,
                 in_channels: int = 80,
                 cond_channels: int = 512
                 ):
        super().__init__()

        self.num_class = num_class
        self.condnet = nn.Sequential(
            weight_norm(
                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
            weight_norm(
                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
            ),
            nn.ELU(),
        )
        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.condnet(x)
        x = x.transpose(1, 2)
        return torch.abs(self.classifier(x).squeeze(-1))


================================================
FILE: cosyvoice/hifigan/generator.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""HIFI-GAN"""

import typing as tp
import numpy as np
from scipy.signal import get_window
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Conv1d
from torch.nn import ConvTranspose1d
from torch.nn.utils import remove_weight_norm
from torch.nn.utils import weight_norm
from torch.distributions.uniform import Uniform

from cosyvoice.transformer.activation import Snake
from cosyvoice.utils.common import get_padding
from cosyvoice.utils.common import init_weights


"""hifigan based generator implementation.

This code is modified from https://github.com/jik876/hifi-gan
 ,https://github.com/kan-bayashi/ParallelWaveGAN and
 https://github.com/NVIDIA/BigVGAN

"""
class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN."""
    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: tp.List[int] = [1, 3, 5],
    ):
        super(ResBlock, self).__init__()
        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()

        for dilation in dilations:
            self.convs1.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation,
                        padding=get_padding(kernel_size, dilation)
                    )
                )
            )
            self.convs2.append(
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1)
                    )
                )
            )
        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)
        self.activations1 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs1))
        ])
        self.activations2 = nn.ModuleList([
            Snake(channels, alpha_logscale=False)
            for _ in range(len(self.convs2))
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for idx in range(len(self.convs1)):
            xt = self.activations1[idx](x)
            xt = self.convs1[idx](xt)
            xt = self.activations2[idx](xt)
            xt = self.convs2[idx](xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for idx in range(len(self.convs1)):
            remove_weight_norm(self.convs1[idx])
            remove_weight_norm(self.convs2[idx])

class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-wavefrom (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_thoreshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: [B, 1, sample_len]
        """

        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for i in range(self.harmonic_num + 1):
            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate

        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
        phase_vec[:, 0, :] = 0

        # generate sine waveforms
        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced frames the std should be similar to sine_amp
        #        (std = self.sine_amp / 3 -> max value ~ self.sine_amp);
        #        for voiced frames the std is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
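
# A minimal usage sketch (not part of the repository): feeding a constant 220 Hz
# F0 track into SineGen produces the fundamental plus harmonic_num overtones,
# one channel per harmonic, with noise added and unvoiced regions zeroed.
#
#     gen = SineGen(samp_rate=22050, harmonic_num=2)
#     f0 = torch.full((1, 1, 2205), 220.0)       # 100 ms of constant F0
#     sine, uv, noise = gen(f0)
#     print(sine.shape, uv.min())                # torch.Size([1, 3, 2205]) tensor(1.)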


class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length, 1)
        """
        # source for harmonic branch
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
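
# --- Editor's usage sketch (not part of the original file): SourceModuleHnNSF expects F0
# --- sampled at the audio rate with shape (batch, length, 1) and merges all harmonics
# --- into a single excitation channel; `upsample_scale` is accepted but unused here.
def _source_module_usage_sketch():
    import torch
    source = SourceModuleHnNSF(sampling_rate=22050, upsample_scale=256, harmonic_num=8)
    f0 = torch.full((1, 22050, 1), 220.0)
    sine_merge, noise, uv = source(f0)
    # sine_merge, noise and uv all come back as (1, 22050, 1)
    return sine_merge.shape, noise.shape, uv.shape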


class HiFTGenerator(nn.Module):
    """
    HiFTNet Generator: Neural Source Filter + ISTFTNet
    https://arxiv.org/abs/2309.09493
    """
    def __init__(
            self,
            in_channels: int = 80,
            base_channels: int = 512,
            nb_harmonics: int = 8,
            sampling_rate: int = 22050,
            nsf_alpha: float = 0.1,
            nsf_sigma: float = 0.003,
            nsf_voiced_threshold: float = 10,
            upsample_rates: tp.List[int] = [8, 8],
            upsample_kernel_sizes: tp.List[int] = [16, 16],
            istft_params: tp.Dict[str, int] = {"n_fft": 16, "hop_len": 4},
            resblock_kernel_sizes: tp.List[int] = [3, 7, 11],
            resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            source_resblock_kernel_sizes: tp.List[int] = [7, 11],
            source_resblock_dilation_sizes: tp.List[tp.List[int]] = [[1, 3, 5], [1, 3, 5]],
            lrelu_slope: float = 0.1,
            audio_limit: float = 0.99,
            f0_predictor: torch.nn.Module = None,
    ):
        super(HiFTGenerator, self).__init__()

        self.out_channels = 1
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.istft_params = istft_params
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.m_source = SourceModuleHnNSF(
            sampling_rate=sampling_rate,
            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
            harmonic_num=nb_harmonics,
            sine_amp=nsf_alpha,
            add_noise_std=nsf_sigma,
            voiced_threshod=nsf_voiced_threshold)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])

        self.conv_pre = weight_norm(
            Conv1d(in_channels, base_channels, 7, 1, padding=3)
        )

        # Up
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i),
                        base_channels // (2**(i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # Down
        self.source_downs = nn.ModuleList()
        self.source_resblocks = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes,
                                          source_resblock_dilation_sizes)):
            if u == 1:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
                )
            else:
                self.source_downs.append(
                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
                )

            self.source_resblocks.append(
                ResBlock(base_channels // (2 ** (i + 1)), k, d)
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2**(i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
        self.f0_predictor = f0_predictor

    def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t

        har_source, _, _ = self.m_source(f0)
        return har_source.transpose(1, 2)

    def _stft(self, x):
        spec = torch.stft(
            x,
            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
            return_complex=True)
        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
        return spec[..., 0], spec[..., 1]

    def _istft(self, magnitude, phase):
        magnitude = torch.clip(magnitude, max=1e2)
        real = magnitude * torch.cos(phase)
        img = magnitude * torch.sin(phase)
        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
        return inverse_transform

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        f0 = self.f0_predictor(x)
        s = self._f02source(f0)

        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if i == self.num_upsamples - 1:
                x = self.reflection_pad(x)

            # fusion
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundant here

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        # NOTE: self.m_source contains no weight-normalized layers, so there is nothing to remove for it
        for l in self.source_downs:
            remove_weight_norm(l)
        for l in self.source_resblocks:
            l.remove_weight_norm()

    @torch.inference_mode()
    def inference(self, mel: torch.Tensor) -> torch.Tensor:
        return self.forward(x=mel)
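
# --- Editor's usage sketch (not part of the original file): HiFTGenerator needs an
# --- `f0_predictor` mapping a mel spectrogram (B, 80, T) to an F0 contour (B, T); the
# --- constant-pitch stub below is only a stand-in for the repository's ConvRNNF0Predictor
# --- and serves to show the mel -> waveform call path and output length (T * 256 samples).
def _hift_generator_usage_sketch():
    import torch

    class _ConstantF0(torch.nn.Module):
        def forward(self, mel: torch.Tensor) -> torch.Tensor:
            return torch.full((mel.size(0), mel.size(2)), 200.0, device=mel.device)

    generator = HiFTGenerator(f0_predictor=_ConstantF0())
    mel = torch.randn(1, 80, 50)        # (batch, n_mels, frames)
    audio = generator.inference(mel)    # (1, 50 * 256) waveform samples
    return audio.shape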


================================================
FILE: cosyvoice/llm/llm.py
================================================
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Optional, Union
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, unpad_sequence
from cosyvoice.utils.common import IGNORE_ID
from cosyvoice.transformer.label_smoothing_loss import LabelSmoothingLoss
from cosyvoice.utils.common import th_accuracy


class TransformerLM(torch.nn.Module):
    def __init__(
            self,
            text_encoder_input_size: int,
            llm_input_size: int,
            llm_output_size: int,
            text_token_size: int,
            speech_token_size: int,
            text_encoder: torch.nn.Module,
            llm: torch.nn.Module,
            length_normalized_loss: bool = True,
            lsm_weight: float = 0.0,
            spk_embed_dim: int = 192,
    ):
        super().__init__()
        self.llm_input_size = llm_input_size
        self.speech_token_size = speech_token_size
        # 1. build text token inputs related modules
        self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
        self.text_encoder = text_encoder
        self.text_encoder_affine_layer = nn.Linear(
            self.text_encoder.output_size(),
            llm_input_size
        )

        # 2. build speech token language model related modules
        self.sos_eos = 0
        self.task_id = 1
        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
        self.llm = llm
        self.llm_decoder = nn.Linear(llm_output_size, speech_token_size + 1)
        self.criterion_ce = LabelSmoothingLoss(
            size=speech_token_size + 1,
            padding_idx=IGNORE_ID,
            smoothing=lsm_weight,
            normalize_length=length_normalized_loss,
        )

        # 3. [Optional] build speech token related modules
        self.speech_embedding = torch.nn.Embedding(speech_token_size, llm_input_size)
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, llm_input_size)

    def encode(
            self,
            text: torch.Tensor,
            text_lengths: torch.Tensor,
    ):
        encoder_out, encoder_mask = self.text_encoder(text, text_lengths, decoding_chunk_size=1, num_decoding_left_chunks=-1)
        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        encoder_out = self.text_encoder_affine_layer(encoder_out)
        return encoder_out, encoder_out_lens

    def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len):
        text_token = unpad_sequence(text_token, text_token_len.cpu(), batch_first=True)
        speech_token = unpad_sequence(speech_token, speech_token_len.cpu(), batch_first=True)
        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0), embedding[i], text_token[i], task_id_emb.squeeze(dim=0), speech_token[i]], dim=0) for i in range(len(text_token))]
        lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
        lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
        return lm_input, lm_input_len

    def forward(
            self,
            batch: dict,
            device: torch.device,
    ) -> Dict[str, Optional[torch.Tensor]]:
        """
        Args:
            text: (B, L, D)
            text_lengths: (B,)
            audio: (B, T, N) or (B, T)
            audio_lengths: (B,)
        """
        text_token = batch['text_token'].to(device)
        text_token_len = batch['text_token_len'].to(device)
        speech_token = batch['speech_token'].to(device)
        speech_token_len = batch['speech_token_len'].to(device)
        embedding = batch['embedding'].to(device)

        # 1. prepare lm_target
        lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)

        # 2. encode text_token
        text_token = self.text_embedding(text_token)
        text_token, text_token_len = self.encode(text_token, text_token_len)

        # 3. embedding projection
        embedding = F.normalize(embedding, dim=1)
        embedding = self.spk_embed_affine_layer(embedding)
        embedding = embedding.unsqueeze(1)

        # 4. eos and task_id
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)

        # 5. encode speech_token
        speech_token = self.speech_embedding(speech_token)

        # 6. unpad and pad
        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb, embedding, text_token, text_token_len, task_id_emb, speech_token, speech_token_len)

        # 7. run lm forward
        lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
        logits = self.llm_decoder(lm_output)
        loss = self.criterion_ce(logits, lm_target)
        acc = th_accuracy(logits.view(-1, self.speech_token_size + 1), lm_target, ignore_label=IGNORE_ID)
        return {'loss': loss, 'acc': acc}

    def sampling_ids(
            self,
            weighted_scores: torch.Tensor,
            sampling: Union[bool, int, float] = True,
            beam_size: int = 1,
            ignore_eos: bool = True,
    ):
        while True:
            prob, indices = weighted_scores.softmax(dim=-1).topk(sampling)
            top_ids = prob.multinomial(beam_size, replacement=True)
            top_ids = indices[top_ids]
            if (not ignore_eos) or (self.speech_token_size not in top_ids):
                break
        return top_ids

    @torch.inference_mode()
    def inference(
            self,
            text: torch.Tensor,
            text_len: torch.Tensor,
            prompt_text: torch.Tensor,
            prompt_text_len: torch.Tensor,
            prompt_speech_token: torch.Tensor,
            prompt_speech_token_len: torch.Tensor,
            embedding: torch.Tensor,
            beam_size: int = 1,
            sampling: int = 25,
            max_token_text_ratio: float = 20,
            min_token_text_ratio: float = 2,
    ) -> torch.Tensor:
        device = text.device
        text = torch.concat([prompt_text, text], dim=1)
        text_len += prompt_text_len
        text = self.text_embedding(text)

        # 1. encode text
        text, text_len = self.encode(text, text_len)

        # 2. encode embedding
        if embedding.shape[0] != 0:
            embedding = F.normalize(embedding, dim=1)
            embedding = self.spk_embed_affine_layer(embedding)
            embedding = embedding.unsqueeze(dim=1)
        else:
            embedding = torch.zeros(1, 0, self.llm_input_size).to(device)

        # 3. concat llm_input
        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
        if prompt_speech_token_len != 0:
            prompt_speech_token_emb = self.speech_embedding(prompt_speech_token)
        else:
            prompt_speech_token_emb = torch.zeros(1, 0, self.llm_input_size).to(device)
        lm_input = torch.concat([sos_eos_emb, embedding, text, task_id_emb, prompt_speech_token_emb], dim=1)

        # 4. cal min/max_length
        min_len = int((text_len - prompt_text_len) * min_token_text_ratio)
        max_len = int((text_len - prompt_text_len) * max_token_text_ratio)

        # 5. step by step decode
        out_tokens = []
        offset = 0
        att_cache, cnn_cache = torch.zeros((0, 0, 0, 0), device=lm_input.device), torch.zeros((0, 0, 0, 0), device=lm_input.device)
        for i in range(max_len):
            y_pred, att_cache, cnn_cache = self.llm.forward_chunk(lm_input, offset=0, required_cache_size=-1, att_cache=att_cache, cnn_cache=cnn_cache,
                                                                  att_mask=torch.tril(torch.ones((1, lm_input.shape[1], lm_input.shape[1]), device=lm_input.device)).to(torch.bool))
            logp = self.llm_decoder(y_pred[:, -1]).log_softmax(dim=-1)
            top_ids = self.sampling_ids(logp.squeeze(dim=0), sampling, beam_size, ignore_eos=True if i < min_len else False).item()
            if top_ids == self.speech_token_size:
                break
            out_tokens.append(top_ids)
            offset += lm_input.size(1)
            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)

        return torch.tensor([out_tokens], dtype=torch.int64, device=device)
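
# --- Editor's sketch (not part of the original file): shape-only illustration of how
# --- TransformerLM.inference assembles its prompt -- [sos/eos, speaker embedding, encoded
# --- text, task_id, prompt speech tokens] are concatenated along the time axis before
# --- autoregressive decoding starts; the sizes below are placeholders.
def _lm_prompt_layout_sketch(llm_input_size=1024, text_len=20, prompt_speech_len=60):
    import torch
    sos_eos_emb = torch.zeros(1, 1, llm_input_size)
    spk_emb = torch.zeros(1, 1, llm_input_size)
    text_emb = torch.zeros(1, text_len, llm_input_size)
    task_id_emb = torch.zeros(1, 1, llm_input_size)
    prompt_speech_emb = torch.zeros(1, prompt_speech_len, llm_input_size)
    lm_input = torch.concat(
        [sos_eos_emb, spk_emb, text_emb, task_id_emb, prompt_speech_emb], dim=1)
    return lm_input.shape   # torch.Size([1, 1 + 1 + 20 + 1 + 60, 1024])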


================================================
FILE: cosyvoice/transformer/__init__.py
================================================


================================================
FILE: cosyvoice/transformer/activation.py
================================================
# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
#               2020 Northwestern Polytechnical University (Pengcheng Guo)
#               2020 Mobvoi Inc (Binbin Zhang)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Swish() activation function for Conformer."""

import torch
from torch import nn, sin, pow
from torch.nn import Parameter


class Swish(torch.nn.Module):
    """Construct an Swish object."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Swish activation function."""
        return x * torch.sigmoid(x)


# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
#   LICENSE is in incl_licenses directory.
class Snake(nn.Module):
    '''
    Implementation of a sine-based periodic activation function
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: number of input channels
            - alpha: trainable parameter
            alpha is initialized to 1 by default, higher values = higher-frequency.
            alpha will be trained along with the rest of your model.
        '''
        super(Snake, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2(x * a)
        '''
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)

        return x
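
# --- Editor's usage sketch (not part of the original file): Snake expects channel-major
# --- input (B, C, T); alpha is a per-channel parameter broadcast over batch and time.
def _snake_usage_sketch():
    import torch
    act = Snake(in_features=256)
    x = torch.randn(4, 256, 100)    # (batch, channels, time)
    y = act(x)
    return y.shape                  # torch.Size([4, 256, 100])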


================================================
FILE: cosyvoice/transformer/attention.py
================================================
# Copyright (c) 2019 Shigeki Karita
#               2020 Mobvoi Inc (Binbin Zhang)
#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""

import math
from typing import Tuple

import torch
from torch import nn


class MultiHeadedAttention(nn.Module):
    """Multi-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an MultiHeadedAttention object."""
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(
        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).

        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
        v = v.transpose(1, 2)  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
    ) -> torch.Tensor:
        """Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        """
        n_batch = value.size(0)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
        #           1st chunk to ease the onnx export.]
        #   2. pytorch training
        if mask.size(2) > 0:  # time2 > 0
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # For last chunk, time2 might be larger than scores.size(-1)
            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0)  # (batch, head, time1, time2)
        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
        #   1. onnx(16/-1, -1/-1, 16/0)
        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
                                                 self.h * self.d_k)
             )  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
                1.When applying cross attention between decoder and encoder,
                the batch padding mask for input is in (#batch, 1, T) shape.
                2.When applying self attention of encoder,
                the mask is in (#batch, T, T)  shape.
                3.When applying self attention of decoder,
                the mask is in (#batch, L, L)  shape.
                4.If different positions in the decoder see different blocks
                of the encoder, such as Mocha, the passed-in mask could be
                in (#batch, L, T) shape. But there is no such case in current
                CosyVoice.
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`


        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`

        """
        q, k, v = self.forward_qkv(query, key, value)

        # NOTE(xcsong):
        #   when export onnx model, for 1st chunk, we feed
        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #       In all modes, `if cache.size(0) > 0` will always be `True`
        #       and we will always do splitting and
        #       concatenation (this will simplify onnx export). Note that
        #       it's OK to concat & split zero-shaped tensors(see code below).
        #   when export jit  model, for 1st chunk, we always feed
        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask), new_cache
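
# --- Editor's usage sketch (not part of the original file): streaming self-attention over
# --- two chunks; the cache returned for the first chunk (keys and values concatenated on
# --- the last dim) is fed back so the second chunk attends over both chunks.
def _mha_cache_usage_sketch():
    import torch
    mha = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
    chunk1 = torch.randn(1, 10, 256)
    out1, cache = mha(chunk1, chunk1, chunk1)            # cache: (1, 4, 10, 2 * 64)
    chunk2 = torch.randn(1, 10, 256)
    out2, cache = mha(chunk2, chunk2, chunk2, cache=cache)
    return out2.shape, cache.shape                       # (1, 10, 256), (1, 4, 20, 128)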


class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding.
    Paper: https://arxiv.org/abs/1901.02860
    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head: int,
                 n_feat: int,
                 dropout_rate: float,
                 key_bias: bool = True):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate, key_bias)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def rel_shift(self, x):
        """Compute relative positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
            time1 means the length of query vector.

        Returns:
            torch.Tensor: Output tensor.

        """
        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)[
            :, :, :, : x.size(-1) // 2 + 1
        ]  # only keep the positions from 0 to time2
        return x

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        pos_emb: torch.Tensor = torch.empty(0),
        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2), (0, 0, 0) means fake mask.
            pos_emb (torch.Tensor): Positional embedding tensor
                (#batch, time2, size).
            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).
            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose(1, 2)  # (batch, time1, head, d_k)

        # NOTE(xcsong):
        #   when export onnx model, for 1st chunk, we feed
        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
        #       In all modes, `if cache.size(0) > 0` will always be `True`
        #       and we will always do splitting and
        #       concatenation (this will simplify onnx export). Note that
        #       it's OK to concat & split zero-shaped tensors(see code below).
        #   when export jit  model, for 1st chunk, we always feed
        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
        # >>> a = torch.ones((1, 2, 0, 4))
        # >>> b = torch.ones((1, 2, 3, 4))
        # >>> c = torch.cat((a, b), dim=2)
        # >>> torch.equal(b, c)        # True
        # >>> d = torch.split(a, 2, dim=-1)
        # >>> torch.equal(d[0], d[1])  # True
        if cache.size(0) > 0:
            key_cache, value_cache = torch.split(cache,
                                                 cache.size(-1) // 2,
                                                 dim=-1)
            k = torch.cat([key_cache, k], dim=2)
            v = torch.cat([value_cache, v], dim=2)
        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
        #   non-trivial to calculate `next_cache_start` here.
        new_cache = torch.cat((k, v), dim=-1)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose(1, 2)  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
        if matrix_ac.shape != matrix_bd.shape:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
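
# --- Editor's usage sketch (not part of the original file): same call pattern as the base
# --- class but with a relative positional embedding; when pos_emb has the same length as
# --- the query (as here), matrix_ac and matrix_bd already match and rel_shift is skipped.
def _rel_pos_mha_usage_sketch():
    import torch
    mha = RelPositionMultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
    x = torch.randn(1, 10, 256)
    pos_emb = torch.randn(1, 10, 256)
    out, cache = mha(x, x, x, pos_emb=pos_emb)
    return out.shape, cache.shape   # (1, 10, 256), (1, 4, 10, 128)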


================================================
FILE: cosyvoice/transformer/convolution.py
================================================
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""

from typing import Tuple

import torch
from torch import nn


class ConvolutionModule(nn.Module):
    """ConvolutionModule in Conformer model."""

    def __init__(self,
                 channels: int,
                 kernel_size: int = 15,
                 activation: nn.Module = nn.ReLU(),
                 norm: str = "batch_norm",
                 causal: bool = False,
                 bias: bool = True):
        """Construct an ConvolutionModule object.
        Args:
            channels (int): The number of channels of conv layers.
            kernel_size (int): Kernel size of conv layers.
            causal (int): Whether use causal convolution or not
        """
        super().__init__()

        self.pointwise_conv1 = nn.Conv1d(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # self.lorder is used to distinguish if it's a causal convolution,
        # if self.lorder > 0: it's a causal convolution, the input will be
        #    padded with self.lorder frames on the left in forward.
        # else: it's a symmetrical convolution
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for non-causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias=bias,
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1d(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation

    def forward(
        self,
        x: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        cache: torch.Tensor = torch.zeros((0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute convolution module.
        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).
            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
                (0, 0, 0) means fake mask.
            cache (torch.Tensor): left context cache, it is only
                used in causal convolution (#batch, channels, cache_t),
                (0, 0, 0) means fake cache.
        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose(1, 2)  # (#batch, channels, time)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        if self.lorder > 0:
            if cache.size(2) == 0:  # cache_t == 0
                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
            else:
                assert cache.size(0) == x.size(0)  # equal batch
                assert cache.size(1) == x.size(1)  # equal channel
                x = torch.cat((cache, x), dim=2)
            assert (x.size(2) > self.lorder)
            new_cache = x[:, :, -self.lorder:]
        else:
            # It would be better to just return None if no cache is required;
            # however, for JIT export, we fake a tensor here instead of
            # None.
            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, time)
        x = nn.functional.glu(x, dim=1)  # (batch, channel, time)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.pointwise_conv2(x)
        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        return x.transpose(1, 2), new_cache
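
# --- Editor's usage sketch (not part of the original file): streaming use of the causal
# --- convolution; the (batch, channels, kernel_size - 1) cache carries the left context
# --- from one chunk to the next. BatchNorm is put in eval mode for this toy example.
def _conv_module_streaming_sketch():
    import torch
    conv = ConvolutionModule(channels=256, kernel_size=15, causal=True).eval()
    with torch.no_grad():
        chunk1 = torch.randn(1, 10, 256)                  # (batch, time, channels)
        out1, cache = conv(chunk1)                        # cache: (1, 256, 14)
        chunk2 = torch.randn(1, 10, 256)
        out2, cache = conv(chunk2, cache=cache)
    return out1.shape, out2.shape, cache.shape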


================================================
FILE: cosyvoice/transformer/decoder.py
================================================
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Decoder definition."""
from typing import Tuple, List, Optional

import torch
import torch.utils.checkpoint as ckpt
import logging

from cosyvoice.transformer.decoder_layer import DecoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import (subsequent_mask, make_pad_mask)


class TransformerDecoder(torch.nn.Module):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        src_attention: if false, encoder-decoder cross attention is not
                       applied, such as CIF model
        key_bias: whether use bias in attention.linear_k, False for whisper models.
        gradient_checkpointing: rerunning a forward-pass segment for each
            checkpointed segment during backward.
        tie_word_embedding: Tie or clone module weights depending on whether we are
            using TorchScript or not
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        src_attention: bool = True,
        key_bias: bool = True,
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):
        super().__init__()
        attention_dim = encoder_output_size
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()

        self.embed = torch.nn.Sequential(
            torch.nn.Identity() if input_layer == "no_pos" else
            torch.nn.Embedding(vocab_size, attention_dim),
            COSYVOICE_EMB_CLASSES[input_layer](attention_dim,
                                               positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
        self.use_output_layer = use_output_layer
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = torch.nn.Identity()
        self.num_blocks = num_blocks
        self.decoders = torch.nn.ModuleList([
            DecoderLayer(
                attention_dim,
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim,
                    self_attention_dropout_rate, key_bias),
                COSYVOICE_ATTENTION_CLASSES["selfattn"](
                    attention_heads, attention_dim, src_attention_dropout_rate,
                    key_bias) if src_attention else None,
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate, activation),
                dropout_rate,
                normalize_before,
            ) for _ in range(self.num_blocks)
        ])

        self.gradient_checkpointing = gradient_checkpointing
        self.tie_word_embedding = tie_word_embedding

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor = torch.empty(0),
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: not used in transformer decoder, in order to unify api
                with bidirectional decoder
            reverse_weight: not used in transformer decoder, in order to unify
                api with bidirectional decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                torch.tensor(0.0), in order to unify api with bidirectional decoder
                olens: (batch, )
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        tgt = ys_in_pad
        maxlen = tgt.size(1)
        # tgt_mask: (B, 1, L)
        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
        tgt_mask = tgt_mask.to(tgt.device)
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1),
                            device=tgt_mask.device).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m
        x, _ = self.embed(tgt)
        if self.gradient_checkpointing and self.training:
            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
                                                 memory_mask)
        else:
            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)
        olens = tgt_mask.sum(1)
        return x, torch.tensor(0.0), olens

    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
                       memory: torch.Tensor,
                       memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        return x

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, x: torch.Tensor,
                                    tgt_mask: torch.Tensor,
                                    memory: torch.Tensor,
                                    memory_mask: torch.Tensor) -> torch.Tensor:
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
                layer.__call__, x, tgt_mask, memory, memory_mask)
        return x

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
            This is only used for decoding.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 in PyTorch 1.2-
                      dtype=torch.bool in PyTorch 1.2+ (including 1.2)
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            `y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(x,
                                                       tgt_mask,
                                                       memory,
                                                       memory_mask,
                                                       cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            y = torch.log_softmax(self.output_layer(y), dim=-1)
        return y, new_cache

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
            depending on whether we are using TorchScript or not"""
        if not self.use_output_layer:
            return
        if jit_mode:
            logging.info("clone emb.weight to output.weight")
            self.output_layer.weight = torch.nn.Parameter(
                self.embed[0].weight.clone())
        else:
            logging.info("tie emb.weight with output.weight")
            self.output_layer.weight = self.embed[0].weight

        if getattr(self.output_layer, "bias", None) is not None:
            self.output_layer.bias.data = torch.nn.functional.pad(
                self.output_layer.bias.data,
                (
                    0,
                    self.output_layer.weight.shape[0] -
                    self.output_layer.bias.shape[0],
                ),
                "constant",
                0,
            )
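
# --- Editor's sketch (not part of the original file): standalone illustration of the target
# --- mask built in TransformerDecoder.forward -- a padding mask combined with a causal
# --- (subsequent) mask so each position attends only to earlier, non-padded positions.
def _decoder_mask_sketch():
    import torch
    ys_in_lens = torch.tensor([5, 3])
    maxlen = 5
    tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)          # (B, 1, L)
    m = subsequent_mask(maxlen, device=ys_in_lens.device).unsqueeze(0)  # (1, L, L)
    return (tgt_mask & m).shape                                         # torch.Size([2, 5, 5])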


class BiTransformerDecoder(torch.nn.Module):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        r_num_blocks: the number of right to left decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        key_bias: whether use bias in attention.linear_k, False for whisper models.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        r_num_blocks: int = 0,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        normalize_before: bool = True,
        key_bias: bool = True,
        gradient_checkpointing: bool = False,
        tie_word_embedding: bool = False,
    ):

        super().__init__()
        self.tie_word_embedding = tie_word_embedding
        self.left_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

        self.right_decoder = TransformerDecoder(
            vocab_size,
            encoder_output_size,
            attention_heads,
            linear_units,
            r_num_blocks,
            dropout_rate,
            positional_dropout_rate,
            self_attention_dropout_rate,
            src_attention_dropout_rate,
            input_layer,
            use_output_layer,
            normalize_before,
            key_bias=key_bias,
            gradient_checkpointing=gradient_checkpointing,
            tie_word_embedding=tie_word_embedding)

    def forward(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        r_ys_in_pad: torch.Tensor,
        reverse_weight: float = 0.0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
                used for right to left decoder
            reverse_weight: used for right to left decoder
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out,
                    vocab_size) if use_output_layer is True,
                r_x: decoded token score (right to left decoder)
                    before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
                                          ys_in_lens)
        r_x = torch.tensor(0.0)
        if reverse_weight > 0.0:
            r_x, _, olens = self.right_decoder(memory, memory_mask,
                                               r_ys_in_pad, ys_in_lens)
        return l_x, r_x, olens

    def forward_one_step(
        self,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        cache: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward one step.
            This is only used for decoding.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out)
                      dtype=torch.uint8 before PyTorch 1.2,
                      dtype=torch.bool in PyTorch 1.2 and later
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            `y.shape` is (batch, maxlen_out, token)
        """
        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
                                                  tgt_mask, cache)

    def tie_or_clone_weights(self, jit_mode: bool = True):
        """Tie or clone module weights (between word_emb and output_layer)
            depending on whether we are using TorchScript or not"""
        self.left_decoder.tie_or_clone_weights(jit_mode)
        self.right_decoder.tie_or_clone_weights(jit_mode)
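

# ---------------------------------------------------------------------------
# NOTE: hypothetical usage sketch, not part of the upstream file. It only
# illustrates how `reverse_weight` gates the right-to-left decoder defined
# above; constructor defaults are assumed to match upstream WeNet/CosyVoice,
# and all tensors below are dummy data.
# ---------------------------------------------------------------------------
def _demo_bi_transformer_decoder():
    decoder = BiTransformerDecoder(vocab_size=100, encoder_output_size=64,
                                   attention_heads=4, linear_units=128,
                                   num_blocks=2, r_num_blocks=1)
    memory = torch.randn(2, 10, 64)                        # (batch, maxlen_in, feat)
    memory_mask = torch.ones(2, 1, 10, dtype=torch.bool)   # (batch, 1, maxlen_in)
    ys_in_pad = torch.randint(0, 100, (2, 5))              # left-to-right targets
    r_ys_in_pad = torch.flip(ys_in_pad, dims=[1])          # right-to-left targets
    ys_in_lens = torch.tensor([5, 5])
    # With reverse_weight == 0.0 the right decoder is skipped and r_x == 0.0.
    l_x, r_x, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens,
                              r_ys_in_pad, reverse_weight=0.3)
    return l_x.shape, r_x.shape, olens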


================================================
FILE: cosyvoice/transformer/decoder_layer.py
================================================
# Copyright (c) 2019 Shigeki Karita
#               2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple

import torch
from torch import nn


class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, inter-attention is not used, as in
            CIF, GPT, and other decoder-only models.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        x = residual + self.dropout(
            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0])
            if not self.normalize_before:
                x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
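

# ---------------------------------------------------------------------------
# NOTE: hypothetical sketch, not part of the upstream file. It illustrates the
# cache contract documented in DecoderLayer.forward(): when `cache` holds this
# layer's output for the first t-1 positions, only the last query frame is
# recomputed and the result is concatenated back onto the cache. Import paths
# and attention/FFN constructor arguments follow the sibling modules.
# ---------------------------------------------------------------------------
def _demo_decoder_layer_cache():
    from cosyvoice.transformer.attention import MultiHeadedAttention
    from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward

    size, heads = 64, 4
    layer = DecoderLayer(size,
                         MultiHeadedAttention(heads, size, 0.0, True),
                         MultiHeadedAttention(heads, size, 0.0, True),
                         PositionwiseFeedForward(size, 256, 0.0, torch.nn.ReLU()),
                         dropout_rate=0.0)
    layer.eval()

    tgt = torch.randn(1, 4, size)                          # 4 decoded positions so far
    tgt_mask = torch.tril(torch.ones(1, 4, 4, dtype=torch.bool))
    memory = torch.randn(1, 10, size)
    memory_mask = torch.ones(1, 1, 10, dtype=torch.bool)

    # Full pass over all 4 positions.
    full, *_ = layer(tgt, tgt_mask, memory, memory_mask)
    # Incremental pass: reuse the first 3 outputs as cache, recompute frame 4.
    cached, *_ = layer(tgt, tgt_mask, memory, memory_mask, cache=full[:, :-1, :])
    # With dropout disabled, `cached` should match `full` (shape (1, 4, size)).
    return full.shape, cached.shape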


================================================
FILE: cosyvoice/transformer/embedding.py
================================================
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Positonal Encoding Module."""

import math
from typing import Tuple, Union

import torch
import torch.nn.functional as F
import numpy as np


class PositionalEncoding(torch.nn.Module):
    """Positional encoding.

    :param int d_model: embedding dim
    :param float dropout_rate: dropout rate
    :param int max_len: maximum input length

    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
    """

    def __init__(self,
                 d_model: int,
                 dropout_rate: float,
                 max_len: int = 5000,
                 reverse: bool = False):
        """Construct an PositionalEncoding object."""
        super().__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.max_len = max_len

        self.pe = torch.zeros(self.max_len, self.d_model)
        position = torch.arange(0, self.max_len,
                                dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
            -(math.log(10000.0) / self.d_model))
        self.pe[:, 0::2] = torch.sin(position * div_term)
        self.pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = self.pe.unsqueeze(0)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input. Its shape is (batch, time, ...)
            offset (int, torch.tensor): position offset

        Returns:
            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            torch.Tensor: for compatibility to RelPositionalEncoding
        """

        self.pe = self.pe.to(x.device)
        pos_emb = self.position_encoding(offset, x.size(1), False)
        x = x * self.xscale + pos_emb
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int,
                          apply_dropout: bool = True) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        We apply dropout only once at the whole-utterance level in a
        non-streaming way, but this function will be called several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        # How to subscript a Union type:
        #   https://github.com/pytorch/pytorch/issues/69434
        if isinstance(offset, int):
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
            assert offset + size <= self.max_len
            pos_emb = self.pe[:, offset:offset + size]
        else:  # for batched streaming decoding on GPU
            assert torch.max(offset) + size <= self.max_len
            index = offset.unsqueeze(1) + \
                torch.arange(0, size).to(offset.device)  # B X T
            flag = index > 0
            # remove negative offset
            index = index * flag
            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model

        if apply_dropout:
            pos_emb = self.dropout(pos_emb)
        return pos_emb
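

# ---------------------------------------------------------------------------
# NOTE: hypothetical sketch, not part of the upstream file. It spot-checks the
# sinusoidal table against the formula in the class docstring and shows the
# slice that position_encoding() returns for streaming use.
# ---------------------------------------------------------------------------
def _demo_positional_encoding():
    pe = PositionalEncoding(d_model=8, dropout_rate=0.0, max_len=50)
    x = torch.zeros(1, 10, 8)
    y, pos_emb = pe(x)                    # y == x * sqrt(8) + pe[:, :10]
    # Streaming: fetch the same 10 positions from offset 0 without dropout.
    chunk = pe.position_encoding(offset=0, size=10, apply_dropout=False)
    assert torch.allclose(pos_emb, chunk)
    # PE(pos=3, 2i=0) == sin(3 / 10000**(0/8)) == sin(3)
    assert torch.isclose(chunk[0, 3, 0], torch.sin(torch.tensor(3.0)))
    return y.shape, chunk.shape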


class RelPositionalEncoding(PositionalEncoding):
    """Relative positional encoding module.
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
        """Initialize class."""
        super().__init__(d_model, dropout_rate, max_len, reverse=True)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute positional encoding.
        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).
        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).
            torch.Tensor: Positional embedding tensor (1, time, `*`).
        """
        self.pe = self.pe.to(x.device)
        x = x * self.xscale
        pos_emb = self.position_encoding(offset, x.size(1), False)
        return self.dropout(x), self.dropout(pos_emb)


class WhisperPositionalEncoding(PositionalEncoding):
    """ Sinusoids position encoding used in openai-whisper.encoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
        super().__init__(d_model, dropout_rate, max_len)
        self.xscale = 1.0
        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
        inv_timescales = torch.exp(-log_timescale_increment *
                                   torch.arange(d_model // 2))
        scaled_time = torch.arange(max_len)[:, np.newaxis] * \
            inv_timescales[np.newaxis, :]
        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
        delattr(self, "pe")
        self.register_buffer("pe", pe.unsqueeze(0))


class LearnablePositionalEncoding(PositionalEncoding):
    """ Learnable position encoding used in openai-whisper.decoder
    """

    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
        super().__init__(d_model, dropout_rate, max_len)
        # NOTE(xcsong): overwrite self.pe & self.xscale
        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
        self.xscale = 1.0


class NoPositionalEncoding(torch.nn.Module):
    """ No position encoding
    """

    def __init__(self, d_model: int, dropout_rate: float):
        super().__init__()
        self.d_model = d_model
        self.dropout = torch.nn.Dropout(p=dropout_rate)

    def forward(self,
                x: torch.Tensor,
                offset: Union[int, torch.Tensor] = 0) \
            -> Tuple[torch.Tensor, torch.Tensor]:
        """ Just return zero vector for interface compatibility
        """
        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
        return self.dropout(x), pos_emb

    def position_encoding(self, offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        return torch.zeros(1, size, self.d_model)


class EspnetRelPositionalEncoding(torch.nn.Module):
    """Relative positional encoding module (new implementation).

    Details can be found in https://github.com/espnet/espnet/pull/2816.

    See : Appendix B in https://arxiv.org/abs/1901.02860

    Args:
        d_model (int): Embedding dimension.
        dropout_rate (float): Dropout rate.
        max_len (int): Maximum input length.

    """

    def __init__(self, d_model, dropout_rate, max_len=5000):
        """Construct an PositionalEncoding object."""
        super(EspnetRelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.xscale = math.sqrt(self.d_model)
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.pe = None
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Reset the positional encodings."""
        if self.pe is not None:
            # self.pe contains both positive and negative parts
            # the length of self.pe is 2 * input_len - 1
            if self.pe.size(1) >= x.size(1) * 2 - 1:
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        # Suppose `i` is the position of the query vector and `j` the position
        # of the key vector. We use positive relative positions when keys
        # are to the left (i>j) and negative relative positions otherwise (i<j).
        pe_positive = torch.zeros(x.size(1), self.d_model)
        pe_negative = torch.zeros(x.size(1), self.d_model)
        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)

        # Reverse the order of positive indices and concat both positive and
        # negative indices. This is used to support the shifting trick
        # as in https://arxiv.org/abs/1901.02860
        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        pe_negative = pe_negative[1:].unsqueeze(0)
        pe = torch.cat([pe_positive, pe_negative], dim=1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
        """Add positional encoding.

        Args:
            x (torch.Tensor): Input tensor (batch, time, `*`).

        Returns:
            torch.Tensor: Encoded tensor (batch, time, `*`).

        """
        self.extend_pe(x)
        x = x * self.xscale
        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
        return self.dropout(x), self.dropout(pos_emb)

    def position_encoding(self,
                          offset: Union[int, torch.Tensor],
                          size: int) -> torch.Tensor:
        """ For getting encoding in a streaming fashion

        Attention!!!!!
        We apply dropout only once at the whole-utterance level in a
        non-streaming way, but this function will be called several times with
        increasing input size in a streaming scenario, so the dropout will
        be applied several times.

        Args:
            offset (int or torch.tensor): start offset
            size (int): required size of position encoding

        Returns:
            torch.Tensor: Corresponding encoding
        """
        pos_emb = self.pe[
            :,
            self.pe.size(1) // 2 - size + 1 : self.pe.size(1) // 2 + size,
        ]
        return pos_emb
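

# ---------------------------------------------------------------------------
# NOTE: hypothetical sketch, not part of the upstream file. It illustrates the
# layout built by EspnetRelPositionalEncoding.extend_pe(): a table of length
# 2 * max_len - 1 holding reversed positive relative positions followed by
# negative ones, from which forward() slices a window of length 2 * T - 1.
# ---------------------------------------------------------------------------
def _demo_espnet_rel_positional_encoding():
    rel_pe = EspnetRelPositionalEncoding(d_model=8, dropout_rate=0.0, max_len=20)
    assert rel_pe.pe.size(1) == 2 * 20 - 1                 # positive + negative halves
    x = torch.zeros(2, 6, 8)
    x_scaled, pos_emb = rel_pe(x)
    assert pos_emb.shape == (1, 2 * 6 - 1, 8)              # window centred on position 0
    return x_scaled.shape, pos_emb.shape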


================================================
FILE: cosyvoice/transformer/encoder.py
================================================
# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
"""Encoder definition."""
from typing import Tuple

import torch
import torch.utils.checkpoint as ckpt

from cosyvoice.transformer.convolution import ConvolutionModule
from cosyvoice.transformer.encoder_layer import TransformerEncoderLayer
from cosyvoice.transformer.encoder_layer import ConformerEncoderLayer
from cosyvoice.transformer.positionwise_feed_forward import PositionwiseFeedForward
from cosyvoice.utils.class_utils import (
    COSYVOICE_EMB_CLASSES,
    COSYVOICE_SUBSAMPLE_CLASSES,
    COSYVOICE_ATTENTION_CLASSES,
    COSYVOICE_ACTIVATION_CLASSES,
)
from cosyvoice.utils.mask import make_pad_mask
from cosyvoice.utils.mask import add_optional_chunk_mask


class BaseEncoder(torch.nn.Module):

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        gradient_checkpointing: bool = False,
    ):
        """
        Args:
            input_size (int): input dim
            output_size (int): dimension of attention
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of encoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether to use dynamic chunk size for
                training or not. You can only use a fixed chunk (chunk_size > 0)
                or a dynamic chunk size (use_dynamic_chunk = True)
            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
            use_dynamic_left_chunk (bool): whether to use dynamic left chunks
                in dynamic chunk training
            key_bias: whether to use bias in attention.linear_k, False for whisper models.
            gradient_checkpointing: rerunning a forward-pass segment for each
                checkpointed segment during backward.
        """
        super().__init__()
        self._output_size = output_size

        self.global_cmvn = global_cmvn
        self.embed = COSYVOICE_SUBSAMPLE_CLASSES[input_layer](
            input_size,
            output_size,
            dropout_rate,
            COSYVOICE_EMB_CLASSES[pos_enc_layer_type](output_size,
                                                      positional_dropout_rate),
        )

        self.normalize_before = normalize_before
        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk
        self.gradient_checkpointing = gradient_checkpointing

    def output_size(self) -> int:
        return self._output_size

    def forward(
        self,
        xs: torch.Tensor,
        xs_lens: torch.Tensor,
        decoding_chunk_size: int = 0,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed positions in tensor.

        Args:
            xs: padded input tensor (B, T, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding;
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor xs, and subsampled masks
            xs: padded output tensor (B, T' ~= T/subsample_rate, D)
            masks: torch.Tensor batch padding mask after subsample
                (B, 1, T' ~= T/subsample_rate)
        NOTE(xcsong):
            We pass the `__call__` method of the modules instead of `forward` to the
            checkpointing API because `__call__` attaches all the hooks of the module.
            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
        """
        T = xs.size(1)
        masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        xs, pos_emb, masks = self.embed(xs, masks)
        mask_pad = masks  # (B, 1, T/subsample_rate)
        chunk_masks = add_optional_chunk_mask(xs, masks,
                                              self.use_dynamic_chunk,
                                              self.use_dynamic_left_chunk,
                                              decoding_chunk_size,
                                              self.static_chunk_size,
                                              num_decoding_left_chunks)
        if self.gradient_checkpointing and self.training:
            xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
                                                  mask_pad)
        else:
            xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks

    def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
                       pos_emb: torch.Tensor,
                       mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        return xs

    @torch.jit.ignore(drop=True)
    def forward_layers_checkpointed(self, xs: torch.Tensor,
                                    chunk_masks: torch.Tensor,
                                    pos_emb: torch.Tensor,
                                    mask_pad: torch.Tensor) -> torch.Tensor:
        for layer in self.encoders:
            xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
                                                    chunk_masks, pos_emb,
                                                    mask_pad)
        return xs

    def forward_chunk(
        self,
        xs: torch.Tensor,
        offset: int,
        required_cache_size: int,
        att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
        att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """ Forward just one chunk

        Args:
            xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
                where `time == (chunk_size - 1) * subsample_rate + \
                        subsample.right_context + 1`
            offset (int): current offset in encoder output time stamp
            required_cache_size (int): cache size required for next chunk
                computation
                >=0: actual cache size
                <0: means all history cache is required
            att_cache (torch.Tensor): cache tensor for KEY & VALUE in
                transformer/conformer attention, with shape
                (elayers, head, cache_t1, d_k * 2), where
                `head * d_k == hidden-dim` and
                `cache_t1 == chunk_size * num_decoding_left_chunks`.
            cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
                (elayers, b=1, hidden-dim, cache_t2), where
                `cache_t2 == cnn.lorder - 1`

        Returns:
            torch.Tensor: output of current input xs,
                with shape (b=1, chunk_size, hidden-dim).
            torch.Tensor: new attention cache required for next chunk, with
                dynamic shape (elayers, head, ?, d_k * 2)
                depending on required_cache_size.
            torch.Tensor: new conformer cnn cache required for next chunk, with
                same shape as the original cnn_cache.

        """
        assert xs.size(0) == 1
        # tmp_masks is just for interface compatibility
        tmp_masks = torch.ones(1,
                               xs.size(1),
                               device=xs.device,
                               dtype=torch.bool)
        tmp_masks = tmp_masks.unsqueeze(1)
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
        # NOTE(xcsong): After  embed, shape(xs) is (b=1, chunk_size, hidden-dim)
        elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
        chunk_size = xs.size(1)
        attention_key_size = cache_t1 + chunk_size
        pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
                                               size=attention_key_size)
        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
            next_cache_start = attention_key_size
        else:
            next_cache_start = max(attention_key_size - required_cache_size, 0)
        r_att_cache = []
        r_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            # NOTE(xcsong): Before layer.forward
            #   shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
            #   shape(cnn_cache[i])       is (b=1, hidden-dim, cache_t2)
            xs, _, new_att_cache, new_cnn_cache = layer(
                xs,
                att_mask,
                pos_emb,
                att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
                cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
            # NOTE(xcsong): After layer.forward
            #   shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
            #   shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
            r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
            r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
        if self.normalize_before:
            xs = self.after_norm(xs)

        # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
        #   ? may be larger than cache_t1, it depends on required_cache_size
        r_att_cache = torch.cat(r_att_cache, dim=0)
        # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
        r_cnn_cache = torch.cat(r_cnn_cache, dim=0)

        return (xs, r_att_cache, r_cnn_cache)

    def forward_chunk_by_chunk(
        self,
        xs: torch.Tensor,
        decoding_chunk_size: int,
        num_decoding_left_chunks: int = -1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """ Forward input chunk by chunk with chunk_size like a streaming
            fashion

        Here we should pay special attention to computation cache in the
        streaming style forward chunk by chunk. Three things should be taken
        into account for computation in the current network:
            1. transformer/conformer encoder layers output cache
            2. convolution in conformer
            3. convolution in subsampling

        However, we don't implement a subsampling cache because:
            1. We can make the subsampling module output the right result by
               overlapping the input instead of caching left context; this
               wastes some computation, but subsampling only takes a very
               small fraction of the computation in the whole model.
            2. Typically, there are several convolution layers with subsampling
               in the subsampling module, and it is tricky and complicated to
               cache across convolution layers with different subsampling
               rates.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling; we would need to rewrite it to make it
               work with a cache, which is not preferred.
        Args:
            xs (torch.Tensor): (1, max_len, dim)
            decoding_chunk_size (int): decoding chunk size
        """
        assert decoding_chunk_size > 0
        # The model is trained by static or dynamic chunk
        assert self.static_chunk_size > 0 or self.use_dynamic_chunk
        subsampling = self.embed.subsampling_rate
        context = self.embed.right_context + 1  # Add current frame
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context
        num_frames = xs.size(1)
        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
        outputs = []
        offset = 0
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks

        # Feed forward overlap input step by step
        for cur in range(0, num_frames - context + 1, stride):
            end = min(cur + decoding_window, num_frames)
            chunk_xs = xs[:, cur:end, :]
            (y, att_cache,
             cnn_cache) = self.forward_chunk(chunk_xs, offset,
                                             required_cache_size, att_cache,
                                             cnn_cache)
            outputs.append(y)
            offset += y.size(1)
        ys = torch.cat(outputs, 1)
        masks = torch.ones((1, 1, ys.size(1)),
                           device=ys.device,
                           dtype=torch.bool)
        return ys, masks


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        pos_enc_layer_type: str = "abs_pos",
        normalize_before: bool = True,
        static_chunk_size: int = 0,
        use_dynamic_chunk: bool = False,
        global_cmvn: torch.nn.Module = None,
        use_dynamic_left_chunk: bool = False,
        key_bias: bool = True,
        selfattention_layer_type: str = "selfattn",
        activation_type: str = "relu",
        gradient_checkpointing: bool = False,
    ):
        """ Construct TransformerEncoder

        See Encoder for the meaning of each parameter.
        """
        super().__init__(input_size, output_size, attention_heads,
                         linear_units, num_blocks, dropout_rate,
                         positional_dropout_rate, attention_dropout_rate,
                         input_layer, pos_enc_layer_type, normalize_before,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk, gradient_checkpointing)
        activation = COSYVOICE_ACTIVATION_CLASSES[activation_type]()
        self.encoders = torch.nn.ModuleList([
            TransformerEncoderLayer(
                output_size,
                COSYVOICE_ATTENTION_CLASSES[selfattention_layer_type](attention_heads,
                                                                      output_size,
                                                                      attention_dropout_rate,
                                                                      key_bias),
                PositionwiseFeedForward(output_size, linear_units,
                                        dropout_rate, activation),
                dropout_rate, normalize_before) for _ in range(num_blocks)
        ])
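

# ---------------------------------------------------------------------------
# NOTE: hypothetical sketch, not part of the upstream file. It compares the
# offline forward() with the streaming forward_chunk_by_chunk() documented in
# BaseEncoder above. It assumes the "linear" input layer maps to a
# no-subsampling front-end (subsampling_rate == 1) as in upstream WeNet, and
# static_chunk_size must be set (or dynamic chunks enabled) for streaming.
# ---------------------------------------------------------------------------
def _demo_transformer_encoder_streaming():
    enc = TransformerEncoder(input_size=80, output_size=64, attention_heads=4,
                             linear_units=128, num_blocks=2,
                             dropout_rate=0.0, positional_dropout_rate=0.0,
                             input_layer="linear", static_chunk_size=4)
    enc.eval()
    xs = torch.randn(1, 16, 80)
    xs_lens = torch.tensor([16])
    with torch.no_grad():
        # Offline: full-utterance forward, chunk-masked by static_chunk_size.
        ys_full, masks = enc(xs, xs_lens)
        # Streaming: the same utterance processed 4 frames at a time with caches.
        ys_stream, _ = enc.forward_chunk_by_chunk(xs, decoding_chunk_size=4)
    return ys_full.shape, ys_stream.shape   # both (1, 16, 64) with no subsampling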


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        
SYMBOL INDEX (1094 symbols across 65 files)

FILE: api.py
  class Settings (line 17) | class Settings(BaseSettings):
  class SpeechRequest (line 36) | class SpeechRequest(BaseModel):
  function lifespan (line 47) | async def lifespan(app: FastAPI):
  function get_models (line 63) | async def get_models(request: Request):
  function speach_endpoint (line 78) | async def speach_endpoint(request: Request, payload: SpeechRequest):

FILE: batch_inference.py
  function process_batch (line 11) | def process_batch(csv_file, speaker_prompt_audio_folder, output_audio_fo...
  function main (line 53) | def main():

FILE: cosyvoice/bin/inference.py
  function get_args (line 31) | def get_args():
  function main (line 54) | def main():

FILE: cosyvoice/bin/train.py
  function get_args (line 38) | def get_args():
  function main (line 85) | def main():

FILE: cosyvoice/cli/cosyvoice.py
  class CosyVoice (line 21) | class CosyVoice:
    method __init__ (line 23) | def __init__(self, model_dir):
    method list_avaliable_spks (line 43) | def list_avaliable_spks(self):
    method inference_sft (line 47) | def inference_sft(self, tts_text, spk_id):
    method inference_zero_shot (line 55) | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
    method inference_cross_lingual (line 64) | def inference_cross_lingual(self, tts_text, prompt_speech_16k):
    method inference_instruct (line 74) | def inference_instruct(self, tts_text, spk_id, instruct_text):

FILE: cosyvoice/cli/frontend.py
  class CosyVoiceFrontEnd (line 36) | class CosyVoiceFrontEnd:
    method __init__ (line 38) | def __init__(self,
    method _extract_text_token (line 72) | def _extract_text_token(self, text):
    method _extract_speech_token (line 78) | def _extract_speech_token(self, speech):
    method _extract_spk_embedding (line 86) | def _extract_spk_embedding(self, speech):
    method _extract_speech_feat (line 96) | def _extract_speech_feat(self, speech):
    method text_normalize (line 102) | def text_normalize(self, text, split=True):
    method frontend_sft (line 132) | def frontend_sft(self, tts_text, spk_id):
    method frontend_zero_shot (line 138) | def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
    method frontend_cross_lingual (line 153) | def frontend_cross_lingual(self, tts_text, prompt_speech_16k):
    method frontend_instruct (line 162) | def frontend_instruct(self, tts_text, spk_id, instruct_text):

FILE: cosyvoice/cli/model.py
  class CosyVoiceModel (line 16) | class CosyVoiceModel:
    method __init__ (line 18) | def __init__(self,
    method load (line 27) | def load(self, llm_model, flow_model, hift_model):
    method inference (line 35) | def inference(self, text, text_len, flow_embedding, llm_embedding=torc...

FILE: cosyvoice/dataset/dataset.py
  class Processor (line 27) | class Processor(IterableDataset):
    method __init__ (line 29) | def __init__(self, source, f, *args, **kw):
    method set_epoch (line 36) | def set_epoch(self, epoch):
    method __iter__ (line 39) | def __iter__(self):
    method apply (line 47) | def apply(self, f):
  class DistributedSampler (line 52) | class DistributedSampler:
    method __init__ (line 54) | def __init__(self, shuffle=True, partition=True):
    method update (line 60) | def update(self):
    method set_epoch (line 80) | def set_epoch(self, epoch):
    method sample (line 83) | def sample(self, data):
  class DataList (line 108) | class DataList(IterableDataset):
    method __init__ (line 110) | def __init__(self, lists, shuffle=True, partition=True):
    method set_epoch (line 114) | def set_epoch(self, epoch):
    method __iter__ (line 117) | def __iter__(self):
  function Dataset (line 126) | def Dataset(data_list_file,

FILE: cosyvoice/dataset/processor.py
  function parquet_opener (line 29) | def parquet_opener(data, mode='train', tts_data={}):
  function filter (line 57) | def filter(data,
  function resample (line 108) | def resample(data, resample_rate=22050, min_sample_rate=16000, mode='tra...
  function compute_fbank (line 136) | def compute_fbank(data,
  function parse_embedding (line 159) | def parse_embedding(data, normalize, mode='train'):
  function tokenize (line 177) | def tokenize(data, get_tokenizer, allowed_special, mode='train'):
  function shuffle (line 196) | def shuffle(data, shuffle_size=10000, mode='train'):
  function sort (line 220) | def sort(data, sort_size=500, mode='train'):
  function static_batch (line 248) | def static_batch(data, batch_size=16):
  function dynamic_batch (line 268) | def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
  function batch (line 297) | def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=...
  function padding (line 311) | def padding(data, use_spk_embedding, mode='train'):

FILE: cosyvoice/flow/decoder.py
  class ConditionalDecoder (line 21) | class ConditionalDecoder(nn.Module):
    method __init__ (line 22) | def __init__(
    method initialize_weights (line 130) | def initialize_weights(self):
    method forward (line 144) | def forward(self, x, mask, mu, t, spks=None, cond=None):

FILE: cosyvoice/flow/flow.py
  class MaskedDiffWithXvec (line 24) | class MaskedDiffWithXvec(torch.nn.Module):
    method __init__ (line 25) | def __init__(self,
    method forward (line 55) | def forward(
    method inference (line 100) | def inference(self,

FILE: cosyvoice/flow/flow_matching.py
  class ConditionalCFM (line 18) | class ConditionalCFM(BASECFM):
    method __init__ (line 19) | def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, ...
    method forward (line 34) | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, c...
    method solve_euler (line 58) | def solve_euler(self, x, t_span, mu, mask, spks, cond):
    method compute_loss (line 99) | def compute_loss(self, x1, mask, mu, spks=None, cond=None):

FILE: cosyvoice/flow/length_regulator.py
  class InterpolateRegulator (line 20) | class InterpolateRegulator(nn.Module):
    method __init__ (line 21) | def __init__(
    method forward (line 43) | def forward(self, x, ylens=None):

FILE: cosyvoice/hifigan/f0_predictor.py
  class ConvRNNF0Predictor (line 19) | class ConvRNNF0Predictor(nn.Module):
    method __init__ (line 20) | def __init__(self,
    method forward (line 52) | def forward(self, x: torch.Tensor) -> torch.Tensor:

FILE: cosyvoice/hifigan/generator.py
  class ResBlock (line 41) | class ResBlock(torch.nn.Module):
    method __init__ (line 43) | def __init__(
    method forward (line 89) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method remove_weight_norm (line 98) | def remove_weight_norm(self):
  class SineGen (line 103) | class SineGen(torch.nn.Module):
    method __init__ (line 119) | def __init__(self, samp_rate, harmonic_num=0,
    method _f02uv (line 129) | def _f02uv(self, f0):
    method forward (line 135) | def forward(self, f0):
  class SourceModuleHnNSF (line 168) | class SourceModuleHnNSF(torch.nn.Module):
    method __init__ (line 186) | def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine...
    method forward (line 201) | def forward(self, x):
  class HiFTGenerator (line 220) | class HiFTGenerator(nn.Module):
    method __init__ (line 225) | def __init__(
    method _f02source (line 317) | def _f02source(self, f0: torch.Tensor) -> torch.Tensor:
    method _stft (line 323) | def _stft(self, x):
    method _istft (line 331) | def _istft(self, magnitude, phase):
    method forward (line 338) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method remove_weight_norm (line 375) | def remove_weight_norm(self):
    method inference (line 390) | def inference(self, mel: torch.Tensor) -> torch.Tensor:

FILE: cosyvoice/llm/llm.py
  class TransformerLM (line 24) | class TransformerLM(torch.nn.Module):
    method __init__ (line 25) | def __init__(
    method encode (line 66) | def encode(
    method pad_unpad_sequence (line 76) | def pad_unpad_sequence(self, sos_eos_emb, embedding, text_token, text_...
    method forward (line 84) | def forward(
    method sampling_ids (line 132) | def sampling_ids(
    method inference (line 148) | def inference(

FILE: cosyvoice/transformer/activation.py
  class Swish (line 24) | class Swish(torch.nn.Module):
    method forward (line 27) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class Snake (line 34) | class Snake(nn.Module):
    method __init__ (line 50) | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha...
    method forward (line 73) | def forward(self, x):

FILE: cosyvoice/transformer/attention.py
  class MultiHeadedAttention (line 26) | class MultiHeadedAttention(nn.Module):
    method __init__ (line 36) | def __init__(self,
    method forward_qkv (line 53) | def forward_qkv(
    method forward_attention (line 82) | def forward_attention(
    method forward (line 129) | def forward(
  class RelPositionMultiHeadedAttention (line 200) | class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    method __init__ (line 209) | def __init__(self,
    method rel_shift (line 225) | def rel_shift(self, x):
    method forward (line 245) | def forward(

FILE: cosyvoice/transformer/convolution.py
  class ConvolutionModule (line 24) | class ConvolutionModule(nn.Module):
    method __init__ (line 27) | def __init__(self,
    method forward (line 90) | def forward(

FILE: cosyvoice/transformer/decoder.py
  class TransformerDecoder (line 33) | class TransformerDecoder(torch.nn.Module):
    method __init__ (line 58) | def __init__(
    method forward (line 116) | def forward(
    method forward_layers (line 169) | def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
    method forward_layers_checkpointed (line 178) | def forward_layers_checkpointed(self, x: torch.Tensor,
    method forward_one_step (line 187) | def forward_one_step(
    method tie_or_clone_weights (line 230) | def tie_or_clone_weights(self, jit_mode: bool = True):
  class BiTransformerDecoder (line 256) | class BiTransformerDecoder(torch.nn.Module):
    method __init__ (line 276) | def __init__(
    method forward (line 332) | def forward(
    method forward_one_step (line 367) | def forward_one_step(
    method tie_or_clone_weights (line 392) | def tie_or_clone_weights(self, jit_mode: bool = True):

FILE: cosyvoice/transformer/decoder_layer.py
  class DecoderLayer (line 22) | class DecoderLayer(nn.Module):
    method __init__ (line 41) | def __init__(
    method forward (line 62) | def forward(

FILE: cosyvoice/transformer/embedding.py
  class PositionalEncoding (line 26) | class PositionalEncoding(torch.nn.Module):
    method __init__ (line 37) | def __init__(self,
    method forward (line 59) | def forward(self,
    method position_encoding (line 79) | def position_encoding(self,
  class RelPositionalEncoding (line 120) | class RelPositionalEncoding(PositionalEncoding):
    method __init__ (line 129) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5...
    method forward (line 133) | def forward(self,
  class WhisperPositionalEncoding (line 150) | class WhisperPositionalEncoding(PositionalEncoding):
    method __init__ (line 154) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1...
  class LearnablePositionalEncoding (line 167) | class LearnablePositionalEncoding(PositionalEncoding):
    method __init__ (line 171) | def __init__(self, d_model: int, dropout_rate: float, max_len: int = 4...
  class NoPositionalEncoding (line 178) | class NoPositionalEncoding(torch.nn.Module):
    method __init__ (line 182) | def __init__(self, d_model: int, dropout_rate: float):
    method forward (line 187) | def forward(self,
    method position_encoding (line 196) | def position_encoding(self, offset: Union[int, torch.Tensor],
  class EspnetRelPositionalEncoding (line 201) | class EspnetRelPositionalEncoding(torch.nn.Module):
    method __init__ (line 215) | def __init__(self, d_model, dropout_rate, max_len=5000):
    method extend_pe (line 224) | def extend_pe(self, x):
    method forward (line 256) | def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0):
    method position_encoding (line 271) | def position_encoding(self,

FILE: cosyvoice/transformer/encoder.py
  class BaseEncoder (line 37) | class BaseEncoder(torch.nn.Module):
    method __init__ (line 39) | def __init__(
    method output_size (line 108) | def output_size(self) -> int:
    method forward (line 111) | def forward(
    method forward_layers (line 165) | def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
    method forward_layers_checkpointed (line 173) | def forward_layers_checkpointed(self, xs: torch.Tensor,
    method forward_chunk (line 183) | def forward_chunk(
    method forward_chunk_by_chunk (line 273) | def forward_chunk_by_chunk(
  class TransformerEncoder (line 336) | class TransformerEncoder(BaseEncoder):
    method __init__ (line 339) | def __init__(
  class ConformerEncoder (line 385) | class ConformerEncoder(BaseEncoder):
    method __init__ (line 388) | def __init__(

FILE: cosyvoice/transformer/encoder_layer.py
  class TransformerEncoderLayer (line 24) | class TransformerEncoderLayer(nn.Module):
    method __init__ (line 40) | def __init__(
    method forward (line 58) | def forward(
  class ConformerEncoderLayer (line 109) | class ConformerEncoderLayer(nn.Module):
    method __init__ (line 129) | def __init__(
    method forward (line 160) | def forward(

FILE: cosyvoice/transformer/label_smoothing_loss.py
  class LabelSmoothingLoss (line 21) | class LabelSmoothingLoss(nn.Module):
    method __init__ (line 54) | def __init__(self,
    method forward (line 68) | def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:

FILE: cosyvoice/transformer/positionwise_feed_forward.py
  class PositionwiseFeedForward (line 20) | class PositionwiseFeedForward(torch.nn.Module):
    method __init__ (line 33) | def __init__(
    method forward (line 47) | def forward(self, xs: torch.Tensor) -> torch.Tensor:
  class MoEFFNLayer (line 58) | class MoEFFNLayer(torch.nn.Module):
    method __init__ (line 75) | def __init__(
    method forward (line 91) | def forward(self, xs: torch.Tensor) -> torch.Tensor:

FILE: cosyvoice/transformer/subsampling.py
  class BaseSubsampling (line 23) | class BaseSubsampling(torch.nn.Module):
    method __init__ (line 25) | def __init__(self):
    method position_encoding (line 30) | def position_encoding(self, offset: Union[int, torch.Tensor],
  class EmbedinigNoSubsampling (line 35) | class EmbedinigNoSubsampling(BaseSubsampling):
    method __init__ (line 39) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 45) | def forward(
  class LinearNoSubsampling (line 69) | class LinearNoSubsampling(BaseSubsampling):
    method __init__ (line 79) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 92) | def forward(
  class Conv1dSubsampling2 (line 116) | class Conv1dSubsampling2(BaseSubsampling):
    method __init__ (line 128) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 145) | def forward(
  class Conv2dSubsampling4 (line 173) | class Conv2dSubsampling4(BaseSubsampling):
    method __init__ (line 183) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 202) | def forward(
  class Conv2dSubsampling6 (line 230) | class Conv2dSubsampling6(BaseSubsampling):
    method __init__ (line 239) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 256) | def forward(
  class Conv2dSubsampling8 (line 282) | class Conv2dSubsampling8(BaseSubsampling):
    method __init__ (line 292) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 311) | def forward(
  class LegacyLinearNoSubsampling (line 338) | class LegacyLinearNoSubsampling(BaseSubsampling):
    method __init__ (line 348) | def __init__(self, idim: int, odim: int, dropout_rate: float,
    method forward (line 362) | def forward(
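
Each subsampling module above trades time resolution for channel depth and subsamples the padding mask by the same factor. A sketch of the common Conv2dSubsampling4 pattern (a WeNet-style layout assumed for illustration; the repo classes additionally return positional encodings):

import torch

class Conv2dSubsampling4Sketch(torch.nn.Module):
    """Reduce the time axis by 4x with two stride-2 2D convolutions."""

    def __init__(self, idim: int, odim: int):
        super().__init__()
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2), torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2), torch.nn.ReLU(),
        )
        # the feature axis also shrinks twice (kernel=3, stride=2, no padding)
        self.out = torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)

    def forward(self, x: torch.Tensor, mask: torch.Tensor):
        # x: (batch, time, idim), mask: (batch, 1, time)
        x = self.conv(x.unsqueeze(1))              # (batch, odim, time', feat')
        b, c, t, f = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        return x, mask[:, :, 2::2][:, :, 2::2]     # subsample mask by 4x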

FILE: cosyvoice/utils/common.py
  function pad_list (line 25) | def pad_list(xs: List[torch.Tensor], pad_value: int):
  function th_accuracy (line 74) | def th_accuracy(pad_outputs: torch.Tensor, pad_targets: torch.Tensor,
  function get_padding (line 96) | def get_padding(kernel_size, dilation=1):
  function init_weights (line 100) | def init_weights(m, mean=0.0, std=0.01):
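
As a rough illustration of two of the small helpers listed above (a sketch; the repo versions may handle edge cases differently):

from typing import List
import torch

def pad_list_sketch(xs: List[torch.Tensor], pad_value: float) -> torch.Tensor:
    """Stack variable-length tensors into one padded (batch, max_len, ...) tensor."""
    max_len = max(x.size(0) for x in xs)
    out = xs[0].new_full((len(xs), max_len, *xs[0].shape[1:]), pad_value)
    for i, x in enumerate(xs):
        out[i, : x.size(0)] = x
    return out

def get_padding_sketch(kernel_size: int, dilation: int = 1) -> int:
    """'Same' padding for an odd-sized, possibly dilated 1D convolution."""
    return (kernel_size * dilation - dilation) // 2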

FILE: cosyvoice/utils/executor.py
  class Executor (line 26) | class Executor:
    method __init__ (line 28) | def __init__(self):
    method train_one_epoc (line 34) | def train_one_epoc(self, model, optimizer, scheduler, train_data_loade...
    method cv (line 83) | def cv(self, model, cv_data_loader, writer, info_dict, on_batch_end=Tr...

FILE: cosyvoice/utils/file_utils.py
  function read_lists (line 20) | def read_lists(list_file):
  function read_json_lists (line 27) | def read_json_lists(list_file):
  function load_wav (line 35) | def load_wav(wav, target_sr):
  function speed_change (line 43) | def speed_change(waveform, sample_rate, speed_factor: str):
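
A sketch of what a load_wav(wav, target_sr) helper of this kind typically does (torchaudio assumed; the downmix and resampling details here are illustrative, not copied from the file):

import torchaudio

def load_wav_sketch(wav_path: str, target_sr: int):
    """Load a waveform, downmix to mono, and resample to target_sr."""
    speech, sample_rate = torchaudio.load(wav_path)
    speech = speech.mean(dim=0, keepdim=True)   # (channels, T) -> (1, T)
    if sample_rate != target_sr:
        speech = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sr)(speech)
    return speech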

FILE: cosyvoice/utils/frontend_utils.py
  function contains_chinese (line 19) | def contains_chinese(text):
  function replace_corner_mark (line 24) | def replace_corner_mark(text):
  function remove_bracket (line 31) | def remove_bracket(text):
  function spell_out_number (line 40) | def spell_out_number(text: str, inflect_parser):
  function split_paragraph (line 63) | def split_paragraph(text: str, tokenize, lang="zh", token_max_n=80, toke...
  function replace_blank (line 116) | def replace_blank(text: str):

FILE: cosyvoice/utils/mask.py
  function subsequent_mask (line 53) | def subsequent_mask(
  function subsequent_chunk_mask (line 89) | def subsequent_chunk_mask(
  function add_optional_chunk_mask (line 127) | def add_optional_chunk_mask(xs: torch.Tensor,
  function make_pad_mask (line 201) | def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
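
The two most widely reused helpers here are the padding mask and the causal mask; a minimal sketch of their usual semantics (illustrative, not the file's exact code):

import torch

def make_pad_mask_sketch(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Boolean (batch, max_len) mask that is True at padded positions."""
    max_len = max_len if max_len > 0 else int(lengths.max().item())
    seq = torch.arange(max_len, device=lengths.device)
    return seq.unsqueeze(0) >= lengths.unsqueeze(1)

def subsequent_mask_sketch(size: int) -> torch.Tensor:
    """Lower-triangular mask: position i may attend to positions <= i."""
    return torch.tril(torch.ones(size, size, dtype=torch.bool))

# e.g. make_pad_mask_sketch(torch.tensor([2, 3])) ->
# tensor([[False, False,  True],
#         [False, False, False]])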

FILE: cosyvoice/utils/scheduler.py
  class WarmupLR (line 27) | class WarmupLR(_LRScheduler):
    method __init__ (line 44) | def __init__(
    method __repr__ (line 56) | def __repr__(self):
    method get_lr (line 59) | def get_lr(self):
    method set_step (line 70) | def set_step(self, step: int):
  class WarmupPolicy (line 74) | class WarmupPolicy(_LRScheduler):
    method __init__ (line 84) | def __init__(self,
    method get_lr (line 110) | def get_lr(self):
    method _get_warmup_lr (line 128) | def _get_warmup_lr(self, step):
    method _get_lr (line 132) | def _get_lr(self, step):
  class SquareRootConstantPolicy (line 137) | class SquareRootConstantPolicy(_LRScheduler):
    method __init__ (line 147) | def __init__(self,
    method get_lr (line 175) | def get_lr(self):
    method _get_lr (line 193) | def _get_lr(self, step):
  class WarmupHoldPolicy (line 198) | class WarmupHoldPolicy(WarmupPolicy):
    method __init__ (line 212) | def __init__(
    method get_lr (line 257) | def get_lr(self):
  class WarmupAnnealHoldPolicy (line 282) | class WarmupAnnealHoldPolicy(_LRScheduler):
    method __init__ (line 295) | def __init__(
    method get_lr (line 340) | def get_lr(self):
    method _get_warmup_lr (line 365) | def _get_warmup_lr(self, step):
    method _get_constant_lr (line 369) | def _get_constant_lr(self, step):
    method _get_lr (line 372) | def _get_lr(self, step):
  function _squareroot_annealing (line 377) | def _squareroot_annealing(initial_lr, step, max_steps, min_lr):
  function _square_annealing (line 384) | def _square_annealing(initial_lr, step, max_steps, min_lr):
  function _cosine_annealing (line 391) | def _cosine_annealing(initial_lr, step, max_steps, min_lr):
  function _linear_warmup_with_cosine_annealing (line 397) | def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step,
  function _poly_decay (line 421) | def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle):
  function _noam_hold_annealing (line 433) | def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps,
  class SquareAnnealing (line 444) | class SquareAnnealing(WarmupPolicy):
    method __init__ (line 446) | def __init__(self,
    method _get_lr (line 459) | def _get_lr(self, step):
  class SquareRootAnnealing (line 471) | class SquareRootAnnealing(WarmupPolicy):
    method __init__ (line 473) | def __init__(self,
    method _get_lr (line 486) | def _get_lr(self, step):
  class CosineAnnealing (line 497) | class CosineAnnealing(WarmupAnnealHoldPolicy):
    method __init__ (line 499) | def __init__(self,
    method _get_lr (line 512) | def _get_lr(self, step):
    method _get_warmup_lr (line 532) | def _get_warmup_lr(self, step):
    method _get_constant_lr (line 539) | def _get_constant_lr(self, step):
    method _get_linear_warmup_with_cosine_annealing_lr (line 543) | def _get_linear_warmup_with_cosine_annealing_lr(self, step):
  class NoamAnnealing (line 558) | class NoamAnnealing(_LRScheduler):
    method __init__ (line 560) | def __init__(self,
    method get_lr (line 589) | def get_lr(self):
    method _noam_annealing (line 611) | def _noam_annealing(self, initial_lr, step):
  class NoamHoldAnnealing (line 624) | class NoamHoldAnnealing(WarmupHoldPolicy):
    method __init__ (line 626) | def __init__(self,
    method _get_lr (line 694) | def _get_lr(self, step):
    method set_step (line 716) | def set_step(self, step: int):
  class ConstantLR (line 720) | class ConstantLR(_LRScheduler):
    method __init__ (line 727) | def __init__(
    method get_lr (line 735) | def get_lr(self):
    method set_step (line 738) | def set_step(self, step: int):
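
Most of these policies share one shape: ramp the learning rate up over a warmup window, then decay it. A sketch of the classic inverse-square-root (Noam-style) rule that WarmupLR follows in general form (constants here are illustrative):

def warmup_lr_sketch(base_lr: float, step: int, warmup_steps: int = 25000) -> float:
    """Linear warmup to base_lr at step == warmup_steps, then step**-0.5 decay."""
    step = max(step, 1)
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# warmup_lr_sketch(1e-3, 25000) == 1e-3  (peak at the end of warmup)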

FILE: cosyvoice/utils/train_utils.py
  function init_distributed (line 40) | def init_distributed(args):
  function init_dataset_and_dataloader (line 54) | def init_dataset_and_dataloader(args, configs):
  function check_modify_and_save_config (line 73) | def check_modify_and_save_config(args, configs):
  function wrap_cuda_model (line 94) | def wrap_cuda_model(args, model):
  function init_optimizer_and_scheduler (line 111) | def init_optimizer_and_scheduler(args, configs, model):
  function init_summarywriter (line 145) | def init_summarywriter(args):
  function save_model (line 153) | def save_model(model, model_name, info_dict):
  function cosyvoice_join (line 175) | def cosyvoice_join(group_join, info_dict):
  function batch_forward (line 196) | def batch_forward(model, batch, info_dict):
  function batch_backward (line 217) | def batch_backward(model, info_dict):
  function update_parameter_and_lr (line 228) | def update_parameter_and_lr(model, optimizer, scheduler, info_dict):
  function log_per_step (line 245) | def log_per_step(writer, info_dict):
  function log_per_save (line 274) | def log_per_save(writer, info_dict):

FILE: single_inference.py
  class CustomCosyVoiceFrontEnd (line 29) | class CustomCosyVoiceFrontEnd(CosyVoiceFrontEnd):
    method text_normalize_new (line 30) | def text_normalize_new(self,text, split=False):
    method frontend_zero_shot (line 112) | def frontend_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
    method frontend_zero_shot_dual (line 127) | def frontend_zero_shot_dual(self, tts_text, prompt_text, prompt_speech...
  class CustomCosyVoiceModel (line 150) | class CustomCosyVoiceModel(CosyVoiceModel):
    method __init__ (line 152) | def __init__(self,
    method load (line 161) | def load(self, llm_model, flow_model, hift_model):
    method inference (line 169) | def inference(self, text, text_len, flow_embedding, llm_embedding=torc...
  class CustomCosyVoice (line 200) | class CustomCosyVoice:
    method __init__ (line 202) | def __init__(self, model_dir):
    method list_avaliable_spks (line 227) | def list_avaliable_spks(self):
    method inference_sft (line 231) | def inference_sft(self, tts_text, spk_id):
    method inference_zero_shot (line 239) | def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k):
    method inference_zero_shot_no_unit_condition_no_normalize (line 248) | def inference_zero_shot_no_unit_condition_no_normalize(self, tts_text,...
    method inference_zero_shot_no_normalize (line 266) | def inference_zero_shot_no_normalize(self, tts_text, prompt_text, prom...
  function transcribe_audio (line 279) | def transcribe_audio(audio_file):
  function get_bopomofo_rare (line 294) | def get_bopomofo_rare(text, converter):
  function parse_transcript (line 327) | def parse_transcript(text, end):
  function single_inference (line 356) | def single_inference(speaker_prompt_audio_path, content_to_synthesize, o...
  function main (line 391) | def main():

FILE: third_party/Matcha-TTS/matcha/app.py
  function MATCHA_TTS_LOC (line 33) | def MATCHA_TTS_LOC(x):
  function VOCODER_LOC (line 37) | def VOCODER_LOC(x):
  function load_model (line 66) | def load_model(model_name, vocoder_name):
  function load_model_ui (line 72) | def load_model_ui(model_type, textbox):
  function process_text_gradio (line 102) | def process_text_gradio(text):
  function synthesise_mel (line 108) | def synthesise_mel(text, text_length, n_timesteps, temperature, length_s...
  function multispeaker_example_cacher (line 125) | def multispeaker_example_cacher(text, n_timesteps, mel_temp, length_scal...
  function ljspeech_example_cacher (line 137) | def ljspeech_example_cacher(text, n_timesteps, mel_temp, length_scale, s...
  function main (line 149) | def main():

FILE: third_party/Matcha-TTS/matcha/cli.py
  function plot_spectrogram_to_numpy (line 37) | def plot_spectrogram_to_numpy(spectrogram, filename):
  function process_text (line 48) | def process_text(i: int, text: str, device: torch.device):
  function get_texts (line 62) | def get_texts(args):
  function assert_required_models_available (line 71) | def assert_required_models_available(args):
  function load_hifigan (line 84) | def load_hifigan(checkpoint_path, device):
  function load_vocoder (line 93) | def load_vocoder(vocoder_name, checkpoint_path, device):
  function load_matcha (line 108) | def load_matcha(model_name, checkpoint_path, device):
  function to_waveform (line 117) | def to_waveform(mel, vocoder, denoiser=None):
  function save_to_folder (line 125) | def save_to_folder(filename: str, output: dict, folder: str):
  function validate_args (line 134) | def validate_args(args):
  function validate_args_for_multispeaker_model (line 163) | def validate_args_for_multispeaker_model(args):
  function validate_args_for_single_speaker_model (line 188) | def validate_args_for_single_speaker_model(args):
  function cli (line 208) | def cli():
  class BatchedSynthesisDataset (line 292) | class BatchedSynthesisDataset(torch.utils.data.Dataset):
    method __init__ (line 293) | def __init__(self, processed_texts):
    method __len__ (line 296) | def __len__(self):
    method __getitem__ (line 299) | def __getitem__(self, idx):
  function batched_collate_fn (line 303) | def batched_collate_fn(batch):
  function batched_synthesis (line 316) | def batched_synthesis(args, device, model, vocoder, denoiser, texts, spk):
  function unbatched_synthesis (line 358) | def unbatched_synthesis(args, device, model, vocoder, denoiser, texts, s...
  function print_config (line 397) | def print_config(args):
  function get_device (line 407) | def get_device(args):

FILE: third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py
  function parse_filelist (line 15) | def parse_filelist(filelist_path, split_char="|"):
  class TextMelDataModule (line 21) | class TextMelDataModule(LightningDataModule):
    method __init__ (line 22) | def __init__(  # pylint: disable=unused-argument
    method setup (line 49) | def setup(self, stage: Optional[str] = None):  # pylint: disable=unuse...
    method train_dataloader (line 88) | def train_dataloader(self):
    method val_dataloader (line 98) | def val_dataloader(self):
    method teardown (line 108) | def teardown(self, stage: Optional[str] = None):
    method state_dict (line 112) | def state_dict(self):  # pylint: disable=no-self-use
    method load_state_dict (line 116) | def load_state_dict(self, state_dict: Dict[str, Any]):
  class TextMelDataset (line 121) | class TextMelDataset(torch.utils.data.Dataset):
    method __init__ (line 122) | def __init__(
    method get_datapoint (line 156) | def get_datapoint(self, filepath_and_text):
    method get_mel (line 172) | def get_mel(self, filepath):
    method get_text (line 189) | def get_text(self, text, add_blank=True):
    method __getitem__ (line 196) | def __getitem__(self, index):
    method __len__ (line 200) | def __len__(self):
  class TextMelBatchCollate (line 204) | class TextMelBatchCollate:
    method __init__ (line 205) | def __init__(self, n_spks):
    method __call__ (line 208) | def __call__(self, batch):

FILE: third_party/Matcha-TTS/matcha/hifigan/denoiser.py
  class Denoiser (line 7) | class Denoiser(torch.nn.Module):
    method __init__ (line 10) | def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_lengt...
    method forward (line 59) | def forward(self, audio, strength=0.0005):

FILE: third_party/Matcha-TTS/matcha/hifigan/env.py
  class AttrDict (line 7) | class AttrDict(dict):
    method __init__ (line 8) | def __init__(self, *args, **kwargs):
  function build_env (line 13) | def build_env(config, config_name, path):

FILE: third_party/Matcha-TTS/matcha/hifigan/meldataset.py
  function load_wav (line 17) | def load_wav(full_path):
  function dynamic_range_compression (line 22) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
  function dynamic_range_decompression (line 26) | def dynamic_range_decompression(x, C=1):
  function dynamic_range_compression_torch (line 30) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
  function dynamic_range_decompression_torch (line 34) | def dynamic_range_decompression_torch(x, C=1):
  function spectral_normalize_torch (line 38) | def spectral_normalize_torch(magnitudes):
  function spectral_de_normalize_torch (line 43) | def spectral_de_normalize_torch(magnitudes):
  function mel_spectrogram (line 52) | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_siz...
  function get_dataset_filelist (line 92) | def get_dataset_filelist(a):
  class MelDataset (line 105) | class MelDataset(torch.utils.data.Dataset):
    method __init__ (line 106) | def __init__(
    method __getitem__ (line 146) | def __getitem__(self, index):
    method __len__ (line 216) | def __len__(self):
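
The compression/decompression helpers listed here are the standard HiFi-GAN log-dynamic-range pair; a sketch of the usual definitions (illustrative):

import torch

def dynamic_range_compression_torch_sketch(x, C=1, clip_val=1e-5):
    """Log-compress spectrogram magnitudes (clamped to avoid log(0))."""
    return torch.log(torch.clamp(x, min=clip_val) * C)

def dynamic_range_decompression_torch_sketch(x, C=1):
    """Inverse of the compression above."""
    return torch.exp(x) / C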

FILE: third_party/Matcha-TTS/matcha/hifigan/models.py
  class ResBlock1 (line 14) | class ResBlock1(torch.nn.Module):
    method __init__ (line 15) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
    method forward (line 90) | def forward(self, x):
    method remove_weight_norm (line 99) | def remove_weight_norm(self):
  class ResBlock2 (line 106) | class ResBlock2(torch.nn.Module):
    method __init__ (line 107) | def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
    method forward (line 136) | def forward(self, x):
    method remove_weight_norm (line 143) | def remove_weight_norm(self):
  class Generator (line 148) | class Generator(torch.nn.Module):
    method __init__ (line 149) | def __init__(self, h):
    method forward (line 181) | def forward(self, x):
    method remove_weight_norm (line 199) | def remove_weight_norm(self):
  class DiscriminatorP (line 209) | class DiscriminatorP(torch.nn.Module):
    method __init__ (line 210) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
    method forward (line 225) | def forward(self, x):
  class MultiPeriodDiscriminator (line 247) | class MultiPeriodDiscriminator(torch.nn.Module):
    method __init__ (line 248) | def __init__(self):
    method forward (line 260) | def forward(self, y, y_hat):
  class DiscriminatorS (line 276) | class DiscriminatorS(torch.nn.Module):
    method __init__ (line 277) | def __init__(self, use_spectral_norm=False):
    method forward (line 293) | def forward(self, x):
  class MultiScaleDiscriminator (line 306) | class MultiScaleDiscriminator(torch.nn.Module):
    method __init__ (line 307) | def __init__(self):
    method forward (line 318) | def forward(self, y, y_hat):
  function feature_loss (line 337) | def feature_loss(fmap_r, fmap_g):
  function discriminator_loss (line 346) | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
  function generator_loss (line 360) | def generator_loss(disc_outputs):
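
The three loss functions at the end of this file are the usual HiFi-GAN adversarial objectives; a sketch of the standard LSGAN-style formulation (illustrative):

import torch

def feature_loss_sketch(fmap_r, fmap_g):
    """L1 distance between real and generated discriminator feature maps."""
    loss = 0.0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))
    return loss * 2

def discriminator_loss_sketch(real_outputs, generated_outputs):
    """Push real scores toward 1 and fake scores toward 0."""
    loss = 0.0
    for dr, dg in zip(real_outputs, generated_outputs):
        loss += torch.mean((1 - dr) ** 2) + torch.mean(dg ** 2)
    return loss

def generator_loss_sketch(generated_outputs):
    """Generator side: push fake scores toward 1."""
    return sum(torch.mean((1 - dg) ** 2) for dg in generated_outputs)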

FILE: third_party/Matcha-TTS/matcha/hifigan/xutils.py
  function plot_spectrogram (line 14) | def plot_spectrogram(spectrogram):
  function init_weights (line 25) | def init_weights(m, mean=0.0, std=0.01):
  function apply_weight_norm (line 31) | def apply_weight_norm(m):
  function get_padding (line 37) | def get_padding(kernel_size, dilation=1):
  function load_checkpoint (line 41) | def load_checkpoint(filepath, device):
  function save_checkpoint (line 49) | def save_checkpoint(filepath, obj):
  function scan_checkpoint (line 55) | def scan_checkpoint(cp_dir, prefix):

FILE: third_party/Matcha-TTS/matcha/models/baselightningmodule.py
  class BaseLightningClass (line 19) | class BaseLightningClass(LightningModule, ABC):
    method update_data_statistics (line 20) | def update_data_statistics(self, data_statistics):
    method configure_optimizers (line 30) | def configure_optimizers(self) -> Any:
    method get_losses (line 56) | def get_losses(self, batch):
    method on_load_checkpoint (line 75) | def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
    method training_step (line 78) | def training_step(self, batch: Any, batch_idx: int):
    method validation_step (line 127) | def validation_step(self, batch: Any, batch_idx: int):
    method on_validation_end (line 167) | def on_validation_end(self) -> None:
    method on_before_optimizer_step (line 208) | def on_before_optimizer_step(self, optimizer):

FILE: third_party/Matcha-TTS/matcha/models/components/decoder.py
  class SinusoidalPosEmb (line 14) | class SinusoidalPosEmb(torch.nn.Module):
    method __init__ (line 15) | def __init__(self, dim):
    method forward (line 20) | def forward(self, x, scale=1000):
  class Block1D (line 32) | class Block1D(torch.nn.Module):
    method __init__ (line 33) | def __init__(self, dim, dim_out, groups=8):
    method forward (line 41) | def forward(self, x, mask):
  class ResnetBlock1D (line 46) | class ResnetBlock1D(torch.nn.Module):
    method __init__ (line 47) | def __init__(self, dim, dim_out, time_emb_dim, groups=8):
    method forward (line 56) | def forward(self, x, mask, time_emb):
  class Downsample1D (line 64) | class Downsample1D(nn.Module):
    method __init__ (line 65) | def __init__(self, dim):
    method forward (line 69) | def forward(self, x):
  class TimestepEmbedding (line 73) | class TimestepEmbedding(nn.Module):
    method __init__ (line 74) | def __init__(
    method forward (line 105) | def forward(self, sample, condition=None):
  class Upsample1D (line 120) | class Upsample1D(nn.Module):
    method __init__ (line 134) | def __init__(self, channels, use_conv=False, use_conv_transpose=True, ...
    method forward (line 148) | def forward(self, inputs):
  class ConformerWrapper (line 161) | class ConformerWrapper(ConformerBlock):
    method __init__ (line 162) | def __init__(  # pylint: disable=useless-super-delegation
    method forward (line 189) | def forward(
  class Decoder (line 200) | class Decoder(nn.Module):
    method __init__ (line 201) | def __init__(
    method get_block (line 319) | def get_block(block_type, dim, attention_head_dim, num_heads, dropout,...
    method initialize_weights (line 345) | def initialize_weights(self):
    method forward (line 363) | def forward(self, x, mask, mu, t, spks=None, cond=None):

FILE: third_party/Matcha-TTS/matcha/models/components/flow_matching.py
  class BASECFM (line 12) | class BASECFM(torch.nn.Module, ABC):
    method __init__ (line 13) | def __init__(
    method forward (line 33) | def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, c...
    method solve_euler (line 55) | def solve_euler(self, x, t_span, mu, mask, spks, cond):
    method compute_loss (line 87) | def compute_loss(self, x1, mask, mu, spks=None, cond=None):
  class CFM (line 121) | class CFM(BASECFM):
    method __init__ (line 122) | def __init__(self, in_channels, out_channel, cfm_params, decoder_param...
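
solve_euler integrates the learned flow from noise toward mel frames with fixed Euler steps. A generic sketch of that pattern (the real method conditions its estimator network on mu, mask, spks and cond; here the vector field is abstracted into one callable, and net below is a placeholder):

import torch

def solve_euler_sketch(x: torch.Tensor, t_span: torch.Tensor, vector_field) -> torch.Tensor:
    """Fixed-step Euler integration of dx/dt = vector_field(x, t) over t_span."""
    t = t_span[0]
    for i in range(1, len(t_span)):
        dt = t_span[i] - t
        x = x + dt * vector_field(x, t)   # one Euler step
        t = t_span[i]
    return x

# e.g. 10 uniform steps from t=0 to t=1 (net is hypothetical):
# x1 = solve_euler_sketch(x0, torch.linspace(0, 1, 11), lambda x, t: net(x, t))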

FILE: third_party/Matcha-TTS/matcha/models/components/text_encoder.py
  class LayerNorm (line 15) | class LayerNorm(nn.Module):
    method __init__ (line 16) | def __init__(self, channels, eps=1e-4):
    method forward (line 24) | def forward(self, x):
  class ConvReluNorm (line 36) | class ConvReluNorm(nn.Module):
    method __init__ (line 37) | def __init__(self, in_channels, hidden_channels, out_channels, kernel_...
    method forward (line 60) | def forward(self, x, x_mask):
  class DurationPredictor (line 70) | class DurationPredictor(nn.Module):
    method __init__ (line 71) | def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
    method forward (line 84) | def forward(self, x, x_mask):
  class RotaryPositionalEmbeddings (line 97) | class RotaryPositionalEmbeddings(nn.Module):
    method __init__ (line 107) | def __init__(self, d: int, base: int = 10_000):
    method _build_cache (line 119) | def _build_cache(self, x: torch.Tensor):
    method _neg_half (line 147) | def _neg_half(self, x: torch.Tensor):
    method forward (line 154) | def forward(self, x: torch.Tensor):
  class MultiHeadAttention (line 175) | class MultiHeadAttention(nn.Module):
    method __init__ (line 176) | def __init__(
    method forward (line 216) | def forward(self, x, c, attn_mask=None):
    method attention (line 226) | def attention(self, query, key, value, mask=None):
    method _attention_bias_proximal (line 249) | def _attention_bias_proximal(length):
  class FFN (line 255) | class FFN(nn.Module):
    method __init__ (line 256) | def __init__(self, in_channels, out_channels, filter_channels, kernel_...
    method forward (line 268) | def forward(self, x, x_mask):
  class Encoder (line 276) | class Encoder(nn.Module):
    method __init__ (line 277) | def __init__(
    method forward (line 314) | def forward(self, x, x_mask):
  class TextEncoder (line 328) | class TextEncoder(nn.Module):
    method __init__ (line 329) | def __init__(
    method forward (line 378) | def forward(self, x, x_lengths, spks=None):

FILE: third_party/Matcha-TTS/matcha/models/components/transformer.py
  class SnakeBeta (line 17) | class SnakeBeta(nn.Module):
    method __init__ (line 35) | def __init__(self, in_features, out_features, alpha=1.0, alpha_trainab...
    method forward (line 64) | def forward(self, x):
  class FeedForward (line 83) | class FeedForward(nn.Module):
    method __init__ (line 96) | def __init__(
    method forward (line 131) | def forward(self, hidden_states):
  class BasicTransformerBlock (line 138) | class BasicTransformerBlock(nn.Module):
    method __init__ (line 159) | def __init__(
    method set_chunk_feed_forward (line 238) | def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
    method forward (line 243) | def forward(

FILE: third_party/Matcha-TTS/matcha/models/matcha_tts.py
  class MatchaTTS (line 23) | class MatchaTTS(BaseLightningClass):  # 🍵
    method __init__ (line 24) | def __init__(
    method synthesise (line 74) | def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, spks=...
    method forward (line 150) | def forward(self, x, x_lengths, y, y_lengths, spks=None, out_size=None...

FILE: third_party/Matcha-TTS/matcha/onnx/export.py
  class MatchaWithVocoder (line 22) | class MatchaWithVocoder(LightningModule):
    method __init__ (line 23) | def __init__(self, matcha, vocoder):
    method forward (line 28) | def forward(self, x, x_lengths, scales, spks=None):
  function get_exportable_module (line 35) | def get_exportable_module(matcha, vocoder, n_timesteps):
  function get_inputs (line 63) | def get_inputs(is_multi_speaker):
  function main (line 91) | def main():

FILE: third_party/Matcha-TTS/matcha/onnx/infer.py
  function validate_args (line 15) | def validate_args(args):
  function write_wavs (line 24) | def write_wavs(model, inputs, output_dir, external_vocoder=None):
  function write_mels (line 66) | def write_mels(model, inputs, output_dir):
  function main (line 85) | def main():

FILE: third_party/Matcha-TTS/matcha/text/__init__.py
  function text_to_sequence (line 10) | def text_to_sequence(text, cleaner_names):
  function cleaned_text_to_sequence (line 27) | def cleaned_text_to_sequence(cleaned_text):
  function sequence_to_text (line 38) | def sequence_to_text(sequence):
  function _clean_text (line 47) | def _clean_text(text, cleaner_names):

FILE: third_party/Matcha-TTS/matcha/text/cleaners.py
  function expand_abbreviations (line 66) | def expand_abbreviations(text):
  function lowercase (line 72) | def lowercase(text):
  function collapse_whitespace (line 76) | def collapse_whitespace(text):
  function convert_to_ascii (line 80) | def convert_to_ascii(text):
  function basic_cleaners (line 84) | def basic_cleaners(text):
  function transliteration_cleaners (line 91) | def transliteration_cleaners(text):
  function english_cleaners2 (line 99) | def english_cleaners2(text):
  function english_cleaners_piper (line 109) | def english_cleaners_piper(text):
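
The cleaner pipeline is a chain of small string transforms; a sketch of the basic ones (the english_cleaners2/english_cleaners_piper variants additionally phonemize, which is omitted here):

import re
from unidecode import unidecode

_whitespace_re = re.compile(r"\s+")

def lowercase_sketch(text: str) -> str:
    return text.lower()

def collapse_whitespace_sketch(text: str) -> str:
    return _whitespace_re.sub(" ", text)

def transliteration_cleaners_sketch(text: str) -> str:
    """Lowercase and transliterate non-ASCII text, e.g. 'café' -> 'cafe'."""
    return collapse_whitespace_sketch(lowercase_sketch(unidecode(text)))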

FILE: third_party/Matcha-TTS/matcha/text/numbers.py
  function _remove_commas (line 16) | def _remove_commas(m):
  function _expand_decimal_point (line 20) | def _expand_decimal_point(m):
  function _expand_dollars (line 24) | def _expand_dollars(m):
  function _expand_ordinal (line 45) | def _expand_ordinal(m):
  function _expand_number (line 49) | def _expand_number(m):
  function normalize_numbers (line 64) | def normalize_numbers(text):
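
These helpers follow the usual Tacotron-style English number normalization built on the inflect package; a rough sketch of the idea (the regex coverage here is simplified relative to the file):

import re
import inflect

_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9,]+[0-9])")
_number_re = re.compile(r"[0-9]+")

def normalize_numbers_sketch(text: str) -> str:
    """Spell out digit sequences, e.g. '12 cats' -> 'twelve cats'."""
    text = _comma_number_re.sub(lambda m: m.group(1).replace(",", ""), text)
    return _number_re.sub(lambda m: _inflect.number_to_words(m.group(0), andword=""), text)

# normalize_numbers_sketch("I have 12 cats") -> "I have twelve cats"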

FILE: third_party/Matcha-TTS/matcha/train.py
  function train (line 35) | def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
  function main (line 101) | def main(cfg: DictConfig) -> Optional[float]:

FILE: third_party/Matcha-TTS/matcha/utils/audio.py
  function load_wav (line 10) | def load_wav(full_path):
  function dynamic_range_compression (line 15) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
  function dynamic_range_decompression (line 19) | def dynamic_range_decompression(x, C=1):
  function dynamic_range_compression_torch (line 23) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
  function dynamic_range_decompression_torch (line 27) | def dynamic_range_decompression_torch(x, C=1):
  function spectral_normalize_torch (line 31) | def spectral_normalize_torch(magnitudes):
  function spectral_de_normalize_torch (line 36) | def spectral_de_normalize_torch(magnitudes):
  function mel_spectrogram (line 45) | def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_siz...

FILE: third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py
  function compute_data_statistics (line 25) | def compute_data_statistics(data_loader: torch.utils.data.DataLoader, ou...
  function main (line 50) | def main():

FILE: third_party/Matcha-TTS/matcha/utils/instantiators.py
  function instantiate_callbacks (line 13) | def instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:
  function instantiate_loggers (line 36) | def instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]:

FILE: third_party/Matcha-TTS/matcha/utils/logging_utils.py
  function log_hyperparameters (line 12) | def log_hyperparameters(object_dict: Dict[str, Any]) -> None:

FILE: third_party/Matcha-TTS/matcha/utils/model.py
  function sequence_mask (line 7) | def sequence_mask(length, max_length=None):
  function fix_len_compatibility (line 14) | def fix_len_compatibility(length, num_downsamplings_in_unet=2):
  function convert_pad_shape (line 23) | def convert_pad_shape(pad_shape):
  function generate_path (line 29) | def generate_path(duration, mask):
  function duration_loss (line 44) | def duration_loss(logw, logw_, lengths):
  function normalize (line 49) | def normalize(data, mu, std):
  function denormalize (line 71) | def denormalize(data, mu, std):
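
Two of the small helpers here are worth spelling out because the rest of Matcha-TTS leans on them; a sketch of their usual Glow-TTS-style definitions (illustrative):

import torch

def sequence_mask_sketch(length: torch.Tensor, max_length: int = None) -> torch.Tensor:
    """(batch, max_length) boolean mask, True inside each sequence."""
    if max_length is None:
        max_length = int(length.max().item())
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)

def duration_loss_sketch(logw: torch.Tensor, logw_: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """MSE between predicted and target log-durations, averaged over real frames."""
    return torch.sum((logw - logw_) ** 2) / torch.sum(lengths)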

FILE: third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py
  function maximum_path (line 7) | def maximum_path(value, mask):

FILE: third_party/Matcha-TTS/matcha/utils/monotonic_align/core.c
  function CYTHON_INLINE (line 399) | static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int k, int l,...
  type PyObject (line 484) | typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *co...
  type PyObject (line 485) | typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, ...
  type Py_tss_t (line 526) | typedef int Py_tss_t;
  function CYTHON_INLINE (line 527) | static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
  function CYTHON_INLINE (line 531) | static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
  function CYTHON_INLINE (line 536) | static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
  function CYTHON_INLINE (line 539) | static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
  function CYTHON_INLINE (line 542) | static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
  function CYTHON_INLINE (line 546) | static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
  function CYTHON_INLINE (line 549) | static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
  type Py_hash_t (line 694) | typedef long Py_hash_t;
  type __Pyx_PyAsyncMethodsStruct (line 717) | typedef struct {
  function CYTHON_INLINE (line 733) | static CYTHON_INLINE float __PYX_NAN() {
  type __Pyx_StringTabEntry (line 782) | typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const c...
  function CYTHON_INLINE (line 803) | static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t l...
  function CYTHON_INLINE (line 852) | static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) {
  function __Pyx_init_sys_getdefaultencoding_params (line 885) | static int __Pyx_init_sys_getdefaultencoding_params(void) {
  function __Pyx_init_sys_getdefaultencoding_params (line 935) | static int __Pyx_init_sys_getdefaultencoding_params(void) {
  function CYTHON_INLINE (line 967) | static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void...
  type __pyx_memoryview_obj (line 1018) | struct __pyx_memoryview_obj
  type __Pyx_memviewslice (line 1019) | typedef struct {
  type __pyx_atomic_int_type (line 1060) | typedef volatile __pyx_atomic_int_type __pyx_atomic_int;
  type __Pyx_StructField_ (line 1080) | struct __Pyx_StructField_
  type __Pyx_TypeInfo (line 1082) | typedef struct {
  type __Pyx_StructField (line 1092) | typedef struct __Pyx_StructField_ {
  type __Pyx_BufFmt_StackElem (line 1097) | typedef struct {
  type __Pyx_BufFmt_Context (line 1101) | typedef struct {
  type npy_int8 (line 1122) | typedef npy_int8 __pyx_t_5numpy_int8_t;
  type npy_int16 (line 1131) | typedef npy_int16 __pyx_t_5numpy_int16_t;
  type npy_int32 (line 1140) | typedef npy_int32 __pyx_t_5numpy_int32_t;
  type npy_int64 (line 1149) | typedef npy_int64 __pyx_t_5numpy_int64_t;
  type npy_uint8 (line 1158) | typedef npy_uint8 __pyx_t_5numpy_uint8_t;
  type npy_uint16 (line 1167) | typedef npy_uint16 __pyx_t_5numpy_uint16_t;
  type npy_uint32 (line 1176) | typedef npy_uint32 __pyx_t_5numpy_uint32_t;
  type npy_uint64 (line 1185) | typedef npy_uint64 __pyx_t_5numpy_uint64_t;
  type npy_float32 (line 1194) | typedef npy_float32 __pyx_t_5numpy_float32_t;
  type npy_float64 (line 1203) | typedef npy_float64 __pyx_t_5numpy_float64_t;
  type npy_long (line 1212) | typedef npy_long __pyx_t_5numpy_int_t;
  type npy_longlong (line 1221) | typedef npy_longlong __pyx_t_5numpy_long_t;
  type npy_longlong (line 1230) | typedef npy_longlong __pyx_t_5numpy_longlong_t;
  type npy_ulong (line 1239) | typedef npy_ulong __pyx_t_5numpy_uint_t;
  type npy_ulonglong (line 1248) | typedef npy_ulonglong __pyx_t_5numpy_ulong_t;
  type npy_ulonglong (line 1257) | typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
  type npy_intp (line 1266) | typedef npy_intp __pyx_t_5numpy_intp_t;
  type npy_uintp (line 1275) | typedef npy_uintp __pyx_t_5numpy_uintp_t;
  type npy_double (line 1284) | typedef npy_double __pyx_t_5numpy_float_t;
  type npy_double (line 1293) | typedef npy_double __pyx_t_5numpy_double_t;
  type npy_longdouble (line 1302) | typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
  type std (line 1306) | typedef ::std::complex< float > __pyx_t_float_complex;
  type __pyx_t_float_complex (line 1308) | typedef float _Complex __pyx_t_float_complex;
  type __pyx_t_float_complex (line 1311) | typedef struct { float real, imag; } __pyx_t_float_complex;
  type std (line 1318) | typedef ::std::complex< double > __pyx_t_double_complex;
  type __pyx_t_double_complex (line 1320) | typedef double _Complex __pyx_t_double_complex;
  type __pyx_t_double_complex (line 1323) | typedef struct { double real, imag; } __pyx_t_double_complex;
  type __pyx_array_obj (line 1329) | struct __pyx_array_obj
  type __pyx_MemviewEnum_obj (line 1330) | struct __pyx_MemviewEnum_obj
  type __pyx_memoryview_obj (line 1331) | struct __pyx_memoryview_obj
  type __pyx_memoryviewslice_obj (line 1332) | struct __pyx_memoryviewslice_obj
  type npy_cfloat (line 1341) | typedef npy_cfloat __pyx_t_5numpy_cfloat_t;
  type npy_cdouble (line 1350) | typedef npy_cdouble __pyx_t_5numpy_cdouble_t;
  type npy_clongdouble (line 1359) | typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
  type npy_cdouble (line 1368) | typedef npy_cdouble __pyx_t_5numpy_complex_t;
  type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 1369) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c
  type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 1378) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_pat...
  type __pyx_array_obj (line 1390) | struct __pyx_array_obj {
  type __pyx_MemviewEnum_obj (line 1415) | struct __pyx_MemviewEnum_obj {
  type __pyx_memoryview_obj (line 1428) | struct __pyx_memoryview_obj {
  type __pyx_memoryviewslice_obj (line 1451) | struct __pyx_memoryviewslice_obj {
  type __pyx_vtabstruct_array (line 1469) | struct __pyx_vtabstruct_array {
  type __pyx_vtabstruct_array (line 1472) | struct __pyx_vtabstruct_array
  type __pyx_vtabstruct_memoryview (line 1483) | struct __pyx_vtabstruct_memoryview {
  type __pyx_vtabstruct_memoryview (line 1492) | struct __pyx_vtabstruct_memoryview
  type __pyx_vtabstruct__memoryviewslice (line 1503) | struct __pyx_vtabstruct__memoryviewslice {
  type __pyx_vtabstruct__memoryviewslice (line 1506) | struct __pyx_vtabstruct__memoryviewslice
  type __Pyx_RefNannyAPIStruct (line 1514) | typedef struct {
  type __pyx_memoryview_obj (line 1593) | struct __pyx_memoryview_obj
  type __pyx_array_obj (line 1780) | struct __pyx_array_obj
  function CYTHON_INLINE (line 1814) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16(const char *s...
  function CYTHON_INLINE (line 1818) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16LE(const char ...
  function CYTHON_INLINE (line 1822) | static CYTHON_INLINE PyObject *__Pyx_PyUnicode_DecodeUTF16BE(const char ...
  function CYTHON_INLINE (line 1922) | static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject*...
  function CYTHON_INLINE (line 1946) | static CYTHON_INLINE int __Pyx_PyList_Extend(PyObject* L, PyObject* v) {
  function CYTHON_INLINE (line 1960) | static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) {
  function CYTHON_INLINE (line 1979) | static CYTHON_INLINE int __Pyx_PySequence_ContainsTF(PyObject* item, PyO...
  type __Pyx_ImportType_CheckSize_0_29_35 (line 2024) | enum __Pyx_ImportType_CheckSize_0_29_35 {
  type __Pyx_ImportType_CheckSize_0_29_35 (line 2029) | enum __Pyx_ImportType_CheckSize_0_29_35
  type __Pyx_CodeObjectCacheEntry (line 2040) | typedef struct {
  type __Pyx_CodeObjectCache (line 2044) | struct __Pyx_CodeObjectCache {
  type __Pyx_CodeObjectCache (line 2049) | struct __Pyx_CodeObjectCache
  type __Pyx_Buf_DimInfo (line 2068) | typedef struct {
  type __Pyx_Buffer (line 2071) | typedef struct {
  type __Pyx_LocalBuf_ND (line 2075) | typedef struct {
  type __pyx_array_obj (line 2255) | struct __pyx_array_obj
  type __pyx_memoryview_obj (line 2256) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2257) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2258) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2259) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2260) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2261) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2262) | struct __pyx_memoryview_obj
  type __pyx_memoryviewslice_obj (line 2263) | struct __pyx_memoryviewslice_obj
  type __pyx_memoryviewslice_obj (line 2264) | struct __pyx_memoryviewslice_obj
  type __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 2321) | struct __pyx_opt_args_6matcha_5utils_15monotonic_align_4core_maximum_path_c
  type __pyx_array_obj (line 2322) | struct __pyx_array_obj
  type __pyx_memoryview_obj (line 2328) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2333) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2334) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2335) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2336) | struct __pyx_memoryview_obj
  type __pyx_MemviewEnum_obj (line 2354) | struct __pyx_MemviewEnum_obj
  type __pyx_array_obj (line 2562) | struct __pyx_array_obj
  type __pyx_array_obj (line 2563) | struct __pyx_array_obj
  type __pyx_array_obj (line 2564) | struct __pyx_array_obj
  type __pyx_array_obj (line 2565) | struct __pyx_array_obj
  type __pyx_array_obj (line 2566) | struct __pyx_array_obj
  type __pyx_array_obj (line 2567) | struct __pyx_array_obj
  type __pyx_array_obj (line 2568) | struct __pyx_array_obj
  type __pyx_array_obj (line 2569) | struct __pyx_array_obj
  type __pyx_MemviewEnum_obj (line 2572) | struct __pyx_MemviewEnum_obj
  type __pyx_MemviewEnum_obj (line 2573) | struct __pyx_MemviewEnum_obj
  type __pyx_MemviewEnum_obj (line 2574) | struct __pyx_MemviewEnum_obj
  type __pyx_MemviewEnum_obj (line 2575) | struct __pyx_MemviewEnum_obj
  type __pyx_memoryview_obj (line 2576) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2577) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2578) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2579) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2580) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2581) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2582) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2583) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2584) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2585) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2586) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2587) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2588) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2589) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2590) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2591) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2592) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2593) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2594) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2595) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 2596) | struct __pyx_memoryview_obj
  type __pyx_memoryviewslice_obj (line 2599) | struct __pyx_memoryviewslice_obj
  type __pyx_memoryviewslice_obj (line 2600) | struct __pyx_memoryviewslice_obj
  function __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_each (line 2653) | static void __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_...
  function __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_c (line 2951) | static void __pyx_f_6matcha_5utils_15monotonic_align_4core_maximum_path_...
  function PyObject (line 3105) | static PyObject *__pyx_pw_6matcha_5utils_15monotonic_align_4core_1maximu...
  function PyObject (line 3207) | static PyObject *__pyx_pf_6matcha_5utils_15monotonic_align_4core_maximum...
  function CYTHON_INLINE (line 3253) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyOb...
  function CYTHON_INLINE (line 3303) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyOb...
  function CYTHON_INLINE (line 3353) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyOb...
  function CYTHON_INLINE (line 3403) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyOb...
  function CYTHON_INLINE (line 3453) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyOb...
  function CYTHON_INLINE (line 3503) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_D...
  function CYTHON_INLINE (line 3577) | static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *_...
  function CYTHON_INLINE (line 3619) | static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObje...
  function CYTHON_INLINE (line 3700) | static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) {
  function CYTHON_INLINE (line 3832) | static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) {
  function CYTHON_INLINE (line 3964) | static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) {
  function CYTHON_INLINE (line 4096) | static CYTHON_INLINE int __pyx_f_5numpy_is_timedelta64_object(PyObject *...
  function CYTHON_INLINE (line 4133) | static CYTHON_INLINE int __pyx_f_5numpy_is_datetime64_object(PyObject *_...
  function CYTHON_INLINE (line 4170) | static CYTHON_INLINE npy_datetime __pyx_f_5numpy_get_datetime64_value(Py...
  function CYTHON_INLINE (line 4204) | static CYTHON_INLINE npy_timedelta __pyx_f_5numpy_get_timedelta64_value(...
  function CYTHON_INLINE (line 4238) | static CYTHON_INLINE NPY_DATETIMEUNIT __pyx_f_5numpy_get_datetime64_unit...
  function __pyx_array___cinit__ (line 4272) | static int __pyx_array___cinit__(PyObject *__pyx_v_self, PyObject *__pyx...
  function __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__ (line 4400) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array___cinit__(s...
  function CYTHON_UNUSED (line 5023) | static CYTHON_UNUSED int __pyx_array_getbuffer(PyObject *__pyx_v_self, P...
  function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffer__ (line 5034) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_2__getbuffe...
  function __pyx_array___dealloc__ (line 5330) | static void __pyx_array___dealloc__(PyObject *__pyx_v_self) {
  function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc__ (line 5339) | static void __pyx_array___pyx_pf_15View_dot_MemoryView_5array_4__dealloc...
  function PyObject (line 5461) | static PyObject *__pyx_pw_15View_dot_MemoryView_5array_7memview_1__get__...
  function PyObject (line 5472) | static PyObject *__pyx_pf_15View_dot_MemoryView_5array_7memview___get__(...
  function PyObject (line 5522) | static PyObject *__pyx_array_get_memview(struct __pyx_array_obj *__pyx_v...
  function Py_ssize_t (line 5604) | static Py_ssize_t __pyx_array___len__(PyObject *__pyx_v_self) {
  function Py_ssize_t (line 5615) | static Py_ssize_t __pyx_array___pyx_pf_15View_dot_MemoryView_5array_6__l...
  function PyObject (line 5654) | static PyObject *__pyx_array___getattr__(PyObject *__pyx_v_self, PyObjec...
  function PyObject (line 5665) | static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_8__ge...
  function PyObject (line 5722) | static PyObject *__pyx_array___getitem__(PyObject *__pyx_v_self, PyObjec...
  function PyObject (line 5733) | static PyObject *__pyx_array___pyx_pf_15View_dot_MemoryView_5array_10__g...
  function __pyx_array___setitem__ (line 5790) | static int __pyx_array___setitem__(PyObject *__pyx_v_self, PyObject *__p...
  function __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem__ (line 5801) | static int __pyx_array___pyx_pf_15View_dot_MemoryView_5array_12__setitem...
  function PyObject (line 5850) | static PyObject *__pyx_pw___pyx_array_1__reduce_cython__(PyObject *__pyx...
  function PyObject (line 5861) | static PyObject *__pyx_pf___pyx_array___reduce_cython__(CYTHON_UNUSED st...
  function PyObject (line 5907) | static PyObject *__pyx_pw___pyx_array_3__setstate_cython__(PyObject *__p...
  function PyObject (line 5918) | static PyObject *__pyx_pf___pyx_array_2__setstate_cython__(CYTHON_UNUSED...
  type __pyx_array_obj (line 5963) | struct __pyx_array_obj
  type __pyx_array_obj (line 5964) | struct __pyx_array_obj
  type __pyx_array_obj (line 5965) | struct __pyx_array_obj
  type __pyx_array_obj (line 6017) | struct __pyx_array_obj
  type __pyx_array_obj (line 6081) | struct __pyx_array_obj
  function __pyx_MemviewEnum___init__ (line 6140) | static int __pyx_MemviewEnum___init__(PyObject *__pyx_v_self, PyObject *...
  function __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init__ (line 6191) | static int __pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum___init...
  function PyObject (line 6233) | static PyObject *__pyx_MemviewEnum___repr__(PyObject *__pyx_v_self) {
  function PyObject (line 6244) | static PyObject *__pyx_MemviewEnum___pyx_pf_15View_dot_MemoryView_4Enum_...
  function PyObject (line 6284) | static PyObject *__pyx_pw___pyx_MemviewEnum_1__reduce_cython__(PyObject ...
  function PyObject (line 6295) | static PyObject *__pyx_pf___pyx_MemviewEnum___reduce_cython__(struct __p...
  function PyObject (line 6519) | static PyObject *__pyx_pw___pyx_MemviewEnum_3__setstate_cython__(PyObjec...
  function PyObject (line 6530) | static PyObject *__pyx_pf___pyx_MemviewEnum_2__setstate_cython__(struct ...
  function __pyx_memoryview___cinit__ (line 6662) | static int __pyx_memoryview___cinit__(PyObject *__pyx_v_self, PyObject *...
  function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview___cinit__ (line 6742) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
  function __pyx_memoryview___dealloc__ (line 7060) | static void __pyx_memoryview___dealloc__(PyObject *__pyx_v_self) {
  function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_2__dealloc__ (line 7069) | static void __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview...
  type __pyx_memoryview_obj (line 7289) | struct __pyx_memoryview_obj
  function PyObject (line 7429) | static PyObject *__pyx_memoryview___getitem__(PyObject *__pyx_v_self, Py...
  function PyObject (line 7440) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function __pyx_memoryview___setitem__ (line 7618) | static int __pyx_memoryview___setitem__(PyObject *__pyx_v_self, PyObject...
  function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_6__setitem__ (line 7629) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
  function PyObject (line 7844) | static PyObject *__pyx_memoryview_is_slice(struct __pyx_memoryview_obj *...
  function PyObject (line 8054) | static PyObject *__pyx_memoryview_setitem_slice_assignment(struct __pyx_...
  function PyObject (line 8144) | static PyObject *__pyx_memoryview_setitem_slice_assign_scalar(struct __p...
  function PyObject (line 8434) | static PyObject *__pyx_memoryview_setitem_indexed(struct __pyx_memoryvie...
  function PyObject (line 8495) | static PyObject *__pyx_memoryview_convert_item_to_object(struct __pyx_me...
  function PyObject (line 8772) | static PyObject *__pyx_memoryview_assign_item_from_object(struct __pyx_m...
  function CYTHON_UNUSED (line 9013) | static CYTHON_UNUSED int __pyx_memoryview_getbuffer(PyObject *__pyx_v_se...
  function __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_8__getbuffer__ (line 9024) | static int __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memoryview_...
  function PyObject (line 9357) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_1T_1__get__...
  function PyObject (line 9368) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_1T___get__(...
  function PyObject (line 9443) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4base_1__ge...
  function PyObject (line 9454) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4base___get...
  function PyObject (line 9496) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_5shape_1__g...
  function PyObject (line 9507) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_5shape___ge...
  function PyObject (line 9577) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_7strides_1_...
  function PyObject (line 9588) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_7strides___...
  function PyObject (line 9691) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_10suboffset...
  function PyObject (line 9702) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_10suboffset...
  function PyObject (line 9809) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4ndim_1__ge...
  function PyObject (line 9820) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4ndim___get...
  function PyObject (line 9872) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_8itemsize_1...
  function PyObject (line 9883) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_8itemsize__...
  function PyObject (line 9935) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_6nbytes_1__...
  function PyObject (line 9946) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_6nbytes___g...
  function PyObject (line 10008) | static PyObject *__pyx_pw_15View_dot_MemoryView_10memoryview_4size_1__ge...
  function PyObject (line 10019) | static PyObject *__pyx_pf_15View_dot_MemoryView_10memoryview_4size___get...
  function Py_ssize_t (line 10149) | static Py_ssize_t __pyx_memoryview___len__(PyObject *__pyx_v_self) {
  function Py_ssize_t (line 10160) | static Py_ssize_t __pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memo...
  function PyObject (line 10229) | static PyObject *__pyx_memoryview___repr__(PyObject *__pyx_v_self) {
  function PyObject (line 10240) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10331) | static PyObject *__pyx_memoryview___str__(PyObject *__pyx_v_self) {
  function PyObject (line 10342) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10410) | static PyObject *__pyx_memoryview_is_c_contig(PyObject *__pyx_v_self, CY...
  function PyObject (line 10421) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10486) | static PyObject *__pyx_memoryview_is_f_contig(PyObject *__pyx_v_self, CY...
  function PyObject (line 10497) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10562) | static PyObject *__pyx_memoryview_copy(PyObject *__pyx_v_self, CYTHON_UN...
  function PyObject (line 10573) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10656) | static PyObject *__pyx_memoryview_copy_fortran(PyObject *__pyx_v_self, C...
  function PyObject (line 10667) | static PyObject *__pyx_memoryview___pyx_pf_15View_dot_MemoryView_10memor...
  function PyObject (line 10749) | static PyObject *__pyx_pw___pyx_memoryview_1__reduce_cython__(PyObject *...
  function PyObject (line 10760) | static PyObject *__pyx_pf___pyx_memoryview___reduce_cython__(CYTHON_UNUS...
  function PyObject (line 10806) | static PyObject *__pyx_pw___pyx_memoryview_3__setstate_cython__(PyObject...
  function PyObject (line 10817) | static PyObject *__pyx_pf___pyx_memoryview_2__setstate_cython__(CYTHON_U...
  function PyObject (line 10862) | static PyObject *__pyx_memoryview_new(PyObject *__pyx_v_o, int __pyx_v_f...
  function CYTHON_INLINE (line 10953) | static CYTHON_INLINE int __pyx_memoryview_check(PyObject *__pyx_v_o) {
  function PyObject (line 10992) | static PyObject *_unellipsify(PyObject *__pyx_v_index, int __pyx_v_ndim) {
  function PyObject (line 11449) | static PyObject *assert_direct_dimensions(Py_ssize_t *__pyx_v_suboffsets...
  type __pyx_memoryview_obj (line 11537) | struct __pyx_memoryview_obj
  type __pyx_memoryviewslice_obj (line 11544) | struct __pyx_memoryviewslice_obj
  type __pyx_memoryview_obj (line 11554) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 11559) | struct __pyx_memoryview_obj
  type __pyx_memoryviewslice_obj (line 11629) | struct __pyx_memoryviewslice_obj
  type __pyx_memoryview_obj (line 12041) | struct __pyx_memoryview_obj
  type __pyx_memoryview_obj (line 12082) | struct __pyx_memoryview_obj
  function __pyx_memoryview_slice_memviewslice (line 12117) | static int __pyx_memoryview_slice_memviewslice(__Pyx_memviewslice *__pyx...
  function __pyx_memslice_transpose (line 13210) | static int __pyx_memslice_transpose(__Pyx_memviewslice *__pyx_v_memslice) {
  function __pyx_memoryviewslice___dealloc__ (line 13386) | static void __pyx_memoryviewslice___dealloc__(PyObject *__pyx_v_self) {
  function __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memoryviewslice___dealloc__ (line 13395) | static void __pyx_memoryviewslice___pyx_pf_15View_dot_MemoryView_16_memo...
  function PyObject (line 13428) | static PyObject *__pyx_memoryviewslice_convert_item_to_object(struct __p...
  function PyObject (line 13514) | static PyObject *__pyx_memoryviewslice_assign_item_from_object(struct __...
  function PyObject (line 13599) | static PyObject *__pyx_pw_15View_dot_MemoryView_16_memoryviewslice_4base...
  function PyObject (line 13610) | static PyObject *__pyx_pf_15View_dot_MemoryView_16_memoryviewslice_4base...
  function PyObject (line 13650) | static PyObject *__pyx_pw___pyx_memoryviewslice_1__reduce_cython__(PyObj...
  function PyObject (line 13661) | static PyObject *__pyx_pf___pyx_memoryviewslice___reduce_cython__(CYTHON...
  function PyObject (line 13707) | static PyObject *__pyx_pw___pyx_memoryviewslice_3__setstate_cython__(PyO...
  function PyObject (line 13718) | static PyObject *__pyx_pf___pyx_memoryviewslice_2__setstate_cython__(CYT...
  function PyObject (line 13763) | static PyObject *__pyx_memoryview_fromslice(__Pyx_memviewslice __pyx_v_m...
  function __Pyx_memviewslice (line 14149) | static __Pyx_memviewslice *__pyx_memoryview_get_slice_from_memoryview(st...
  function __pyx_memoryview_slice_copy (line 14252) | static void __pyx_memoryview_slice_copy(struct __pyx_memoryview_obj *__p...
  function PyObject (line 14378) | static PyObject *__pyx_memoryview_copy_object(struct __pyx_memoryview_ob...
  function PyObject (line 14438) | static PyObject *__pyx_memoryview_copy_object_from_slice(struct __pyx_me...
  function Py_ssize_t (line 14564) | static Py_ssize_t abs_py_ssize_t(Py_ssize_t __pyx_v_arg) {
  function __pyx_get_best_slice_order (line 14630) | static char __pyx_get_best_slice_order(__Pyx_memviewslice *__pyx_v_mslic...
  function _copy_strided_to_strided (line 14820) | static void _copy_strided_to_strided(char *__pyx_v_src_data, Py_ssize_t ...
  function copy_strided_to_strided (line 15057) | static void copy_strided_to_strided(__Pyx_memviewslice *__pyx_v_src, __P...
  function Py_ssize_t (line 15087) | static Py_ssize_t __pyx_memoryview_slice_get_size(__Pyx_memviewslice *__...
  function Py_ssize_t (line 15159) | static Py_ssize_t __pyx_fill_contig_strides_array(Py_ssize_t *__pyx_v_sh...
  type __pyx_memoryview_obj (line 15290) | struct __pyx_memoryview_obj
  function __pyx_memoryview_err_extents (line 15536) | static int __pyx_memoryview_err_extents(int __pyx_v_i, Py_ssize_t __pyx_...
  function __pyx_memoryview_err_dim (line 15624) | static int __pyx_memoryview_err_dim(PyObject *__pyx_v_error, char *__pyx...
  function __pyx_memoryview_err (line 15708) | static int __pyx_memoryview_err(PyObject *__pyx_v_error, char *__pyx_v_m...
  function __pyx_memoryview_copy_contents (line 15818) | static int __pyx_memoryview_copy_contents(__Pyx_memviewslice __pyx_v_src...
  function __pyx_memoryview_broadcast_leading (line 16397) | static void __pyx_memoryview_broadcast_leading(__Pyx_memviewslice *__pyx...
  function __pyx_memoryview_refcount_copying (line 16510) | static void __pyx_memoryview_refcount_copying(__Pyx_memviewslice *__pyx_...
  function __pyx_memoryview_refcount_objects_in_slice_with_gil (line 16560) | static void __pyx_memoryview_refcount_objects_in_slice_with_gil(char *__...
  function __pyx_memoryview_refcount_objects_in_slice (line 16599) | static void __pyx_memoryview_refcount_objects_in_slice(char *__pyx_v_dat...
  function __pyx_memoryview_slice_assign_scalar (line 16731) | static void __pyx_memoryview_slice_assign_scalar(__Pyx_memviewslice *__p...
  function __pyx_memoryview__slice_assign_scalar (line 16779) | static void __pyx_memoryview__slice_assign_scalar(char *__pyx_v_data, Py...
  function PyObject (line 16911) | static PyObject *__pyx_pw_15View_dot_MemoryView_1__pyx_unpickle_Enum(PyO...
  function PyObject (line 16984) | static PyObject *__pyx_pf_15View_dot_MemoryView___pyx_unpickle_Enum(CYTH...
  function PyObject (line 17179) | static PyObject *__pyx_unpickle_Enum__set_state(struct __pyx_MemviewEnum...
  type __pyx_vtabstruct_array (line 17302) | struct __pyx_vtabstruct_array
  function PyObject (line 17304) | static PyObject *__pyx_tp_new_array(PyTypeObject *t, PyObject *a, PyObje...
  function __pyx_tp_dealloc_array (line 17324) | static void __pyx_tp_dealloc_array(PyObject *o) {
  function PyObject (line 17343) | static PyObject *__pyx_sq_item_array(PyObject *o, Py_ssize_t i) {
  function __pyx_mp_ass_subscript_array (line 17351) | static int __pyx_mp_ass_subscript_array(PyObject *o, PyObject *i, PyObje...
  function PyObject (line 17362) | static PyObject *__pyx_tp_getattro_array(PyObject *o, PyObject *n) {
  function PyObject (line 17371) | static PyObject *__pyx_getprop___pyx_array_memview(PyObject *o, CYTHON_U...
  type PyGetSetDef (line 17382) | struct PyGetSetDef
  type __pyx_array_obj (line 17426) | struct __pyx_array_obj
  function PyObject (line 17495) | static PyObject *__pyx_tp_new_Enum(PyTypeObject *t, CYTHON_UNUSED PyObje...
  function __pyx_tp_dealloc_Enum (line 17509) | static void __pyx_tp_dealloc_Enum(PyObject *o) {
  function __pyx_tp_traverse_Enum (line 17521) | static int __pyx_tp_traverse_Enum(PyObject *o, visitproc v, void *a) {
  function __pyx_tp_clear_Enum (line 17530) | static int __pyx_tp_clear_Enum(PyObject *o) {
  type __pyx_MemviewEnum_obj (line 17548) | struct __pyx_MemviewEnum_obj
  type __pyx_vtabstruct_memoryview (line 17616) | struct __pyx_vtabstruct_memoryview
  function PyObject (line 17618) | static PyObject *__pyx_tp_new_memoryview(PyTypeObject *t, PyObject *a, P...
  function __pyx_tp_dealloc_memoryview (line 17640) | static void __pyx_tp_dealloc_memoryview(PyObject *o) {
  function __pyx_tp_traverse_memoryview (line 17662) | static int __pyx_tp_traverse_memoryview(PyObject *o, visitproc v, void *...
  function __pyx_tp_clear_memoryview (line 17680) | static int __pyx_tp_clear_memoryview(PyObject *o) {
  function PyObject (line 17695) | static PyObject *__pyx_sq_item_memoryview(PyObject *o, Py_ssize_t i) {
  function __pyx_mp_ass_subscript_memoryview (line 17703) | static int __pyx_mp_ass_subscript_memoryview(PyObject *o, PyObject *i, P...
  function PyObject (line 17714) | static PyObject *__pyx_getprop___pyx_memoryview_T(PyObject *o, CYTHON_UN...
  function PyObject (line 17718) | static PyObject *__pyx_getprop___pyx_memoryview_base(PyObject *o, CYTHON...
  function PyObject (line 17722) | static PyObject *__pyx_getprop___pyx_memoryview_shape(PyObject *o, CYTHO...
  function PyObject (line 17726) | static PyObject *__pyx_getprop___pyx_memoryview_strides(PyObject *o, CYT...
  function PyObject (line 17730) | static PyObject *__pyx_getprop___pyx_memoryview_suboffsets(PyObject *o, ...
  function PyObject (line 17734) | static PyObject *__pyx_getprop___pyx_memoryview_ndim(PyObject *o, CYTHON...
  function PyObject (line 17738) | static PyObject *__pyx_getprop___pyx_memoryview_itemsize(PyObject *o, CY...
  function PyObject (line 17742) | static PyObject *__pyx_getprop___pyx_memoryview_nbytes(PyObject *o, CYTH...
  function PyObject (line 17746) | static PyObject *__pyx_getprop___pyx_memoryview_size(PyObject *o, CYTHON...
  type PyGetSetDef (line 17760) | struct PyGetSetDef
  type __pyx_memoryview_obj (line 17812) | struct __pyx_memoryview_obj
  type __pyx_vtabstruct__memoryviewslice (line 17880) | struct __pyx_vtabstruct__memoryviewslice
  function PyObject (line 17882) | static PyObject *__pyx_tp_new__memoryviewslice(PyTypeObject *t, PyObject...
  function __pyx_tp_dealloc__memoryviewslice (line 17893) | static void __pyx_tp_dealloc__memoryviewslice(PyObject *o) {
  function __pyx_tp_traverse__memoryviewslice (line 17914) | static int __pyx_tp_traverse__memoryviewslice(PyObject *o, visitproc v, ...
  function __pyx_tp_clear__memoryviewslice (line 17924) | static int __pyx_tp_clear__memoryviewslice(PyObject *o) {
  function PyObject (line 17935) | static PyObject *__pyx_getprop___pyx_memoryviewslice_base(PyObject *o, C...
  type PyGetSetDef (line 17945) | struct PyGetSetDef
  type __pyx_memoryviewslice_obj (line 17953) | struct __pyx_memoryviewslice_obj
  type PyModuleDef (line 18046) | struct PyModuleDef
  function CYTHON_SMALL_CODE (line 18175) | static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) {
  function CYTHON_SMALL_CODE (line 18190) | static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) {
  function CYTHON_SMALL_CODE (line 18482) | static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) {
  function __Pyx_modinit_global_init_code (line 18510) | static int __Pyx_modinit_global_init_code(void) {
  function __Pyx_modinit_variable_export_code (line 18523) | static int __Pyx_modinit_variable_export_code(void) {
  function __Pyx_modinit_function_export_code (line 18531) | static int __Pyx_modinit_function_export_code(void) {
  function __Pyx_modinit_type_init_code (line 18539) | static int __Pyx_modinit_type_init_code(void) {
  function __Pyx_modinit_type_import_code (line 18604) | static int __Pyx_modinit_type_import_code(void) {
  function __Pyx_modinit_variable_import_code (line 18648) | static int __Pyx_modinit_variable_import_code(void) {
  function __Pyx_modinit_function_import_code (line 18656) | static int __Pyx_modinit_function_import_code(void) {
  function __Pyx_PyMODINIT_FUNC (line 18687) | __Pyx_PyMODINIT_FUNC PyInit_core(void)
  function CYTHON_SMALL_CODE (line 18692) | static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) {
  function CYTHON_SMALL_CODE (line 18715) | static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, P...
  function CYTHON_SMALL_CODE (line 18730) | static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CY...
  function __Pyx_RefNannyAPIStruct (line 19075) | static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modn...
  function CYTHON_INLINE (line 19092) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, ...
  function PyObject (line 19105) | static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
  function __Pyx_init_memviewslice (line 19119) | static int
  function __pyx_fatalerror (line 19171) | static void __pyx_fatalerror(const char *fmt, ...) Py_NO_RETURN {
  function CYTHON_INLINE (line 19183) | static CYTHON_INLINE int
  function CYTHON_INLINE (line 19193) | static CYTHON_INLINE int
  function CYTHON_INLINE (line 19203) | static CYTHON_INLINE void
  function CYTHON_INLINE (line 19224) | static CYTHON_INLINE void __Pyx_XDEC_MEMVIEW(__Pyx_memviewslice *memslice,
  function __Pyx_RaiseArgtupleInvalid (line 19251) | static void __Pyx_RaiseArgtupleInvalid(
  function __Pyx_RaiseDoubleKeywordsError (line 19277) | static void __Pyx_RaiseDoubleKeywordsError(
  function __Pyx_ParseOptionalKeywords (line 19291) | static int __Pyx_ParseOptionalKeywords(
  function CYTHON_INLINE (line 19393) | static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varna...
  function _PyErr_StackItem (line 19399) | static _PyErr_StackItem *
  function CYTHON_INLINE (line 19414) | static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, Py...
  function CYTHON_INLINE (line 19429) | static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, P...
  function __Pyx_PyErr_ExceptionMatchesTuple (line 19455) | static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObjec...
  function CYTHON_INLINE (line 19468) | static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadSta...
  function __Pyx_GetException (line 19482) | static int __Pyx_GetException(PyObject **type, PyObject **value, PyObjec...
  function CYTHON_INLINE (line 19554) | static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObj...
  function CYTHON_INLINE (line 19574) | static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate,...
  function CYTHON_INLINE (line 19586) | static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, P...
  function __Pyx_Raise (line 19598) | static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb,
  function __Pyx_Raise (line 19649) | static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, P...
  function __Pyx__ArgTypeTest (line 19756) | static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const c...
  function CYTHON_INLINE (line 19778) | static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *fun...
  function PyObject (line 19801) | static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObjec...
  function CYTHON_UNUSED (line 19919) | static CYTHON_UNUSED PyObject* __Pyx_PyObject_Call2Args(PyObject* functi...
  function CYTHON_INLINE (line 19949) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, ...
  function PyObject (line 19969) | static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *ar...
  function CYTHON_INLINE (line 19979) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func,...
  function CYTHON_INLINE (line 19997) | static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func,...
  function CYTHON_INLINE (line 20008) | static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2...
  function CYTHON_INLINE (line 20055) | static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* ...
  function CYTHON_INLINE (line 20157) | static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t a, Py_ss...
  function CYTHON_INLINE (line 20165) | static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) {
  function PyObject (line 20178) | static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
  function CYTHON_INLINE (line 20185) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, P...
  function CYTHON_INLINE (line 20203) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, ...
  function CYTHON_INLINE (line 20221) | static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssi...
  function PyObject (line 20266) | static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject* index) {
  function PyObject (line 20284) | static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key) {
  function CYTHON_INLINE (line 20294) | static CYTHON_INLINE PyObject* __Pyx_decode_c_string(
  function PyObject (line 20327) | static PyObject *__Pyx_GetAttr3Default(PyObject *d) {
  function CYTHON_INLINE (line 20336) | static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, ...
  function CYTHON_INLINE (line 20343) | static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) {
  function CYTHON_INLINE (line 20347) | static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject ...
  function CYTHON_INLINE (line 20359) | static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj...
  function CYTHON_INLINE (line 20371) | static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name)
  function CYTHON_INLINE (line 20403) | static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expec...
  function CYTHON_INLINE (line 20409) | static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t inde...
  function CYTHON_INLINE (line 20416) | static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) {
  function CYTHON_INLINE (line 20421) | static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *typ...
  function CYTHON_INLINE (line 20435) | static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, Py...
  function CYTHON_INLINE (line 20458) | static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject ...
  function PyObject (line 20469) | static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int l...
  function __Pyx_InBases (line 20535) | static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {
  function CYTHON_INLINE (line 20543) | static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *...
  function __Pyx_inner_PyErr_GivenExceptionMatches2 (line 20559) | static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObj...
  function CYTHON_INLINE (line 20581) | static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObje...
  function __Pyx_PyErr_GivenExceptionMatchesTuple (line 20589) | static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, Py...
  function CYTHON_INLINE (line 20610) | static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err...
  function CYTHON_INLINE (line 20622) | static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *er...
  function PyObject (line 20635) | static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHO...
  function __Pyx_div_long (line 20758) | static CYTHON_INLINE long __Pyx_div_long(long a, long b) {
  function PyObject (line 20766) | static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) {
  function CYTHON_INLINE (line 20780) | static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) {
  function PyObject (line 20799) | static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, P...
  function CYTHON_INLINE (line 20810) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObj...
  function PyObject (line 20839) | static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* ...
  function __Pyx_SetVtable (line 20848) | static int __Pyx_SetVtable(PyObject *dict, void *vtable) {
  function __Pyx_PyObject_GetAttrStr_ClearAttributeError (line 20866) | static void __Pyx_PyObject_GetAttrStr_ClearAttributeError(void) {
  function CYTHON_INLINE (line 20872) | static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStrNoError(PyObject...
  function __Pyx_setup_reduce_is_named (line 20888) | static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) {
  function __Pyx_setup_reduce (line 20904) | static int __Pyx_setup_reduce(PyObject* type_obj) {
  function PyTypeObject (line 20994) | static PyTypeObject *__Pyx_ImportType_0_29_35(PyObject *module, const ch...
  function __Pyx_CLineForTraceback (line 21072) | static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, ...
  function __pyx_bisect_code_objects (line 21113) | static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries...
  function PyCodeObject (line 21134) | static PyCodeObject *__pyx_find_code_object(int code_line) {
  function __pyx_insert_code_object (line 21148) | static void __pyx_insert_code_object(int code_line, PyCodeObject* code_o...
  function PyCodeObject (line 21202) | static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
  function __Pyx_AddTraceback (line 21260) | static void __Pyx_AddTraceback(const char *funcname, int c_line,
  function __Pyx_GetBuffer (line 21300) | static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) {
  function __Pyx_ReleaseBuffer (line 21307) | static void __Pyx_ReleaseBuffer(Py_buffer *view) {
  function __pyx_memviewslice_is_contig (line 21322) | static int
  function __pyx_get_array_memory_extents (line 21344) | static void
  function __pyx_slices_overlap (line 21368) | static int
  function CYTHON_INLINE (line 21380) | static CYTHON_INLINE PyObject *
  function CYTHON_INLINE (line 21393) | static CYTHON_INLINE int __Pyx_Is_Little_Endian(void)
  function __Pyx_BufFmt_Init (line 21404) | static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
  function __Pyx_BufFmt_ParseNumber (line 21431) | static int __Pyx_BufFmt_ParseNumber(const char** ts) {
  function __Pyx_BufFmt_ExpectNumber (line 21446) | static int __Pyx_BufFmt_ExpectNumber(const char **ts) {
  function __Pyx_BufFmt_RaiseUnexpectedChar (line 21453) | static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) {
  function __Pyx_BufFmt_TypeCharToStandardSize (line 21482) | static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_comple...
  function __Pyx_BufFmt_TypeCharToNativeSize (line 21500) | static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) {
  type __Pyx_st_short (line 21519) | typedef struct { char c; short x; } __Pyx_st_short;
  type __Pyx_st_int (line 21520) | typedef struct { char c; int x; } __Pyx_st_int;
  type __Pyx_st_long (line 21521) | typedef struct { char c; long x; } __Pyx_st_long;
  type __Pyx_st_float (line 21522) | typedef struct { char c; float x; } __Pyx_st_float;
  type __Pyx_st_double (line 21523) | typedef struct { char c; double x; } __Pyx_st_double;
  type __Pyx_st_longdouble (line 21524) | typedef struct { char c; long double x; } __Pyx_st_longdouble;
  type __Pyx_st_void_p (line 21525) | typedef struct { char c; void *x; } __Pyx_st_void_p;
  type __Pyx_st_longlong (line 21527) | typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
  function __Pyx_BufFmt_TypeCharToAlignment (line 21529) | static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED in...
  type __Pyx_pad_short (line 21551) | typedef struct { short x; char c; } __Pyx_pad_short;
  type __Pyx_pad_int (line 21552) | typedef struct { int x; char c; } __Pyx_pad_int;
  type __Pyx_pad_long (line 21553) | typedef struct { long x; char c; } __Pyx_pad_long;
  type __Pyx_pad_float (line 21554) | typedef struct { float x; char c; } __Pyx_pad_float;
  type __Pyx_pad_double (line 21555) | typedef struct { double x; char c; } __Pyx_pad_double;
  type __Pyx_pad_longdouble (line 21556) | typedef struct { long double x; char c; } __Pyx_pad_longdouble;
  type __Pyx_pad_void_p (line 21557) | typedef struct { void *x; char c; } __Pyx_pad_void_p;
  type __Pyx_pad_longlong (line 21559) | typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
  function __Pyx_BufFmt_TypeCharToPadding (line 21561) | static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int ...
  function __Pyx_BufFmt_TypeCharToGroup (line 21579) | static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) {
  function __Pyx_BufFmt_RaiseExpected (line 21600) | static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
  function __Pyx_BufFmt_ProcessTypeChunk (line 21624) | static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) {
  function PyObject (line 21726) | static PyObject *
  function __pyx_typeinfo_cmp (line 21906) | static int
  function __pyx_check_strides (line 21947) | static int
  function __pyx_check_suboffsets (line 22000) | static int
  function __pyx_verify_contig (line 22023) | static int
  function __Pyx_ValidateAndInit_memviewslice (line 22052) | static int __Pyx_ValidateAndInit_memviewslice(
  function CYTHON_INLINE (line 22128) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
  function CYTHON_INLINE (line 22151) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
  function CYTHON_INLINE (line 22174) | static CYTHON_INLINE __Pyx_memviewslice __Pyx_PyObject_to_MemoryviewSlic...
  function CYTHON_INLINE (line 22221) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
  function CYTHON_INLINE (line 22225) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
  function CYTHON_INLINE (line 22230) | static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_pa...
  function CYTHON_INLINE (line 22241) | static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx...
  function CYTHON_INLINE (line 22244) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_flo...
  function CYTHON_INLINE (line 22250) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_fl...
  function CYTHON_INLINE (line 22256) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_fl...
  function CYTHON_INLINE (line 22263) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_fl...
  function CYTHON_INLINE (line 22283) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_fl...
  function CYTHON_INLINE (line 22294) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_flo...
  function CYTHON_INLINE (line 22300) | static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) {
  function CYTHON_INLINE (line 22303) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_fl...
  function CYTHON_INLINE (line 22310) | static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) {
  function CYTHON_INLINE (line 22317) | static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_flo...
  function CYTHON_INLINE (line 22375) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
  function CYTHON_INLINE (line 22379) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
  function CYTHON_INLINE (line 22384) | static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_...
  function CYTHON_INLINE (line 22395) | static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __p...
  function CYTHON_INLINE (line 22398) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_d...
  function CYTHON_INLINE (line 22404) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_...
  function CYTHON_INLINE (line 22410) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_...
  function CYTHON_INLINE (line 22417) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_...
  function CYTHON_INLINE (line 22437) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_...
  function CYTHON_INLINE (line 22448) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_d...
  function CYTHON_INLINE (line 22454) | static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) {
  function CYTHON_INLINE (line 22457) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_...
  function CYTHON_INLINE (line 22464) | static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) {
  function CYTHON_INLINE (line 22471) | static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_d...
  function __Pyx_memviewslice (line 22527) | static __Pyx_memviewslice
  function CYTHON_INLINE (line 22594) | static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) {
  function CYTHON_INLINE (line 22828) | static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
  function __Pyx_check_binary_version (line 23258) | static int __Pyx_check_binary_version(void) {
  function __Pyx_InitStrings (line 23296) | static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
  function CYTHON_INLINE (line 23328) | static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_...
  function CYTHON_INLINE (line 23331) | static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {
  function CYTHON_INLINE (line 23358) | static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObjec...
  function CYTHON_INLINE (line 23400) | static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
  function CYTHON_INLINE (line 23405) | static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) {
  function PyObject (line 23412) | static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* resul...
  function CYTHON_INLINE (line 23481) | static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {
  function CYTHON_INLINE (line 23543) | static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) {
  function CYTHON_INLINE (line 23560) | static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {
  function CYTHON_INLINE (line 23563) | static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {

FILE: third_party/Matcha-TTS/matcha/utils/pylogger.py
  function get_pylogger (line 6) | def get_pylogger(name: str = __name__) -> logging.Logger:

FILE: third_party/Matcha-TTS/matcha/utils/rich_utils.py
  function print_config_tree (line 18) | def print_config_tree(
  function enforce_tags (line 80) | def enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None:

FILE: third_party/Matcha-TTS/matcha/utils/utils.py
  function extras (line 20) | def extras(cfg: DictConfig) -> None:
  function task_wrapper (line 51) | def task_wrapper(task_func: Callable) -> Callable:
  function get_metric_value (line 106) | def get_metric_value(metric_dict: Dict[str, Any], metric_name: str) -> f...
  function intersperse (line 130) | def intersperse(lst, item):
  function save_figure_to_numpy (line 137) | def save_figure_to_numpy(fig):
  function plot_tensor (line 143) | def plot_tensor(tensor):
  function save_plot (line 155) | def save_plot(tensor, savepath):
  function to_numpy (line 166) | def to_numpy(tensor):
  function get_user_data_dir (line 177) | def get_user_data_dir(appname="matcha_tts"):
  function assert_model_downloaded (line 208) | def assert_model_downloaded(checkpoint_path, url, use_wget=True):

Condensed preview — 158 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (4,368K chars).
[
  {
    "path": ".dockerignore",
    "chars": 991,
    "preview": "# Git\n.git\n.gitignore\n.gitattributes\n\n\n# CI\n.codeclimate.yml\n.travis.yml\n.taskcluster.yml\n\n# Docker\ndocker-compose.yml\nD"
  },
  {
    "path": "Dockerfile",
    "chars": 659,
    "preview": "FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04\nWORKDIR /breezyvoice\n\nENV UV_LINK_MODE=copy\nENV PATH=\"/root/.local/bi"
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 7982,
    "preview": "# BreezyVoice\n\nBreezyVoice is a voice-cloning text-to-speech system specifically adapted for Taiwanese Mandarin, highlig"
  },
  {
    "path": "api.py",
    "chars": 3655,
    "preview": "# OpenAI API Spec. Reference: https://platform.openai.com/docs/api-reference/audio/createSpeech\n\nfrom contextlib import "
  },
  {
    "path": "batch_inference.py",
    "chars": 3421,
    "preview": "import os\r\nimport time\r\nimport subprocess\r\nimport argparse\r\nimport pandas as pd\r\nfrom datasets import Dataset\r\nfrom sing"
  },
  {
    "path": "compose.yaml",
    "chars": 358,
    "preview": "services:\n  app:\n    image: breezyvoice:latest\n    build: .\n    ports:\n      - \"8080:8080\"\n    volumes:\n      - hf-cache"
  },
  {
    "path": "cosyvoice/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "cosyvoice/bin/inference.py",
    "chars": 5386,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/bin/train.py",
    "chars": 5212,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/cli/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "cosyvoice/cli/cosyvoice.py",
    "chars": 4209,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/cli/frontend.py",
    "chars": 8999,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/cli/model.py",
    "chars": 3660,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/dataset/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "cosyvoice/dataset/dataset.py",
    "chars": 5233,
    "preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n#               2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licen"
  },
  {
    "path": "cosyvoice/dataset/processor.py",
    "chars": 13008,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\")"
  },
  {
    "path": "cosyvoice/flow/decoder.py",
    "chars": 8878,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/flow/flow.py",
    "chars": 5941,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/flow/flow_matching.py",
    "chars": 5882,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/flow/length_regulator.py",
    "chars": 1841,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/hifigan/f0_predictor.py",
    "chars": 1976,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "cosyvoice/hifigan/generator.py",
    "chars": 14736,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Kai Hu)\n#\n# Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "cosyvoice/llm/llm.py",
    "chars": 9184,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/transformer/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "cosyvoice/transformer/activation.py",
    "chars": 3087,
    "preview": "# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)\n#               2020 Northwestern Polytechnical Universi"
  },
  {
    "path": "cosyvoice/transformer/attention.py",
    "chars": 14196,
    "preview": "# Copyright (c) 2019 Shigeki Karita\n#               2020 Mobvoi Inc (Binbin Zhang)\n#               2022 Xingchen Song (s"
  },
  {
    "path": "cosyvoice/transformer/convolution.py",
    "chars": 5230,
    "preview": "# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n#               2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
  },
  {
    "path": "cosyvoice/transformer/decoder.py",
    "chars": 16591,
    "preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n#               2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
  },
  {
    "path": "cosyvoice/transformer/decoder_layer.py",
    "chars": 4807,
    "preview": "# Copyright (c) 2019 Shigeki Karita\n#               2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
  },
  {
    "path": "cosyvoice/transformer/embedding.py",
    "chars": 11316,
    "preview": "# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)\n#               2024 Alibaba Inc (Xiang Lyu)\n#\n# License"
  },
  {
    "path": "cosyvoice/transformer/encoder.py",
    "chars": 21401,
    "preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)\n#"
  },
  {
    "path": "cosyvoice/transformer/encoder_layer.py",
    "chars": 9589,
    "preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n#               2022 Xingchen Song (sxc19@mails.tsinghua.edu.cn)\n#"
  },
  {
    "path": "cosyvoice/transformer/label_smoothing_loss.py",
    "chars": 3459,
    "preview": "# Copyright (c) 2019 Shigeki Karita\n#               2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
  },
  {
    "path": "cosyvoice/transformer/positionwise_feed_forward.py",
    "chars": 4219,
    "preview": "# Copyright (c) 2019 Shigeki Karita\n#               2020 Mobvoi Inc (Binbin Zhang)\n#\n# Licensed under the Apache License"
  },
  {
    "path": "cosyvoice/transformer/subsampling.py",
    "chars": 12666,
    "preview": "# Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)\n#               2024 Alibaba Inc (Xiang Lyu)\n#\n# Licensed under th"
  },
  {
    "path": "cosyvoice/utils/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "cosyvoice/utils/class_utils.py",
    "chars": 2582,
    "preview": "# Copyright [2023-11-28] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>\n#            2024 Alibaba Inc (authors: Xiang Lyu)"
  },
  {
    "path": "cosyvoice/utils/common.py",
    "chars": 3414,
    "preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n#               2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under "
  },
  {
    "path": "cosyvoice/utils/executor.py",
    "chars": 5114,
    "preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n#               2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licensed under "
  },
  {
    "path": "cosyvoice/utils/file_utils.py",
    "chars": 1839,
    "preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n#               2024 Alibaba Inc (authors: Xiang Lyu)\n#\n# Licen"
  },
  {
    "path": "cosyvoice/utils/frontend_utils.py",
    "chars": 4000,
    "preview": "# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Zhihao Du)\n#\n# Licensed under the Apache License, Version 2.0 (the"
  },
  {
    "path": "cosyvoice/utils/mask.py",
    "chars": 8351,
    "preview": "# Copyright (c) 2019 Shigeki Karita\n#               2020 Mobvoi Inc (Binbin Zhang)\n#               2024 Alibaba Inc (aut"
  },
  {
    "path": "cosyvoice/utils/scheduler.py",
    "chars": 24940,
    "preview": "# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)\n#               2022 Ximalaya Inc (Yuguang Yang)\n#               2024 Ali"
  },
  {
    "path": "cosyvoice/utils/train_utils.py",
    "chars": 11863,
    "preview": "# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)\n#               2023 Horizon Inc. (authors: Xingchen Song)\n#   "
  },
  {
    "path": "data/batch_files.csv",
    "chars": 815,
    "preview": "speaker_prompt_audio_filename,speaker,speaker_prompt_text_transcription,content_to_synthesize,output_audio_filename\nexam"
  },
  {
    "path": "openai_api_inference.py",
    "chars": 387,
    "preview": "from pathlib import Path\n\nimport openai\n\nclient = openai.Client(base_url=\"http://localhost:8080\", api_key=\"sk-template\")"
  },
  {
    "path": "requirements.txt",
    "chars": 908,
    "preview": "--extra-index-url https://download.pytorch.org/whl/cu118\nconformer==0.3.2\ndeepspeed==0.14.2; sys_platform == 'linux'\ndif"
  },
  {
    "path": "results/.gitkeep",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "run_batch_inference.sh",
    "chars": 352,
    "preview": "#!/bin/bash\n\n# Default parameters\nCSV_FILE=\"data/batch_files.csv\"\nSPEAKER_PROMPT_AUDIO_FOLDER=\"data\"\nOUTPUT_AUDIO_FOLDER"
  },
  {
    "path": "run_single_inference.sh",
    "chars": 269,
    "preview": "python3 single_inference.py --speaker_prompt_audio_path \"data/example.wav\" --speaker_prompt_text_transcription \"在密碼學中,加密"
  },
  {
    "path": "single_inference.py",
    "chars": 20953,
    "preview": "import argparse\r\nimport os\r\nimport sys\r\nimport re\r\nfrom functools import partial\r\nimport time\r\n\r\nimport torch\r\ntorch.set"
  },
  {
    "path": "third_party/Matcha-TTS/LICENSE",
    "chars": 1069,
    "preview": "MIT License\n\nCopyright (c) 2023 Shivam Mehta\n\nPermission is hereby granted, free of charge, to any person obtaining a co"
  },
  {
    "path": "third_party/Matcha-TTS/MANIFEST.in",
    "chars": 352,
    "preview": "include README.md\ninclude LICENSE.txt\ninclude requirements.*.txt\ninclude *.cff\ninclude requirements.txt\ninclude matcha/V"
  },
  {
    "path": "third_party/Matcha-TTS/Makefile",
    "chars": 1155,
    "preview": "\nhelp:  ## Show help\n\t@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = \":.*?## \"}; {printf \"\\033["
  },
  {
    "path": "third_party/Matcha-TTS/README.md",
    "chars": 8647,
    "preview": "<div align=\"center\">\n\n# 🍵 Matcha-TTS: A fast TTS architecture with conditional flow matching\n\n### [Shivam Mehta](https:/"
  },
  {
    "path": "third_party/Matcha-TTS/configs/__init__.py",
    "chars": 81,
    "preview": "# this file is needed here to include configs when building project as a package\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/callbacks/default.yaml",
    "chars": 97,
    "preview": "defaults:\n  - model_checkpoint.yaml\n  - model_summary.yaml\n  - rich_progress_bar.yaml\n  - _self_\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/callbacks/model_checkpoint.yaml",
    "chars": 1199,
    "preview": "# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html\n\nmodel_checkpoint:\n  _ta"
  },
  {
    "path": "third_party/Matcha-TTS/configs/callbacks/model_summary.yaml",
    "chars": 252,
    "preview": "# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html\n\nmodel_summary:\n  _targ"
  },
  {
    "path": "third_party/Matcha-TTS/configs/callbacks/none.yaml",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/configs/callbacks/rich_progress_bar.yaml",
    "chars": 172,
    "preview": "# https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html\n\nrich_progress_bar:\n  _t"
  },
  {
    "path": "third_party/Matcha-TTS/configs/data/hi-fi_en-US_female.yaml",
    "chars": 472,
    "preview": "defaults:\n  - ljspeech\n  - _self_\n\n# Dataset URL: https://ast-astrec.nict.go.jp/en/release/hi-fi-captain/\n_target_: matc"
  },
  {
    "path": "third_party/Matcha-TTS/configs/data/ljspeech.yaml",
    "chars": 520,
    "preview": "_target_: matcha.data.text_mel_datamodule.TextMelDataModule\nname: ljspeech\ntrain_filelist_path: data/filelists/ljs_audio"
  },
  {
    "path": "third_party/Matcha-TTS/configs/data/vctk.yaml",
    "chars": 385,
    "preview": "defaults:\n  - ljspeech\n  - _self_\n\n_target_: matcha.data.text_mel_datamodule.TextMelDataModule\nname: vctk\ntrain_filelist"
  },
  {
    "path": "third_party/Matcha-TTS/configs/debug/default.yaml",
    "chars": 903,
    "preview": "# @package _global_\n\n# default debugging setup, runs 1 full epoch\n# other debugging configs can inherit from this one\n\n#"
  },
  {
    "path": "third_party/Matcha-TTS/configs/debug/fdr.yaml",
    "chars": 120,
    "preview": "# @package _global_\n\n# runs 1 train, 1 validation and 1 test step\n\ndefaults:\n  - default\n\ntrainer:\n  fast_dev_run: true\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/debug/limit.yaml",
    "chars": 218,
    "preview": "# @package _global_\n\n# uses only 1% of the training data and 5% of validation/test data\n\ndefaults:\n  - default\n\ntrainer:"
  },
  {
    "path": "third_party/Matcha-TTS/configs/debug/overfit.yaml",
    "chars": 204,
    "preview": "# @package _global_\n\n# overfits to 3 batches\n\ndefaults:\n  - default\n\ntrainer:\n  max_epochs: 20\n  overfit_batches: 3\n\n# m"
  },
  {
    "path": "third_party/Matcha-TTS/configs/debug/profiler.yaml",
    "chars": 225,
    "preview": "# @package _global_\n\n# runs with execution time profiling\n\ndefaults:\n  - default\n\ntrainer:\n  max_epochs: 1\n  # profiler:"
  },
  {
    "path": "third_party/Matcha-TTS/configs/eval.yaml",
    "chars": 335,
    "preview": "# @package _global_\n\ndefaults:\n  - _self_\n  - data: mnist # choose datamodule with `test_dataloader()` for evaluation\n  "
  },
  {
    "path": "third_party/Matcha-TTS/configs/experiment/hifi_dataset_piper_phonemizer.yaml",
    "chars": 423,
    "preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n  - override"
  },
  {
    "path": "third_party/Matcha-TTS/configs/experiment/ljspeech.yaml",
    "chars": 332,
    "preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n  - override"
  },
  {
    "path": "third_party/Matcha-TTS/configs/experiment/ljspeech_min_memory.yaml",
    "chars": 361,
    "preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n  - override"
  },
  {
    "path": "third_party/Matcha-TTS/configs/experiment/multispeaker.yaml",
    "chars": 336,
    "preview": "# @package _global_\n\n# to execute this experiment run:\n# python train.py experiment=multispeaker\n\ndefaults:\n  - override"
  },
  {
    "path": "third_party/Matcha-TTS/configs/extras/default.yaml",
    "chars": 232,
    "preview": "# disable python warnings if they annoy you\nignore_warnings: False\n\n# ask user for tags if none are provided in the conf"
  },
  {
    "path": "third_party/Matcha-TTS/configs/hparams_search/mnist_optuna.yaml",
    "chars": 1818,
    "preview": "# @package _global_\n\n# example hyperparameter optimization of some experiment with Optuna:\n# python train.py -m hparams_"
  },
  {
    "path": "third_party/Matcha-TTS/configs/hydra/default.yaml",
    "chars": 608,
    "preview": "# https://hydra.cc/docs/configure_hydra/intro/\n\n# enable color logging\ndefaults:\n  - override hydra_logging: colorlog\n  "
  },
  {
    "path": "third_party/Matcha-TTS/configs/local/.gitkeep",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/aim.yaml",
    "chars": 1267,
    "preview": "# https://aimstack.io/\n\n# example usage in lightning module:\n# https://github.com/aimhubio/aim/blob/main/examples/pytorc"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/comet.yaml",
    "chars": 372,
    "preview": "# https://www.comet.ml\n\ncomet:\n  _target_: lightning.pytorch.loggers.comet.CometLogger\n  api_key: ${oc.env:COMET_API_TOK"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/csv.yaml",
    "chars": 157,
    "preview": "# csv logger built in lightning\n\ncsv:\n  _target_: lightning.pytorch.loggers.csv_logs.CSVLogger\n  save_dir: \"${paths.outp"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/many_loggers.yaml",
    "chars": 118,
    "preview": "# train with many loggers at once\n\ndefaults:\n  # - comet\n  - csv\n  # - mlflow\n  # - neptune\n  - tensorboard\n  - wandb\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/mlflow.yaml",
    "chars": 339,
    "preview": "# https://mlflow.org\n\nmlflow:\n  _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger\n  # experiment_name: \"\"\n  # run_"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/neptune.yaml",
    "chars": 277,
    "preview": "# https://neptune.ai\n\nneptune:\n  _target_: lightning.pytorch.loggers.neptune.NeptuneLogger\n  api_key: ${oc.env:NEPTUNE_A"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/tensorboard.yaml",
    "chars": 258,
    "preview": "# https://www.tensorflow.org/tensorboard/\n\ntensorboard:\n  _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLog"
  },
  {
    "path": "third_party/Matcha-TTS/configs/logger/wandb.yaml",
    "chars": 522,
    "preview": "# https://wandb.ai\n\nwandb:\n  _target_: lightning.pytorch.loggers.wandb.WandbLogger\n  # name: \"\" # name of the run (norma"
  },
  {
    "path": "third_party/Matcha-TTS/configs/model/cfm/default.yaml",
    "chars": 40,
    "preview": "name: CFM\nsolver: euler\nsigma_min: 1e-4\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/model/decoder/default.yaml",
    "chars": 119,
    "preview": "channels: [256, 256]\ndropout: 0.05\nattention_head_dim: 64\nn_blocks: 1\nnum_mid_blocks: 2\nnum_heads: 2\nact_fn: snakebeta\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/model/encoder/default.yaml",
    "chars": 417,
    "preview": "encoder_type: RoPE Encoder\nencoder_params:\n  n_feats: ${model.n_feats}\n  n_channels: 192\n  filter_channels: 768\n  filter"
  },
  {
    "path": "third_party/Matcha-TTS/configs/model/matcha.yaml",
    "chars": 328,
    "preview": "defaults:\n  - _self_\n  - encoder: default.yaml\n  - decoder: default.yaml\n  - cfm: default.yaml\n  - optimizer: adam.yaml\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/model/optimizer/adam.yaml",
    "chars": 70,
    "preview": "_target_: torch.optim.Adam\n_partial_: true\nlr: 1e-4\nweight_decay: 0.0\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/paths/default.yaml",
    "chars": 632,
    "preview": "# path to root directory\n# this requires PROJECT_ROOT environment variable to exist\n# you can replace it with \".\" if you"
  },
  {
    "path": "third_party/Matcha-TTS/configs/train.yaml",
    "chars": 1557,
    "preview": "# @package _global_\n\n# specify here default configuration\n# order of defaults determines the order in which configs over"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/cpu.yaml",
    "chars": 51,
    "preview": "defaults:\n  - default\n\naccelerator: cpu\ndevices: 1\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/ddp.yaml",
    "chars": 104,
    "preview": "defaults:\n  - default\n\nstrategy: ddp\n\naccelerator: gpu\ndevices: [0,1]\nnum_nodes: 1\nsync_batchnorm: True\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/ddp_sim.yaml",
    "chars": 115,
    "preview": "defaults:\n  - default\n\n# simulate DDP on CPU, useful for debugging\naccelerator: cpu\ndevices: 2\nstrategy: ddp_spawn\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/default.yaml",
    "chars": 439,
    "preview": "_target_: lightning.pytorch.trainer.Trainer\n\ndefault_root_dir: ${paths.output_dir}\n\nmax_epochs: -1\n\naccelerator: gpu\ndev"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/gpu.yaml",
    "chars": 51,
    "preview": "defaults:\n  - default\n\naccelerator: gpu\ndevices: 1\n"
  },
  {
    "path": "third_party/Matcha-TTS/configs/trainer/mps.yaml",
    "chars": 51,
    "preview": "defaults:\n  - default\n\naccelerator: mps\ndevices: 1\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/VERSION",
    "chars": 8,
    "preview": "0.0.5.1\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/app.py",
    "chars": 13981,
    "preview": "import tempfile\nfrom argparse import Namespace\nfrom pathlib import Path\n\nimport gradio as gr\nimport soundfile as sf\nimpo"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/cli.py",
    "chars": 15467,
    "preview": "import argparse\nimport datetime as dt\nimport os\nimport warnings\nfrom pathlib import Path\n\nimport matplotlib.pyplot as pl"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/data/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/data/components/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/data/text_mel_datamodule.py",
    "chars": 7555,
    "preview": "import random\nfrom typing import Any, Dict, Optional\n\nimport torch\nimport torchaudio as ta\nfrom lightning import Lightni"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/LICENSE",
    "chars": 1068,
    "preview": "MIT License\n\nCopyright (c) 2020 Jungil Kong\n\nPermission is hereby granted, free of charge, to any person obtaining a cop"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/README.md",
    "chars": 5570,
    "preview": "# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis\n\n### Jungil Kong, Jaehyeon "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/config.py",
    "chars": 779,
    "preview": "v1 = {\n    \"resblock\": \"1\",\n    \"num_gpus\": 0,\n    \"batch_size\": 16,\n    \"learning_rate\": 0.0004,\n    \"adam_b1\": 0.8,\n  "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/denoiser.py",
    "chars": 2644,
    "preview": "# Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/env.py",
    "chars": 429,
    "preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport os\nimport shutil\n\n\nclass AttrDict(dict):\n    def __init__(self, "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/meldataset.py",
    "chars": 6786,
    "preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport math\nimport os\nimport random\n\nimport numpy as np\nimport torch\nim"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/models.py",
    "chars": 11668,
    "preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/hifigan/xutils.py",
    "chars": 1396,
    "preview": "\"\"\" from https://github.com/jik876/hifi-gan \"\"\"\n\nimport glob\nimport os\n\nimport matplotlib\nimport torch\nfrom torch.nn.uti"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/baselightningmodule.py",
    "chars": 7003,
    "preview": "\"\"\"\nThis is a base lightning module that can be used to train a model.\nThe benefit of this abstraction is that all the l"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/components/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/components/decoder.py",
    "chars": 14459,
    "preview": "import math\nfrom typing import Optional\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom conform"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/components/flow_matching.py",
    "chars": 4657,
    "preview": "from abc import ABC\n\nimport torch\nimport torch.nn.functional as F\n\nfrom matcha.models.components.decoder import Decoder\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/components/text_encoder.py",
    "chars": 14845,
    "preview": "\"\"\" from https://github.com/jaywalnut310/glow-tts \"\"\"\n\nimport math\n\nimport torch\nimport torch.nn as nn\nfrom einops impor"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/components/transformer.py",
    "chars": 13235,
    "preview": "from typing import Any, Dict, Optional\n\nimport torch\nimport torch.nn as nn\nfrom diffusers.models.attention import (\n    "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/models/matcha_tts.py",
    "chars": 10056,
    "preview": "import datetime as dt\nimport math\nimport random\n\nimport torch\n\nimport matcha.utils.monotonic_align as monotonic_align\nfr"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/onnx/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/matcha/onnx/export.py",
    "chars": 5377,
    "preview": "import argparse\nimport random\nfrom pathlib import Path\n\nimport numpy as np\nimport torch\nfrom lightning import LightningM"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/onnx/infer.py",
    "chars": 6287,
    "preview": "import argparse\nimport os\nimport warnings\nfrom pathlib import Path\nfrom time import perf_counter\n\nimport numpy as np\nimp"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/text/__init__.py",
    "chars": 1696,
    "preview": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\nfrom matcha.text import cleaners\nfrom matcha.text.symbols import symbo"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/text/cleaners.py",
    "chars": 3560,
    "preview": "\"\"\" from https://github.com/keithito/tacotron\n\nCleaners are transformations that run over the input text at both trainin"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/text/numbers.py",
    "chars": 2248,
    "preview": "\"\"\" from https://github.com/keithito/tacotron \"\"\"\n\nimport re\n\nimport inflect\n\n_inflect = inflect.engine()\n_comma_number_"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/text/symbols.py",
    "chars": 509,
    "preview": "\"\"\" from https://github.com/keithito/tacotron\n\nDefines the set of symbols used in text input to the model.\n\"\"\"\n_pad = \"_"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/train.py",
    "chars": 4613,
    "preview": "from typing import Any, Dict, List, Optional, Tuple\n\nimport hydra\nimport lightning as L\nimport rootutils\nfrom lightning "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/__init__.py",
    "chars": 326,
    "preview": "from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers\nfrom matcha.utils.logging_utils import"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/audio.py",
    "chars": 2282,
    "preview": "import numpy as np\nimport torch\nimport torch.utils.data\nfrom librosa.filters import mel as librosa_mel_fn\nfrom scipy.io."
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/generate_data_statistics.py",
    "chars": 3269,
    "preview": "r\"\"\"\nThe file creates a pickle file where the values needed for loading of dataset is stored and the model can load it\nw"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/instantiators.py",
    "chars": 1828,
    "preview": "from typing import List\n\nimport hydra\nfrom lightning import Callback\nfrom lightning.pytorch.loggers import Logger\nfrom o"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/logging_utils.py",
    "chars": 1711,
    "preview": "from typing import Any, Dict\n\nfrom lightning.pytorch.utilities import rank_zero_only\nfrom omegaconf import OmegaConf\n\nfr"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/model.py",
    "chars": 2935,
    "preview": "\"\"\" from https://github.com/jaywalnut310/glow-tts \"\"\"\n\nimport numpy as np\nimport torch\n\n\ndef sequence_mask(length, max_l"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/__init__.py",
    "chars": 646,
    "preview": "import numpy as np\nimport torch\n\nfrom matcha.utils.monotonic_align.core import maximum_path_c\n\n\ndef maximum_path(value, "
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/core.c",
    "chars": 867828,
    "preview": "/* Generated by Cython 0.29.35 */\n\n/* BEGIN: Cython Metadata\n{\n    \"distutils\": {\n        \"depends\": [],\n        \"name\":"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/core.pyx",
    "chars": 1236,
    "preview": "import numpy as np\n\ncimport cython\ncimport numpy as np\n\nfrom cython.parallel import prange\n\n\n@cython.boundscheck(False)\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/monotonic_align/setup.py",
    "chars": 207,
    "preview": "# from distutils.core import setup\n# from Cython.Build import cythonize\n# import numpy\n\n# setup(name='monotonic_align',\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/pylogger.py",
    "chars": 720,
    "preview": "import logging\n\nfrom lightning.pytorch.utilities import rank_zero_only\n\n\ndef get_pylogger(name: str = __name__) -> loggi"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/rich_utils.py",
    "chars": 3279,
    "preview": "from pathlib import Path\nfrom typing import Sequence\n\nimport rich\nimport rich.syntax\nimport rich.tree\nfrom hydra.core.hy"
  },
  {
    "path": "third_party/Matcha-TTS/matcha/utils/utils.py",
    "chars": 7159,
    "preview": "import os\nimport sys\nimport warnings\nfrom importlib.util import find_spec\nfrom pathlib import Path\nfrom typing import An"
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/PKG-INFO",
    "chars": 9844,
    "preview": "Metadata-Version: 2.1\nName: matcha-tts\nVersion: 0.0.5.1\nSummary: 🍵 Matcha-TTS: A fast TTS architecture with conditional "
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/SOURCES.txt",
    "chars": 1518,
    "preview": "LICENSE\nMANIFEST.in\nREADME.md\npyproject.toml\nrequirements.txt\nsetup.py\nconfigs/__init__.py\nmatcha/VERSION\nmatcha/__init_"
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/dependency_links.txt",
    "chars": 1,
    "preview": "\n"
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/entry_points.txt",
    "chars": 142,
    "preview": "[console_scripts]\nmatcha-data-stats = matcha.utils.generate_data_statistics:main\nmatcha-tts = matcha.cli:cli\nmatcha-tts-"
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/requires.txt",
    "chars": 381,
    "preview": "torch>=2.0.0\ntorchvision>=0.15.0\nlightning>=2.0.0\ntorchmetrics>=0.11.4\nhydra-core==1.3.2\nhydra-colorlog==1.2.0\nhydra-opt"
  },
  {
    "path": "third_party/Matcha-TTS/matcha_tts.egg-info/top_level.txt",
    "chars": 15,
    "preview": "configs\nmatcha\n"
  },
  {
    "path": "third_party/Matcha-TTS/notebooks/.gitkeep",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/Matcha-TTS/pyproject.toml",
    "chars": 982,
    "preview": "[build-system]\nrequires = [\"setuptools\", \"wheel\", \"cython==0.29.35\", \"numpy==1.24.3\", \"packaging\"]\n\n[tool.black]\nline-le"
  },
  {
    "path": "third_party/Matcha-TTS/requirements.txt",
    "chars": 904,
    "preview": "# --------- pytorch --------- #\ntorch>=2.0.0\ntorchvision>=0.15.0\nlightning>=2.0.0\ntorchmetrics>=0.11.4\n\n# --------- hydr"
  },
  {
    "path": "third_party/Matcha-TTS/scripts/schedule.sh",
    "chars": 207,
    "preview": "#!/bin/bash\n# Schedule execution of many runs\n# Run from root folder with: bash scripts/schedule.sh\n\npython src/train.py"
  },
  {
    "path": "third_party/Matcha-TTS/setup.py",
    "chars": 1527,
    "preview": "#!/usr/bin/env python\nimport os\n\nimport numpy\nfrom Cython.Build import cythonize\nfrom setuptools import Extension, find_"
  },
  {
    "path": "third_party/Matcha-TTS/synthesis.ipynb",
    "chars": 590014,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"id\": \"f37f4e3b-f764-4502-a6a2-6417bd9bfab9\",\n   \"metadata\": {},\n   \"so"
  },
  {
    "path": "utils/word_utils.py",
    "chars": 1624178,
    "preview": "from collections import Counter, defaultdict\n\nalways_augment_chars = {\"長\"}\n\nchar2phn = {\n    \"〇\": [\n        \"ㄌㄧㄥ2\",\n    "
  }
]
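
The entries above share one schema per file: "path", "chars", and a truncated "preview" string. Below is a minimal sketch of how the downloaded .json export could be inspected before feeding anything to a model, assuming the export keeps that schema and is saved locally under the hypothetical name breezyvoice_preview.json:

import json

# Hypothetical local filename; use whatever name the downloaded .json export actually has.
PREVIEW_PATH = "breezyvoice_preview.json"

with open(PREVIEW_PATH, encoding="utf-8") as f:
    entries = json.load(f)  # list of {"path": str, "chars": int, "preview": str}

# Largest files first, e.g. to decide what to skip before pasting into an LLM context.
for entry in sorted(entries, key=lambda e: e["chars"], reverse=True)[:5]:
    print(f"{entry['chars']:>9,}  {entry['path']}")

# Look up the snippet for one specific file.
single = next(e for e in entries if e["path"] == "single_inference.py")
print(single["preview"])

Sorting by "chars" makes it easy to spot the largest entries; in this listing, utils/word_utils.py, the generated Cython file core.c, and synthesis.ipynb together account for most of the 3.5 MB.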

About this extraction

This page contains the full source code of the mtkresearch/BreezyVoice GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction covers 158 files (3.5 MB, approximately 913.5k tokens) and includes a symbol index of 1094 functions, classes, methods, constants, and types. Use it with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input; you can copy the full output to your clipboard or download it as a .txt file.
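
Because the full dump is far larger than most model context windows, it is often more practical to hand a tool only the files relevant to a question. A small sketch of one way to do that, assuming the .txt export marks each file with the same "FILE: <path>" header lines used in the symbol index above (an assumption; verify against the downloaded file) and is saved under the hypothetical name breezyvoice_full.txt:

import re

# Hypothetical local filename for the downloaded full-text export.
DUMP_PATH = "breezyvoice_full.txt"

with open(DUMP_PATH, encoding="utf-8") as f:
    text = f.read()

# Assumes each per-file section starts with a line of the form "FILE: path/to/file",
# matching the headers seen in the symbol index; adjust the pattern if the export differs.
headers = list(re.finditer(r"^FILE: (.+)$", text, flags=re.MULTILINE))
sections = {}
for i, match in enumerate(headers):
    start = match.end()
    end = headers[i + 1].start() if i + 1 < len(headers) else len(text)
    sections[match.group(1).strip()] = text[start:end].strip()

# Paste only the pieces a question needs, e.g. the single-utterance inference script.
snippet = sections.get("single_inference.py", "")
print(snippet[:500])

Slicing the dump this way keeps prompts well within context limits while preserving the file paths the repository uses.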

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.
