Repository: RVC-Boss/GPT-SoVITS
Branch: main
Commit: 2d9193b0d3c0
Files: 256
Total size: 26.8 MB
Directory structure:
gitextract_whvdqko7/
├── .dockerignore
├── .github/
│ ├── build_windows_packages.ps1
│ └── workflows/
│ ├── build_windows_packages.yaml
│ └── docker-publish.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── Colab-Inference.ipynb
├── Colab-WebUI.ipynb
├── Docker/
│ ├── install_wrapper.sh
│ └── miniforge_install.sh
├── Dockerfile
├── GPT_SoVITS/
│ ├── AR/
│ │ ├── __init__.py
│ │ ├── data/
│ │ │ ├── __init__.py
│ │ │ ├── bucket_sampler.py
│ │ │ ├── data_module.py
│ │ │ └── dataset.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── t2s_lightning_module.py
│ │ │ ├── t2s_lightning_module_onnx.py
│ │ │ ├── t2s_model.py
│ │ │ ├── t2s_model_onnx.py
│ │ │ └── utils.py
│ │ ├── modules/
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── activation_onnx.py
│ │ │ ├── embedding.py
│ │ │ ├── embedding_onnx.py
│ │ │ ├── lr_schedulers.py
│ │ │ ├── optim.py
│ │ │ ├── patched_mha_with_cache.py
│ │ │ ├── patched_mha_with_cache_onnx.py
│ │ │ ├── scaling.py
│ │ │ ├── transformer.py
│ │ │ └── transformer_onnx.py
│ │ ├── text_processing/
│ │ │ ├── __init__.py
│ │ │ ├── phonemizer.py
│ │ │ └── symbols.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── initialize.py
│ │ └── io.py
│ ├── BigVGAN/
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── activations.py
│ │ ├── alias_free_activation/
│ │ │ ├── cuda/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── activation1d.py
│ │ │ │ ├── anti_alias_activation.cpp
│ │ │ │ ├── anti_alias_activation_cuda.cu
│ │ │ │ ├── compat.h
│ │ │ │ ├── load.py
│ │ │ │ └── type_shim.h
│ │ │ └── torch/
│ │ │ ├── __init__.py
│ │ │ ├── act.py
│ │ │ ├── filter.py
│ │ │ └── resample.py
│ │ ├── bigvgan.py
│ │ ├── configs/
│ │ │ ├── bigvgan_22khz_80band.json
│ │ │ ├── bigvgan_24khz_100band.json
│ │ │ ├── bigvgan_base_22khz_80band.json
│ │ │ ├── bigvgan_base_24khz_100band.json
│ │ │ ├── bigvgan_v2_22khz_80band_256x.json
│ │ │ ├── bigvgan_v2_22khz_80band_fmax8k_256x.json
│ │ │ ├── bigvgan_v2_24khz_100band_256x.json
│ │ │ ├── bigvgan_v2_44khz_128band_256x.json
│ │ │ └── bigvgan_v2_44khz_128band_512x.json
│ │ ├── discriminators.py
│ │ ├── env.py
│ │ ├── incl_licenses/
│ │ │ ├── LICENSE_1
│ │ │ ├── LICENSE_2
│ │ │ ├── LICENSE_3
│ │ │ ├── LICENSE_4
│ │ │ ├── LICENSE_5
│ │ │ ├── LICENSE_6
│ │ │ ├── LICENSE_7
│ │ │ └── LICENSE_8
│ │ ├── inference.py
│ │ ├── inference_e2e.py
│ │ ├── loss.py
│ │ ├── meldataset.py
│ │ ├── nv-modelcard++/
│ │ │ ├── .gitkeep
│ │ │ ├── bias.md
│ │ │ ├── explainability.md
│ │ │ ├── overview.md
│ │ │ ├── privacy.md
│ │ │ └── safety.md
│ │ ├── requirements.txt
│ │ ├── tests/
│ │ │ ├── test_activation.py
│ │ │ ├── test_activation_snake_beta.py
│ │ │ └── test_cuda_vs_torch_model.py
│ │ ├── train.py
│ │ └── utils0.py
│ ├── TTS_infer_pack/
│ │ ├── TTS.py
│ │ ├── TextPreprocessor.py
│ │ ├── __init__.py
│ │ └── text_segmentation_method.py
│ ├── configs/
│ │ ├── .gitignore
│ │ ├── s2.json
│ │ ├── s2v2Pro.json
│ │ └── s2v2ProPlus.json
│ ├── download.py
│ ├── eres2net/
│ │ ├── ERes2Net.py
│ │ ├── ERes2NetV2.py
│ │ ├── ERes2Net_huge.py
│ │ ├── fusion.py
│ │ ├── kaldi.py
│ │ └── pooling_layers.py
│ ├── export_torch_script.py
│ ├── export_torch_script_v3v4.py
│ ├── f5_tts/
│ │ └── model/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── README.md
│ │ │ ├── dit.py
│ │ │ ├── mmdit.py
│ │ │ └── unett.py
│ │ └── modules.py
│ ├── feature_extractor/
│ │ ├── __init__.py
│ │ ├── cnhubert.py
│ │ └── whisper_enc.py
│ ├── inference_cli.py
│ ├── inference_gui.py
│ ├── inference_webui.py
│ ├── inference_webui_fast.py
│ ├── module/
│ │ ├── __init__.py
│ │ ├── attentions.py
│ │ ├── attentions_onnx.py
│ │ ├── commons.py
│ │ ├── core_vq.py
│ │ ├── data_utils.py
│ │ ├── ddp_utils.py
│ │ ├── distrib.py
│ │ ├── losses.py
│ │ ├── mel_processing.py
│ │ ├── models.py
│ │ ├── models_onnx.py
│ │ ├── modules.py
│ │ ├── mrte_model.py
│ │ ├── quantize.py
│ │ └── transforms.py
│ ├── onnx_export.py
│ ├── prepare_datasets/
│ │ ├── 1-get-text.py
│ │ ├── 2-get-hubert-wav32k.py
│ │ ├── 2-get-sv.py
│ │ └── 3-get-semantic.py
│ ├── pretrained_models/
│ │ └── .gitignore
│ ├── process_ckpt.py
│ ├── s1_train.py
│ ├── s2_train.py
│ ├── s2_train_v3.py
│ ├── s2_train_v3_lora.py
│ ├── stream_v2pro.py
│ ├── sv.py
│ ├── text/
│ │ ├── .gitignore
│ │ ├── LangSegmenter/
│ │ │ ├── __init__.py
│ │ │ └── langsegmenter.py
│ │ ├── __init__.py
│ │ ├── cantonese.py
│ │ ├── chinese.py
│ │ ├── chinese2.py
│ │ ├── cleaner.py
│ │ ├── cmudict-fast.rep
│ │ ├── cmudict.rep
│ │ ├── en_normalization/
│ │ │ └── expend.py
│ │ ├── engdict-hot.rep
│ │ ├── engdict_cache.pickle
│ │ ├── english.py
│ │ ├── g2pw/
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ ├── g2pw.py
│ │ │ ├── onnx_api.py
│ │ │ ├── polyphonic-fix.rep
│ │ │ ├── polyphonic.pickle
│ │ │ ├── polyphonic.rep
│ │ │ └── utils.py
│ │ ├── ja_userdic/
│ │ │ └── userdict.csv
│ │ ├── japanese.py
│ │ ├── korean.py
│ │ ├── namedict_cache.pickle
│ │ ├── opencpop-strict.txt
│ │ ├── symbols.py
│ │ ├── symbols2.py
│ │ ├── tone_sandhi.py
│ │ └── zh_normalization/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── char_convert.py
│ │ ├── chronology.py
│ │ ├── constants.py
│ │ ├── num.py
│ │ ├── phonecode.py
│ │ ├── quantifier.py
│ │ └── text_normlization.py
│ └── utils.py
├── LICENSE
├── README.md
├── api.py
├── api_v2.py
├── config.py
├── docker-compose.yaml
├── docker_build.sh
├── docs/
│ ├── cn/
│ │ ├── Changelog_CN.md
│ │ └── README.md
│ ├── en/
│ │ └── Changelog_EN.md
│ ├── ja/
│ │ ├── Changelog_JA.md
│ │ └── README.md
│ ├── ko/
│ │ ├── Changelog_KO.md
│ │ └── README.md
│ └── tr/
│ ├── Changelog_TR.md
│ └── README.md
├── extra-req.txt
├── go-webui.bat
├── go-webui.ps1
├── install.ps1
├── install.sh
├── requirements.txt
├── tools/
│ ├── AP_BWE_main/
│ │ ├── 24kto48k/
│ │ │ └── readme.txt
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── datasets1/
│ │ │ ├── __init__.py
│ │ │ └── dataset.py
│ │ └── models/
│ │ ├── __init__.py
│ │ └── model.py
│ ├── __init__.py
│ ├── asr/
│ │ ├── config.py
│ │ ├── fasterwhisper_asr.py
│ │ ├── funasr_asr.py
│ │ └── models/
│ │ └── .gitignore
│ ├── assets.py
│ ├── audio_sr.py
│ ├── cmd-denoise.py
│ ├── denoise-model/
│ │ └── .gitignore
│ ├── i18n/
│ │ ├── i18n.py
│ │ ├── locale/
│ │ │ ├── en_US.json
│ │ │ ├── es_ES.json
│ │ │ ├── fr_FR.json
│ │ │ ├── it_IT.json
│ │ │ ├── ja_JP.json
│ │ │ ├── ko_KR.json
│ │ │ ├── pt_BR.json
│ │ │ ├── ru_RU.json
│ │ │ ├── tr_TR.json
│ │ │ ├── zh_CN.json
│ │ │ ├── zh_HK.json
│ │ │ ├── zh_SG.json
│ │ │ └── zh_TW.json
│ │ └── scan_i18n.py
│ ├── my_utils.py
│ ├── slice_audio.py
│ ├── slicer2.py
│ ├── subfix_webui.py
│ └── uvr5/
│ ├── bs_roformer/
│ │ ├── __init__.py
│ │ ├── attend.py
│ │ ├── bs_roformer.py
│ │ └── mel_band_roformer.py
│ ├── bsroformer.py
│ ├── mdxnet.py
│ ├── uvr5_weights/
│ │ └── .gitignore
│ ├── vr.py
│ └── webui.py
└── webui.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
GPT_SoVITS/pretrained_models/*
tools/asr/models/*
tools/uvr5/uvr5_weights/*
.git
.DS_Store
.vscode
*.pyc
env
runtime
.idea
output
logs
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
# Byte-compiled / optimized / DLL files
__pycache__/
**/__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
================================================
FILE: .github/build_windows_packages.ps1
================================================
$ErrorActionPreference = "Stop"
Write-Host "Current location: $(Get-Location)"
$cuda = $env:TORCH_CUDA
if (-not $cuda) {
Write-Error "Missing TORCH_CUDA env (cu124 or cu128)"
exit 1
}
$date = $env:DATE_SUFFIX
if ([string]::IsNullOrWhiteSpace($date)) {
$date = Get-Date -Format "MMdd"
}
$pkgName = "GPT-SoVITS-$date"
$tmpDir = "tmp"
$srcDir = $PWD
$suffix = $env:PKG_SUFFIX
if (-not [string]::IsNullOrWhiteSpace($suffix)) {
$pkgName = "$pkgName$suffix"
}
$pkgName = "$pkgName-$cuda"
$baseHF = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main"
$PRETRAINED_URL = "$baseHF/pretrained_models.zip"
$G2PW_URL = "$baseHF/G2PWModel.zip"
$UVR5_URL = "$baseHF/uvr5_weights.zip"
$NLTK_URL = "$baseHF/nltk_data.zip"
$JTALK_URL = "$baseHF/open_jtalk_dic_utf_8-1.11.tar.gz"
$PYTHON_VERSION = "3.11.12"
$PY_RELEASE_VERSION = "20250409"
Write-Host "[INFO] Cleaning .git..."
Remove-Item "$srcDir\.git" -Recurse -Force -ErrorAction SilentlyContinue
Write-Host "[INFO] Creating tmp dir..."
New-Item -ItemType Directory -Force -Path $tmpDir
Write-Host "[INFO] System Python version:"
python --version
python -m site
Write-Host "[INFO] Downloading Python $PYTHON_VERSION..."
$zst = "$tmpDir\python.tar.zst"
Invoke-WebRequest "https://github.com/astral-sh/python-build-standalone/releases/download/$PY_RELEASE_VERSION/cpython-$PYTHON_VERSION+$PY_RELEASE_VERSION-x86_64-pc-windows-msvc-pgo-full.tar.zst" -OutFile $zst
& "C:\Program Files\7-Zip\7z.exe" e $zst -o"$tmpDir" -aoa
$tar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $tar.FullName -o"$tmpDir\extracted" -aoa
Move-Item "$tmpDir\extracted\python\install" "$srcDir\runtime"
Write-Host "[INFO] Copying Redistributing Visual C++ Runtime..."
$vswhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe"
$vsPath = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath
$redistRoot = Join-Path $vsPath "VC\Redist\MSVC"
$targetVer = Get-ChildItem -Path $redistRoot -Directory |
Where-Object { $_.Name -match "^14\." } |
Sort-Object Name -Descending |
Select-Object -First 1
$x64Path = Join-Path $targetVer.FullName "x64"
Get-ChildItem -Path $x64Path -Directory | Where-Object {
$_.Name -match '^Microsoft\..*\.(CRT|OpenMP)$'
} | ForEach-Object {
Get-ChildItem -Path $_.FullName -Filter "*.dll" | ForEach-Object {
Copy-Item -Path $_.FullName -Destination "$srcDir\runtime" -Force
}
}
function DownloadAndUnzip($url, $targetRelPath) {
$filename = Split-Path $url -Leaf
$tmpZip = "$tmpDir\$filename"
Invoke-WebRequest $url -OutFile $tmpZip
Expand-Archive -Path $tmpZip -DestinationPath $tmpDir -Force
$subdirName = $filename -replace '\.zip$', ''
$sourcePath = Join-Path $tmpDir $subdirName
$destRoot = Join-Path $srcDir $targetRelPath
$destPath = Join-Path $destRoot $subdirName
if (Test-Path $destPath) {
Remove-Item $destPath -Recurse -Force
}
Move-Item $sourcePath $destRoot
Remove-Item $tmpZip
}
Write-Host "[INFO] Download pretrained_models..."
DownloadAndUnzip $PRETRAINED_URL "GPT_SoVITS"
Write-Host "[INFO] Download G2PWModel..."
DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"
Write-Host "[INFO] Download UVR5 model..."
DownloadAndUnzip $UVR5_URL "tools\uvr5"
Write-Host "[INFO] Downloading funasr..."
$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
$funasrZip = "$tmpDir\funasr.zip"
Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
Remove-Item $funasrZip
Write-Host "[INFO] Download ffmpeg..."
$ffUrl = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
$ffZip = "$tmpDir\ffmpeg.zip"
Invoke-WebRequest -Uri $ffUrl -OutFile $ffZip
Expand-Archive $ffZip -DestinationPath $tmpDir -Force
$ffDir = Get-ChildItem -Directory "$tmpDir" | Where-Object { $_.Name -like "ffmpeg*" } | Select-Object -First 1
Move-Item "$($ffDir.FullName)\bin\ffmpeg.exe" "$srcDir\runtime"
Move-Item "$($ffDir.FullName)\bin\ffprobe.exe" "$srcDir\runtime"
Remove-Item $ffZip
Remove-Item $ffDir.FullName -Recurse -Force
Write-Host "[INFO] Installing PyTorch..."
& ".\runtime\python.exe" -m ensurepip
& ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location
switch ($cuda) {
"cu124" {
& ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location
}
"cu128" {
& ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location
}
default {
Write-Error "Unsupported CUDA version: $cuda"
exit 1
}
}
Write-Host "[INFO] Installing dependencies..."
& ".\runtime\python.exe" -m pip install -r extra-req.txt --no-deps --no-warn-script-location
& ".\runtime\python.exe" -m pip install -r requirements.txt --no-warn-script-location
Write-Host "[INFO] Downloading NLTK and pyopenjtalk dictionary..."
$PYTHON = ".\runtime\python.exe"
$prefix = & $PYTHON -c "import sys; print(sys.prefix)"
$jtalkPath = & $PYTHON -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))"
$nltkZip = "$tmpDir\nltk_data.zip"
$jtalkTar = "$tmpDir\open_jtalk_dic_utf_8-1.11.tar.gz"
Invoke-WebRequest -Uri $NLTK_URL -OutFile $nltkZip
Expand-Archive -Path $nltkZip -DestinationPath $prefix -Force
Remove-Item $nltkZip
Invoke-WebRequest -Uri $JTALK_URL -OutFile $jtalkTar
& "C:\Program Files\7-Zip\7z.exe" e $jtalkTar -o"$tmpDir" -aoa
$innerTar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1
& "C:\Program Files\7-Zip\7z.exe" x $innerTar.FullName -o"$jtalkPath" -aoa
Remove-Item $jtalkTar
Remove-Item $innerTar.FullName
Write-Host "[INFO] Preparing final directory $pkgName ..."
$items = @(Get-ChildItem -Filter "*.sh") +
@(Get-ChildItem -Filter "*.ipynb") +
@("$tmpDir", ".github", "Docker", "docs", ".gitignore", ".dockerignore", "README.md")
Remove-Item $items -Force -Recurse -ErrorAction SilentlyContinue
$curr = Get-Location
Set-Location ../
Get-ChildItem .
Copy-Item -Path $curr -Destination $pkgName -Recurse
$7zPath = "$pkgName.7z"
$start = Get-Date
Write-Host "Compress Starting at $start"
& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
$end = Get-Date
Write-Host "Elapsed time: $($end - $start)"
Get-ChildItem .
python -m pip install --upgrade pip
python -m pip install "modelscope" "huggingface_hub[hf_transfer]" --no-warn-script-location
Write-Host "[INFO] Uploading to ModelScope..."
$msUser = $env:MODELSCOPE_USERNAME
$msToken = $env:MODELSCOPE_TOKEN
if (-not $msUser -or -not $msToken) {
Write-Error "Missing MODELSCOPE_USERNAME or MODELSCOPE_TOKEN"
exit 1
}
modelscope upload "$msUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $msToken
Write-Host "[SUCCESS] Uploaded: $7zPath to ModelScope"
Write-Host "[INFO] Uploading to HuggingFace..."
$hfUser = $env:HUGGINGFACE_USERNAME
$hfToken = $env:HUGGINGFACE_TOKEN
if (-not $hfUser -or -not $hfToken) {
Write-Error "Missing HUGGINGFACE_USERNAME or HUGGINGFACE_TOKEN"
exit 1
}
$env:HF_HUB_ENABLE_HF_TRANSFER = "1"
huggingface-cli upload "$hfUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $hfToken
Write-Host "[SUCCESS] Uploaded: $7zPath to HuggingFace"
================================================
FILE: .github/workflows/build_windows_packages.yaml
================================================
name: Build and Upload Windows Package

on:
  workflow_dispatch:
    inputs:
      date:
        description: "Date suffix (optional)"
        required: false
        default: ""
      suffix:
        description: "Package name suffix (optional)"
        required: false
        default: ""

jobs:
  build:
    runs-on: windows-latest
    strategy:
      matrix:
        torch_cuda: [cu124, cu128]
    env:
      TORCH_CUDA: ${{ matrix.torch_cuda }}
      MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }}
      MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }}
      HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }}
      HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
      DATE_SUFFIX: ${{ github.event.inputs.date }}
      PKG_SUFFIX: ${{ github.event.inputs.suffix }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Build and Upload Script
        shell: pwsh
        run: |
          Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1
          ../build_windows_packages.ps1
================================================
FILE: .github/workflows/docker-publish.yaml
================================================
name: Build and Publish Docker Image

on:
  workflow_dispatch:

jobs:
  generate-meta:
    runs-on: ubuntu-22.04
    outputs:
      tag: ${{ steps.meta.outputs.tag }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Generate Tag
        id: meta
        run: |
          DATE=$(date +'%Y%m%d')
          COMMIT=$(git rev-parse --short=6 HEAD)
          echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT

  build-amd64:
    needs: generate-meta
    runs-on: ubuntu-22.04
    environment: Docker
    strategy:
      matrix:
        include:
          - cuda_version: 12.6
            lite: true
            torch_base: lite
            tag_prefix: cu126-lite
          - cuda_version: 12.6
            lite: false
            torch_base: full
            tag_prefix: cu126
          - cuda_version: 12.8
            lite: true
            torch_base: lite
            tag_prefix: cu128-lite
          - cuda_version: 12.8
            lite: false
            torch_base: full
            tag_prefix: cu128
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Free up disk space
        run: |
          echo "Before cleanup:"
          df -h
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /opt/hostedtoolcache/PyPy
          sudo rm -rf /opt/hostedtoolcache/go
          sudo rm -rf /opt/hostedtoolcache/node
          sudo rm -rf /opt/hostedtoolcache/Ruby
          sudo rm -rf /opt/microsoft
          sudo rm -rf /opt/pipx
          sudo rm -rf /opt/az
          sudo rm -rf /opt/google
          sudo rm -rf /usr/lib/jvm
          sudo rm -rf /usr/lib/google-cloud-sdk
          sudo rm -rf /usr/lib/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /usr/local/julia1.11.5
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/share/miniconda
          sudo rm -rf /usr/share/az_12.1.0
          sudo rm -rf /usr/share/dotnet
          echo "After cleanup:"
          df -h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Build and Push Docker Image (amd64)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile
          push: true
          platforms: linux/amd64
          build-args: |
            LITE=${{ matrix.lite }}
            TORCH_BASE=${{ matrix.torch_base }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            WORKFLOW=true
          tags: |
            xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64
            xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64

  build-arm64:
    needs: generate-meta
    runs-on: ubuntu-22.04-arm
    environment: Docker
    strategy:
      matrix:
        include:
          - cuda_version: 12.6
            lite: true
            torch_base: lite
            tag_prefix: cu126-lite
          - cuda_version: 12.6
            lite: false
            torch_base: full
            tag_prefix: cu126
          - cuda_version: 12.8
            lite: true
            torch_base: lite
            tag_prefix: cu128-lite
          - cuda_version: 12.8
            lite: false
            torch_base: full
            tag_prefix: cu128
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4

      - name: Free up disk space
        run: |
          echo "Before cleanup:"
          df -h
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          sudo rm -rf /opt/hostedtoolcache/PyPy
          sudo rm -rf /opt/hostedtoolcache/go
          sudo rm -rf /opt/hostedtoolcache/node
          sudo rm -rf /opt/hostedtoolcache/Ruby
          sudo rm -rf /opt/microsoft
          sudo rm -rf /opt/pipx
          sudo rm -rf /opt/az
          sudo rm -rf /opt/google
          sudo rm -rf /usr/lib/jvm
          sudo rm -rf /usr/lib/google-cloud-sdk
          sudo rm -rf /usr/lib/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /usr/local/.ghcup
          sudo rm -rf /usr/local/julia1.11.5
          sudo rm -rf /usr/local/share/powershell
          sudo rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/share/swift
          sudo rm -rf /usr/share/miniconda
          sudo rm -rf /usr/share/az_12.1.0
          sudo rm -rf /usr/share/dotnet
          echo "After cleanup:"
          df -h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Build and Push Docker Image (arm64)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./Dockerfile
          push: true
          platforms: linux/arm64
          build-args: |
            LITE=${{ matrix.lite }}
            TORCH_BASE=${{ matrix.torch_base }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            WORKFLOW=true
          tags: |
            xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64
            xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64

  merge-and-clean:
    needs:
      - build-amd64
      - build-arm64
      - generate-meta
    runs-on: ubuntu-latest
    strategy:
      matrix:
        include:
          - tag_prefix: cu126-lite
          - tag_prefix: cu126
          - tag_prefix: cu128-lite
          - tag_prefix: cu128
    environment: Docker
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Merge amd64 and arm64 into multi-arch image
        run: |
          DATE_TAG=${{ needs.generate-meta.outputs.tag }}
          TAG_PREFIX=${{ matrix.tag_prefix }}

          docker buildx imagetools create \
            --tag ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \
            ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \
            ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64

          docker buildx imagetools create \
            --tag ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \
            ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \
            ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64

      - name: Delete old platform-specific tags via Docker Hub API
        env:
          DOCKER_HUB_USERNAME: ${{ vars.DOCKER_HUB_USERNAME }}
          DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }}
          TAG_PREFIX: ${{ matrix.tag_prefix }}
          DATE_TAG: ${{ needs.generate-meta.outputs.tag }}
        run: |
          sudo apt-get update && sudo apt-get install -y jq

          TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \
            "https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \
            | jq -r .token)

          for PLATFORM in amd64 arm64; do
            SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g')
            TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}"
            LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}"
            for DEL_TAG in "$TAG" "$LATEST_TAG"; do
              echo "Deleting tag: $DEL_TAG"
              curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG
            done
          done

  create-default:
    runs-on: ubuntu-latest
    needs:
      - merge-and-clean
    environment: Docker
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ vars.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_PASSWORD }}

      - name: Create Default Tag
        run: |
          docker buildx imagetools create \
            --tag ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \
            ${{ vars.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite
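
# Verification note (editor's addition): the merged manifest can be inspected
# from any machine with Buildx; the tag below is an illustrative instance of
# the tagging scheme used above:
#   docker buildx imagetools inspect xxxxrt666/gpt-sovits:latest-cu126-lite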
================================================
FILE: .gitignore
================================================
.DS_Store
.vscode
__pycache__
*.pyc
env
runtime
.idea
output
logs
SoVITS_weights*/
GPT_weights*/
TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
tools/AP_BWE_main/24kto48k/*
!tools/AP_BWE_main/24kto48k/readme.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
================================================
FILE: .pre-commit-config.yaml
================================================
ci:
  autoupdate_schedule: monthly

repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.11.7
    hooks:
      # Run the linter.
      - id: ruff
        types_or: [python, pyi]
        args: [--fix, "--exit-zero"]
      # Run the formatter.
      - id: ruff-format
        types_or: [python, pyi]
        args: [--line-length, "120", --target-version, "py311"]
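
# Usage note (editor's addition): the hooks can be exercised locally with the
# standard pre-commit CLI:
#   pre-commit install          # register the git hook once per clone
#   pre-commit run --all-files  # lint and format the whole tree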
================================================
FILE: Colab-Inference.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# GPT-SoVITS Infer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Env Setup (Run Once Only)\n",
"## 环境配置, 只需运行一次"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"%%writefile /content/setup.sh\n",
"set -e\n",
"\n",
"cd /content\n",
"\n",
"git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"\n",
"cd GPT-SoVITS\n",
"\n",
"mkdir -p GPT_weights\n",
"\n",
"mkdir -p SoVITS_weights\n",
"\n",
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
" :\n",
"else\n",
" conda create -n GPTSoVITS python=3.10 -y\n",
"fi\n",
"\n",
"source activate GPTSoVITS\n",
"\n",
"pip install ipykernel\n",
"\n",
"bash install.sh --device CU126 --source HF"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"%pip install -q condacolab\n",
"import condacolab\n",
"condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n",
"!cd /content && bash setup.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download From HuggingFace"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [
"# Modify These\n",
"USER_ID = \"AkitoP\"\n",
"REPO_NAME = \"GPT-SoVITS-v2-aegi\"\n",
"BRANCH = \"main\"\n",
"GPT_PATH = \"new_aegigoe-e100.ckpt\"\n",
"SOVITS_PATH = \"new_aegigoe_e60_s32220.pth\"\n",
"\n",
"# Do Not Modify\n",
"HF_BASE = \"https://huggingface.co\"\n",
"REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n",
"GPT_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{GPT_PATH}\"\n",
"SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/blob/{BRANCH}/{SOVITS_PATH}\"\n",
"\n",
"!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n",
"!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download From ModelScope"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Modify These\n",
"USER_ID = \"aihobbyist\"\n",
"REPO_NAME = \"GPT-SoVits-V2-models\"\n",
"BRANCH = \"master\"\n",
"GPT_PATH = \"Genshin_Impact/EN/GPT_GenshinImpact_EN_5.1.ckpt\"\n",
"SOVITS_PATH = \"Wuthering_Waves/CN/SV_WutheringWaves_CN_1.3.pth\"\n",
"\n",
"# Do Not Modify\n",
"HF_BASE = \"https://www.modelscope.cn/models\"\n",
"REPO_ID = f\"{USER_ID}/{REPO_NAME}\"\n",
"GPT_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{GPT_PATH}\"\n",
"SOVITS_URL = f\"{HF_BASE}/{REPO_ID}/resolve/{BRANCH}/{SOVITS_PATH}\"\n",
"\n",
"!cd \"/content/GPT-SoVITS/GPT_weights\" && wget \"{GPT_URL}\"\n",
"!cd \"/content/GPT-SoVITS/SoVITS_weights\" && wget \"{SOVITS_URL}\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Launch WebUI\n",
"# 启动 WebUI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
================================================
FILE: Colab-WebUI.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# GPT-SoVITS WebUI"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_o6a8GS2lWQM"
},
"source": [
"## Env Setup (Run Once Only)\n",
"## 环境配置, 只需运行一次"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile /content/setup.sh\n",
"set -e\n",
"\n",
"cd /content\n",
"\n",
"git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"\n",
"cd GPT-SoVITS\n",
"\n",
"if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
" :\n",
"else\n",
" conda create -n GPTSoVITS python=3.10 -y\n",
"fi\n",
"\n",
"source activate GPTSoVITS\n",
"\n",
"pip install ipykernel\n",
"\n",
"bash install.sh --device CU126 --source HF --download-uvr5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -q condacolab\n",
"import condacolab\n",
"condacolab.install_from_url(\"https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh\")\n",
"!cd /content && bash setup.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch WebUI\n",
"## 启动 WebUI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
================================================
FILE: Docker/install_wrapper.sh
================================================
#!/bin/bash
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1
cd .. || exit 1
set -e
source "$HOME/conda/etc/profile.d/conda.sh"
mkdir -p GPT_SoVITS
mkdir -p GPT_SoVITS/text
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
TERM=dumb bash install.sh --device "CU${CUDA_VERSION//./}" --source HF
pip cache purge
pip show torch
rm -rf /tmp/* /var/tmp/*
rm -rf "$HOME/conda/pkgs"
mkdir -p "$HOME/conda/pkgs"
rm -rf /root/.conda /root/.cache
================================================
FILE: Docker/miniforge_install.sh
================================================
#!/bin/bash

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "$SCRIPT_DIR" || exit 1
cd .. || exit 1

if [ -d "$HOME/conda" ]; then
    exit 0
fi

WORKFLOW=${WORKFLOW:-"false"}
TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"}

if [ "$WORKFLOW" = "true" ]; then
    WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
else
    WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404)
fi

if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
    "${WGET_CMD[@]}" -O Miniforge.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
    SYSROOT_PKG="sysroot_linux-64>=2.28"
elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then
    "${WGET_CMD[@]}" -O Miniforge.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
    SYSROOT_PKG="sysroot_linux-aarch64>=2.28"
else
    exit 1
fi

LOG_PATH="/tmp/miniforge-install.log"

# Run the installer inside the condition so the failure branch is reachable
# even with `set -e` active.
if bash Miniforge.sh -b -p "$HOME/conda" >"$LOG_PATH" 2>&1; then
    echo "== Miniforge Installed =="
else
    echo "Failed to Install miniforge"
    tail -n 50 "$LOG_PATH"
    exit 1
fi

rm Miniforge.sh

source "$HOME/conda/etc/profile.d/conda.sh"

"$HOME/conda/bin/conda" init bash
source "$HOME/.bashrc"

"$HOME/conda/bin/conda" info
"$HOME/conda/bin/conda" update --all -y
"$HOME/conda/bin/conda" install python=3.12 -y
"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y

if [ "$CUDA_VERSION" = "12.8" ]; then
    "$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
    "$HOME/conda/bin/conda" install cuda-nvcc=12.8 -y
elif [ "$CUDA_VERSION" = "12.6" ]; then
    "$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
    "$HOME/conda/bin/conda" install cuda-nvcc=12.6 -y
fi

export PATH="$HOME/conda/bin:$PATH"

"$HOME/conda/bin/pip" install psutil ninja packaging wheel "setuptools>=42" einops
"$HOME/conda/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
"$HOME/conda/bin/pip" cache purge

rm $LOG_PATH
rm -rf "$HOME/conda/pkgs"
mkdir -p "$HOME/conda/pkgs"
rm -rf "$HOME/.conda" "$HOME/.cache"
================================================
FILE: Dockerfile
================================================
ARG CUDA_VERSION=12.6
ARG TORCH_BASE=full
FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}
LABEL maintainer="XXXXRT"
LABEL version="V2 Pro"
LABEL description="Docker image for GPT-SoVITS"
ARG CUDA_VERSION=12.6
ENV CUDA_VERSION=${CUDA_VERSION}
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace/GPT-SoVITS
COPY Docker /workspace/GPT-SoVITS/Docker/
ARG LITE=false
ENV LITE=${LITE}
ARG WORKFLOW=false
ENV WORKFLOW=${WORKFLOW}
ARG TARGETPLATFORM
ENV TARGETPLATFORM=${TARGETPLATFORM}
COPY extra-req.txt /workspace/GPT-SoVITS/
COPY requirements.txt /workspace/GPT-SoVITS/
COPY install.sh /workspace/GPT-SoVITS/
RUN bash Docker/install_wrapper.sh
EXPOSE 9871 9872 9873 9874 9880
ENV PYTHONPATH="/workspace/GPT-SoVITS"
RUN conda init bash && echo "conda activate base" >> ~/.bashrc
WORKDIR /workspace
RUN rm -rf /workspace/GPT-SoVITS
WORKDIR /workspace/GPT-SoVITS
COPY . /workspace/GPT-SoVITS
CMD ["/bin/bash", "-c", "\
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
exec bash"]
================================================
FILE: GPT_SoVITS/AR/__init__.py
================================================
================================================
FILE: GPT_SoVITS/AR/data/__init__.py
================================================
================================================
FILE: GPT_SoVITS/AR/data/bucket_sampler.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py
# reference: https://github.com/lifeiteng/vall-e
import itertools
import math
import random
from random import shuffle
from typing import Iterator, Optional, TypeVar
import torch
import torch.distributed as dist
from torch.utils.data import Dataset, Sampler
__all__ = [
    "DistributedBucketSampler",
]

T_co = TypeVar("T_co", covariant=True)


class DistributedBucketSampler(Sampler[T_co]):
    r"""
    sort the dataset wrt. input length
    divide samples into buckets
    sort within buckets
    divide buckets into batches
    sort batches
    """

    def __init__(
        self,
        dataset: Dataset,
        num_replicas: Optional[int] = None,
        rank: Optional[int] = None,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
        batch_size: int = 32,
    ) -> None:
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size() if torch.cuda.is_available() else 1
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank() if torch.cuda.is_available() else 0
        if torch.cuda.is_available():
            torch.cuda.set_device(rank)
        if rank >= num_replicas or rank < 0:
            raise ValueError("Invalid rank {}, rank should be in the interval [0, {}]".format(rank, num_replicas - 1))
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.drop_last = drop_last
        # If the dataset length is evenly divisible by # of replicas, then there
        # is no need to drop any data, since the dataset will be split equally.
        if self.drop_last and len(self.dataset) % self.num_replicas != 0:  # type: ignore[arg-type]
            # Split to nearest available length that is evenly divisible.
            # This is to ensure each rank receives the same amount of data when
            # using this Sampler.
            self.num_samples = math.ceil(
                (len(self.dataset) - self.num_replicas) / self.num_replicas,  # type: ignore[arg-type]
            )
        else:
            self.num_samples = math.ceil(
                len(self.dataset) / self.num_replicas,
            )  # type: ignore[arg-type]
        self.total_size = self.num_samples * self.num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.batch_size = batch_size
        self.id_with_length = self._get_sample_lengths()
        self.id_buckets = self.make_buckets(bucket_width=2.0)

    def _get_sample_lengths(self):
        id_with_lengths = []
        for i in range(len(self.dataset)):
            id_with_lengths.append((i, self.dataset.get_sample_length(i)))
        id_with_lengths.sort(key=lambda x: x[1])
        return id_with_lengths

    def make_buckets(self, bucket_width: float = 2.0):
        buckets = []
        cur = []
        max_sec = bucket_width
        for id, sec in self.id_with_length:
            if sec < max_sec:
                cur.append(id)
            else:
                buckets.append(cur)
                cur = [id]
                max_sec += bucket_width
        if len(cur) > 0:
            buckets.append(cur)
        return buckets

    def __iter__(self) -> Iterator[T_co]:
        if self.shuffle:
            # deterministically shuffle based on epoch and seed
            g = torch.Generator()
            g.manual_seed(self.seed + self.epoch)
            random.seed(self.epoch + self.seed)
            shuffled_bucket = []
            for buc in self.id_buckets:
                buc_copy = buc.copy()
                shuffle(buc_copy)
                shuffled_bucket.append(buc_copy)
            grouped_batch_size = self.batch_size * self.num_replicas
            shuffled_bucket = list(itertools.chain(*shuffled_bucket))
            n_batch = int(math.ceil(len(shuffled_bucket) / grouped_batch_size))
            batches = [shuffled_bucket[b * grouped_batch_size : (b + 1) * grouped_batch_size] for b in range(n_batch)]
            shuffle(batches)
            indices = list(itertools.chain(*batches))
        else:
            # type: ignore[arg-type]
            indices = list(range(len(self.dataset)))

        if not self.drop_last:
            # add extra samples to make it evenly divisible
            padding_size = self.total_size - len(indices)
            if padding_size <= len(indices):
                indices += indices[:padding_size]
            else:
                indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
        else:
            # remove tail of data to make it evenly divisible.
            indices = indices[: self.total_size]
        assert len(indices) == self.total_size

        # subsample
        indices = indices[self.rank : self.total_size : self.num_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self) -> int:
        return self.num_samples

    def set_epoch(self, epoch: int) -> None:
        r"""
        Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        Args:
            epoch (int): Epoch number.
        """
        self.epoch = epoch
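

# --- Usage sketch (editor's addition, not part of the upstream file) ---
# A minimal single-process example; _ToyDataset and the lengths are purely
# illustrative. Passing num_replicas/rank explicitly avoids requiring an
# initialized torch.distributed process group.
if __name__ == "__main__":

    class _ToyDataset(Dataset):
        def __init__(self, lengths):
            self.lengths = lengths

        def __len__(self):
            return len(self.lengths)

        def __getitem__(self, idx):
            return idx

        def get_sample_length(self, idx):
            # duration in seconds, used for bucketing
            return self.lengths[idx]

    _ds = _ToyDataset([0.5, 1.2, 3.1, 2.4, 7.8, 4.0])
    _sampler = DistributedBucketSampler(_ds, num_replicas=1, rank=0, batch_size=2)
    for _epoch in range(2):
        _sampler.set_epoch(_epoch)  # deterministic reshuffle each epoch
        print(list(iter(_sampler)))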
================================================
FILE: GPT_SoVITS/AR/data/data_module.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py
# reference: https://github.com/lifeiteng/vall-e
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader
from AR.data.bucket_sampler import DistributedBucketSampler
from AR.data.dataset import Text2SemanticDataset
class Text2SemanticDataModule(LightningDataModule):
    def __init__(
        self,
        config,
        train_semantic_path,
        train_phoneme_path,
        dev_semantic_path=None,
        dev_phoneme_path=None,
    ):
        super().__init__()
        self.config = config
        self.train_semantic_path = train_semantic_path
        self.train_phoneme_path = train_phoneme_path
        self.dev_semantic_path = dev_semantic_path
        self.dev_phoneme_path = dev_phoneme_path
        self.num_workers = self.config["data"]["num_workers"]

    def prepare_data(self):
        pass

    def setup(self, stage=None, output_logs=False):
        self._train_dataset = Text2SemanticDataset(
            phoneme_path=self.train_phoneme_path,
            semantic_path=self.train_semantic_path,
            max_sec=self.config["data"]["max_sec"],
            pad_val=self.config["data"]["pad_val"],
        )
        self._dev_dataset = self._train_dataset
        # self._dev_dataset = Text2SemanticDataset(
        #     phoneme_path=self.dev_phoneme_path,
        #     semantic_path=self.dev_semantic_path,
        #     max_sample=self.config['data']['max_eval_sample'],
        #     max_sec=self.config['data']['max_sec'],
        #     pad_val=self.config['data']['pad_val'])

    def train_dataloader(self):
        batch_size = (
            self.config["train"]["batch_size"] // 2
            if self.config["train"].get("if_dpo", False) is True
            else self.config["train"]["batch_size"]
        )
        batch_size = max(min(batch_size, len(self._train_dataset) // 4), 1)  # keep the batch count high enough that checkpoints still get saved
        sampler = DistributedBucketSampler(self._train_dataset, batch_size=batch_size)
        return DataLoader(
            self._train_dataset,
            batch_size=batch_size,
            sampler=sampler,
            collate_fn=self._train_dataset.collate,
            num_workers=self.num_workers,
            persistent_workers=True,
            prefetch_factor=16,
        )

    def val_dataloader(self):
        return DataLoader(
            self._dev_dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=self._train_dataset.collate,
            num_workers=max(self.num_workers, 12),
            persistent_workers=True,
            prefetch_factor=16,
        )

    # Is this ever used?
    def test_dataloader(self):
        return DataLoader(
            self._dev_dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=self._train_dataset.collate,
        )
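

# Editor's note: the keys this module reads from `config` (the values shown
# are illustrative, not repository defaults):
#
#     config = {
#         "data": {"num_workers": 4, "max_sec": 54, "pad_val": 1024},
#         "train": {"batch_size": 32, "if_dpo": False},
#     }
#
# With if_dpo=True, train_dataloader() halves the configured batch size; the
# result is then clamped to at most len(dataset) // 4 and at least 1.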
================================================
FILE: GPT_SoVITS/AR/data/dataset.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py
# reference: https://github.com/lifeiteng/vall-e
# sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert")
import os
import traceback
from typing import Dict, List
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
version = os.environ.get("version", None)
from text import cleaned_text_to_sequence
# from config import exp_dir
def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value: int = 0):
    seq = sequences[0]
    ndim = seq.ndim
    if axis < 0:
        axis += ndim
    dtype = seq.dtype
    pad_value = dtype.type(pad_value)
    seq_lengths = [seq.shape[axis] for seq in sequences]
    max_length = np.max(seq_lengths)

    padded_sequences = []
    for seq, length in zip(sequences, seq_lengths):
        padding = [(0, 0)] * axis + [(0, max_length - length)] + [(0, 0)] * (ndim - axis - 1)
        padded_seq = np.pad(seq, padding, mode="constant", constant_values=pad_value)
        padded_sequences.append(padded_seq)
    batch = np.stack(padded_sequences)
    return batch
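

# Worked example (editor's addition): for two 1-D int64 arrays of lengths 3 and 5,
#
#     batch_sequences([np.array([1, 2, 3]), np.array([4, 5, 6, 7, 8])])
#
# right-pads the shorter array with 0 and stacks both into shape (2, 5):
#
#     [[1, 2, 3, 0, 0],
#      [4, 5, 6, 7, 8]]
#
# Text2SemanticDataset.collate below pads semantic ids the same way, but with
# pad_value=self.PAD (1024 by default).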
class Text2SemanticDataset(Dataset):
    """dataset class for text tokens to semantic model training."""

    def __init__(
        self,
        phoneme_path: str,
        semantic_path: str,
        max_sample: int = None,
        max_sec: int = 100,
        pad_val: int = 1024,
        # min value of phoneme/sec
        min_ps_ratio: int = 3,
        # max value of phoneme/sec
        max_ps_ratio: int = 25,
    ) -> None:
        super().__init__()

        self.semantic_data = pd.read_csv(
            semantic_path,
            delimiter="\t",
            encoding="utf-8",
        )
        # get dict
        self.path2 = phoneme_path  # "%s/2-name2text.txt"%exp_dir#phoneme_path
        self.path3 = "%s/3-bert" % (
            os.path.dirname(
                phoneme_path,
            )
        )  # "%s/3-bert"%exp_dir#bert_dir
        self.path6 = semantic_path  # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path6)

        self.phoneme_data = {}
        with open(self.path2, "r", encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")

        for line in lines:
            tmp = line.split("\t")
            if len(tmp) != 4:
                continue
            self.phoneme_data[tmp[0]] = [tmp[1], tmp[2], tmp[3]]

        # self.phoneme_data = np.load(phoneme_path, allow_pickle=True).item()
        # pad for semantic tokens
        self.PAD: int = pad_val
        # self.hz = 25
        # with open("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert/configs/s2.json", "r") as f:data = f.read()
        # data=json.loads(data)["model"]["semantic_frame_rate"]#50hz
        # self.hz=int(data[:-2])#
        self.hz = int(os.environ.get("hz", "25hz")[:-2])

        # max seconds of semantic token
        self.max_sec = max_sec
        self.min_ps_ratio = min_ps_ratio
        self.max_ps_ratio = max_ps_ratio

        if max_sample is not None:
            self.semantic_data = self.semantic_data[:max_sample]

        # {idx: (semantic, phoneme)}
        # semantic list, phoneme list
        self.semantic_phoneme = []
        self.item_names = []

        self.inited = False

        if not self.inited:
            # run the one-time initialization
            self.init_batch()
            self.inited = True
            del self.semantic_data
            del self.phoneme_data

        # self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
        # self.tokenizer = AutoTokenizer.from_pretrained("/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large")

    def init_batch(self):
        semantic_data_len = len(self.semantic_data)
        phoneme_data_len = len(self.phoneme_data.keys())
        print("semantic_data_len:", semantic_data_len)
        print("phoneme_data_len:", phoneme_data_len)
        print(self.semantic_data)
        idx = 0
        num_not_in = 0
        num_deleted_bigger = 0
        num_deleted_ps = 0
        for i in range(semantic_data_len):
            # walk the rows one by one
            # get str
            item_name = self.semantic_data.iloc[i, 0]
            # print(self.phoneme_data)
            try:
                phoneme, word2ph, text = self.phoneme_data[item_name]
            except Exception:
                traceback.print_exc()
                # print(f"{item_name} not in self.phoneme_data !")
                num_not_in += 1
                continue

            semantic_str = self.semantic_data.iloc[i, 1]
            # get token list
            semantic_ids = [int(idx) for idx in semantic_str.split(" ")]
            # (T); no need to reshape to (1, T), because we only need its length
            # filter out samples that are too long
            if (
                len(semantic_ids) > self.max_sec * self.hz
            ):  # 1: estimate the total duration from the token count and drop clips longer than max_sec (60s in the config); 40*25=1k
                num_deleted_bigger += 1
                continue
            # (T,); this is fast enough to do up front, no need to handle items one by one in __getitem__
            phoneme = phoneme.split(" ")
            try:
                phoneme_ids = cleaned_text_to_sequence(phoneme, version)
            except:
                traceback.print_exc()
                # print(f"{item_name} not in self.phoneme_data !")
                num_not_in += 1
                continue
            # if len(phoneme_ids) > 400:  # 2: replaced by a fixed limit of semantic/2.5
            if len(phoneme_ids) > self.max_sec * self.hz / 2.5:  # 2: fixed limit of semantic/2.5
                num_deleted_ps += 1
                continue
            # if len(semantic_ids) > 1000:  # 3
            #     num_deleted_bigger += 1
            #     continue

            ps_ratio = len(phoneme_ids) / (len(semantic_ids) / self.hz)

            if ps_ratio > self.max_ps_ratio or ps_ratio < self.min_ps_ratio:  # 4: phonemes per second must lie in [3, 25]
                num_deleted_ps += 1
                # print(item_name)
                continue

            self.semantic_phoneme.append((semantic_ids, phoneme_ids))
            idx += 1
            self.item_names.append(item_name)

        min_num = 100  # at 20, no replication at all; at 30, it replicated but checkpoints were still not saved
        leng = len(self.semantic_phoneme)
        if leng < min_num:
            tmp1 = self.semantic_phoneme
            tmp2 = self.item_names
            self.semantic_phoneme = []
            self.item_names = []
            for _ in range(max(2, int(min_num / leng))):
                self.semantic_phoneme += tmp1
                self.item_names += tmp2

        if num_not_in > 0:
            print(f"there are {num_not_in} semantic entries with no matching phoneme entry")
        if num_deleted_bigger > 0:
            print(
                f"deleted {num_deleted_bigger} audios whose duration is longer than {self.max_sec} seconds",
            )
        if num_deleted_ps > 0:
            # 4702 for LibriTTS; LibriTTS is annotated data. Does it need filtering? => yes, it contains extreme values of 100
            print(
                f"deleted {num_deleted_ps} audios whose phoneme/sec is bigger than {self.max_ps_ratio} or smaller than {self.min_ps_ratio}",
            )
        """
        there are 31 semantic entries with no matching phoneme entry
        deleted 34 audios whose duration is longer than 54 seconds
        deleted 3190 audios whose phoneme/sec is bigger than 25 or smaller than 3
        dataset.__len__(): 366463
        """
        # 345410 for LibriTTS
        print("dataset.__len__():", self.__len__())

    def __get_item_names__(self) -> List[str]:
        return self.item_names

    def __len__(self) -> int:
        return len(self.semantic_phoneme)

    def __getitem__(self, idx: int) -> Dict:
        semantic_ids, phoneme_ids = self.semantic_phoneme[idx]
        item_name = self.item_names[idx]
        phoneme_ids_len = len(phoneme_ids)
        # semantic tokens target
        semantic_ids_len = len(semantic_ids)

        flag = 0
        path_bert = "%s/%s.pt" % (self.path3, item_name)
        if os.path.exists(path_bert):
            bert_feature = torch.load(path_bert, map_location="cpu")
        else:
            flag = 1
        if flag == 1:
            # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
            bert_feature = None
        else:
            assert bert_feature.shape[-1] == len(phoneme_ids)
        return {
            "idx": idx,
            "phoneme_ids": phoneme_ids,
            "phoneme_ids_len": phoneme_ids_len,
            "semantic_ids": semantic_ids,
            "semantic_ids_len": semantic_ids_len,
            "bert_feature": bert_feature,
        }

    def get_sample_length(self, idx: int):
        semantic_ids = self.semantic_phoneme[idx][0]
        sec = 1.0 * len(semantic_ids) / self.hz
        return sec

    def collate(self, examples: List[Dict]) -> Dict:
        sample_index: List[int] = []
        phoneme_ids: List[torch.Tensor] = []
        phoneme_ids_lens: List[int] = []
        semantic_ids: List[torch.Tensor] = []
        semantic_ids_lens: List[int] = []
        # return

        for item in examples:
            sample_index.append(item["idx"])
            phoneme_ids.append(np.array(item["phoneme_ids"], dtype=np.int64))
            semantic_ids.append(np.array(item["semantic_ids"], dtype=np.int64))
            phoneme_ids_lens.append(item["phoneme_ids_len"])
            semantic_ids_lens.append(item["semantic_ids_len"])

        # pad 0
        phoneme_ids = batch_sequences(phoneme_ids)
        semantic_ids = batch_sequences(semantic_ids, pad_value=self.PAD)

        # convert each batch to torch.tensor
        phoneme_ids = torch.tensor(phoneme_ids)
        semantic_ids = torch.tensor(semantic_ids)
        phoneme_ids_lens = torch.tensor(phoneme_ids_lens)
        semantic_ids_lens = torch.tensor(semantic_ids_lens)
        bert_padded = torch.FloatTensor(len(examples), 1024, max(phoneme_ids_lens))
        bert_padded.zero_()

        for idx, item in enumerate(examples):
            bert = item["bert_feature"]
            if bert is not None:
                bert_padded[idx, :, : bert.shape[-1]] = bert

        return {
            # List[int]
            "ids": sample_index,
            # torch.Tensor (B, max_phoneme_length)
            "phoneme_ids": phoneme_ids,
            # torch.Tensor (B)
            "phoneme_ids_len": phoneme_ids_lens,
            # torch.Tensor (B, max_semantic_ids_length)
            "semantic_ids": semantic_ids,
            # torch.Tensor (B)
            "semantic_ids_len": semantic_ids_lens,
            # torch.Tensor (B, 1024, max_phoneme_length)
            "bert_feature": bert_padded,
        }
if __name__ == "__main__":
    root_dir = "/data/docker/liujing04/gpt-vits/prepare/dump_mix/"
    dataset = Text2SemanticDataset(
        phoneme_path=root_dir + "phoneme_train.npy",
        semantic_path=root_dir + "semantic_train.tsv",
    )

    batch_size = 12
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate,
        shuffle=False,
    )
    for i, batch in enumerate(dataloader):
        if i % 1000 == 0:
            print(i)
        # if i == 0:
        #     print('batch["ids"]:', batch["ids"])
        #     print('batch["phoneme_ids"]:', batch["phoneme_ids"],
        #           batch["phoneme_ids"].shape)
        #     print('batch["phoneme_ids_len"]:', batch["phoneme_ids_len"],
        #           batch["phoneme_ids_len"].shape)
        #     print('batch["semantic_ids"]:', batch["semantic_ids"],
        #           batch["semantic_ids"].shape)
        #     print('batch["semantic_ids_len"]:', batch["semantic_ids_len"],
        #           batch["semantic_ids_len"].shape)
================================================
FILE: GPT_SoVITS/AR/models/__init__.py
================================================
================================================
FILE: GPT_SoVITS/AR/models/t2s_lightning_module.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
# reference: https://github.com/lifeiteng/vall-e
import os
import sys
now_dir = os.getcwd()
sys.path.append(now_dir)
from typing import Dict
import torch
from pytorch_lightning import LightningModule
from AR.models.t2s_model import Text2SemanticDecoder
from AR.modules.lr_schedulers import WarmupCosineLRSchedule
from AR.modules.optim import ScaledAdam
class Text2SemanticLightningModule(LightningModule):
def __init__(self, config, output_dir, is_train=True):
super().__init__()
self.config = config
self.top_k = 3
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
pretrained_s1 = config.get("pretrained_s1")
if pretrained_s1 and is_train:
# print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
print(
self.load_state_dict(
torch.load(
pretrained_s1,
map_location="cpu",
weights_only=False,
)["weight"],
)
)
if is_train:
self.automatic_optimization = False
self.save_hyperparameters()
self.eval_dir = output_dir / "eval"
self.eval_dir.mkdir(parents=True, exist_ok=True)
def training_step(self, batch: Dict, batch_idx: int):
opt = self.optimizers()
scheduler = self.lr_schedulers()
        # use the DPO objective when configured; otherwise fall back to the plain CE loss
        forward = self.model.forward if self.config["train"].get("if_dpo", False) else self.model.forward_old
loss, acc = forward(
batch["phoneme_ids"],
batch["phoneme_ids_len"],
batch["semantic_ids"],
batch["semantic_ids_len"],
batch["bert_feature"],
)
self.manual_backward(loss)
        # accumulate gradients over 4 batches, then step optimizer and scheduler together
        if batch_idx > 0 and batch_idx % 4 == 0:
opt.step()
opt.zero_grad()
scheduler.step()
self.log(
"total_loss",
loss,
on_step=True,
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
self.log(
"lr",
scheduler.get_last_lr()[0],
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
self.log(
f"top_{self.top_k}_acc",
acc,
on_step=True,
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
def validation_step(self, batch: Dict, batch_idx: int):
return
# # get loss
# loss, acc = self.model.forward(
# batch['phoneme_ids'], batch['phoneme_ids_len'],
# batch['semantic_ids'], batch['semantic_ids_len'],
# batch['bert_feature']
# )
#
# self.log(
# "val_total_loss",
# loss,
# on_step=True,
# on_epoch=True,
# prog_bar=True,
# sync_dist=True)
# self.log(
# f"val_top_{self.top_k}_acc",
# acc,
# on_step=True,
# on_epoch=True,
# prog_bar=True,
# sync_dist=True)
#
# # get infer output
# semantic_len = batch['semantic_ids'].size(1)
# prompt_len = min(int(semantic_len * 0.5), 150)
# prompt = batch['semantic_ids'][:, :prompt_len]
# pred_semantic = self.model.infer(batch['phoneme_ids'],
# batch['phoneme_ids_len'], prompt,
# batch['bert_feature']
# )
# save_name = f'semantic_toks_{batch_idx}.pt'
# save_path = os.path.join(self.eval_dir, save_name)
# torch.save(pred_semantic.detach().cpu(), save_path)
def configure_optimizers(self):
model_parameters = self.model.parameters()
parameters_names = []
parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()])
lm_opt = ScaledAdam(
model_parameters,
lr=0.01,
betas=(0.9, 0.95),
clipping_scale=2.0,
parameters_names=parameters_names,
show_dominant_parameters=False,
clipping_update_period=1000,
)
return {
"optimizer": lm_opt,
"lr_scheduler": {
"scheduler": WarmupCosineLRSchedule(
lm_opt,
init_lr=self.config["optimizer"]["lr_init"],
peak_lr=self.config["optimizer"]["lr"],
end_lr=self.config["optimizer"]["lr_end"],
warmup_steps=self.config["optimizer"]["warmup_steps"],
total_steps=self.config["optimizer"]["decay_steps"],
)
},
}
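# --- Editor's illustration (hedged sketch, not part of the original file) ---
# training_step above uses Lightning's manual optimization to accumulate
# gradients over 4 batches before each optimizer/scheduler step; the loss is
# not divided by 4, so accumulated gradients are summed, not averaged. The
# same pattern in plain PyTorch, on a toy model:
def _demo_gradient_accumulation():
    model = torch.nn.Linear(4, 1)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    for batch_idx in range(9):
        loss = model(torch.randn(2, 4)).mean()
        loss.backward()  # gradients accumulate in .grad across batches
        if batch_idx > 0 and batch_idx % 4 == 0:  # steps at batches 4 and 8
            opt.step()
            opt.zero_grad()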
================================================
FILE: GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py
# reference: https://github.com/lifeiteng/vall-e
import os
import sys
now_dir = os.getcwd()
sys.path.append(now_dir)
from typing import Dict
import torch
from pytorch_lightning import LightningModule
from AR.models.t2s_model_onnx import Text2SemanticDecoder
from AR.modules.lr_schedulers import WarmupCosineLRSchedule
from AR.modules.optim import ScaledAdam
class Text2SemanticLightningModule(LightningModule):
def __init__(self, config, output_dir, is_train=True):
super().__init__()
self.config = config
self.top_k = 3
self.model = Text2SemanticDecoder(config=config, top_k=self.top_k)
pretrained_s1 = config.get("pretrained_s1")
if pretrained_s1 and is_train:
# print(self.load_state_dict(torch.load(pretrained_s1,map_location="cpu")["state_dict"]))
print(
self.load_state_dict(
torch.load(
pretrained_s1,
map_location="cpu",
)["weight"],
),
)
if is_train:
self.automatic_optimization = False
self.save_hyperparameters()
self.eval_dir = output_dir / "eval"
self.eval_dir.mkdir(parents=True, exist_ok=True)
def training_step(self, batch: Dict, batch_idx: int):
opt = self.optimizers()
scheduler = self.lr_schedulers()
loss, acc = self.model.forward(
batch["phoneme_ids"],
batch["phoneme_ids_len"],
batch["semantic_ids"],
batch["semantic_ids_len"],
batch["bert_feature"],
)
self.manual_backward(loss)
if batch_idx > 0 and batch_idx % 4 == 0:
opt.step()
opt.zero_grad()
scheduler.step()
self.log(
"total_loss",
loss,
on_step=True,
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
self.log(
"lr",
scheduler.get_last_lr()[0],
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
self.log(
f"top_{self.top_k}_acc",
acc,
on_step=True,
on_epoch=True,
prog_bar=True,
sync_dist=True,
)
def validation_step(self, batch: Dict, batch_idx: int):
return
def configure_optimizers(self):
model_parameters = self.model.parameters()
parameters_names = []
parameters_names.append([name_param_pair[0] for name_param_pair in self.model.named_parameters()])
lm_opt = ScaledAdam(
model_parameters,
lr=0.01,
betas=(0.9, 0.95),
clipping_scale=2.0,
parameters_names=parameters_names,
show_dominant_parameters=False,
clipping_update_period=1000,
)
return {
"optimizer": lm_opt,
"lr_scheduler": {
"scheduler": WarmupCosineLRSchedule(
lm_opt,
init_lr=self.config["optimizer"]["lr_init"],
peak_lr=self.config["optimizer"]["lr"],
end_lr=self.config["optimizer"]["lr_end"],
warmup_steps=self.config["optimizer"]["warmup_steps"],
total_steps=self.config["optimizer"]["decay_steps"],
)
},
}
================================================
FILE: GPT_SoVITS/AR/models/t2s_model.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
# reference: https://github.com/lifeiteng/vall-e
import math
from typing import List, Optional
import torch
from torch import nn
from torch.nn import functional as F
from torchmetrics.classification import MulticlassAccuracy
from tqdm import tqdm
from AR.models.utils import (
dpo_loss,
get_batch_logps,
make_pad_mask,
make_pad_mask_left,
make_reject_y,
sample,
topk_sampling,
)
from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding
from AR.modules.transformer import LayerNorm, TransformerEncoder, TransformerEncoderLayer
default_config = {
"embedding_dim": 512,
"hidden_dim": 512,
"num_head": 8,
"num_layers": 12,
"num_codebook": 8,
"p_dropout": 0.0,
"vocab_size": 1024 + 1,
"phoneme_vocab_size": 512,
"EOS": 1024,
}
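# --- Editor's note (hedged; `_example_model_config` is illustrative only) ---
# Text2SemanticDecoder below reads a *nested* config, config["model"][...], with
# key names that differ from the flat default_config above ("head" vs "num_head",
# "n_layer" vs "num_layers", "dropout" vs "p_dropout"). A minimal dict that
# satisfies its constructor:
_example_model_config = {
    "model": {
        "hidden_dim": 512,
        "embedding_dim": 512,
        "head": 8,
        "n_layer": 12,
        "vocab_size": 1024 + 1,
        "phoneme_vocab_size": 512,
        "dropout": 0.0,
        "EOS": 1024,  # must equal vocab_size - 1 (asserted in the constructor)
    }
}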
# @torch.jit.script  ## if enabled, the first inference becomes very slow, and inference speed is unstable
# Efficient implementation equivalent to the following:
def scaled_dot_product_attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
B, H, L, S = query.size(0), query.size(1), query.size(-2), key.size(-2)
if scale is None:
scale_factor = torch.tensor(1 / math.sqrt(query.size(-1)))
else:
scale_factor = scale
attn_bias = torch.zeros(B, H, L, S, dtype=query.dtype, device=query.device)
if attn_mask is not None:
if attn_mask.dtype == torch.bool:
attn_bias.masked_fill_(attn_mask, float("-inf"))
else:
attn_bias += attn_mask
attn_weight = query @ key.transpose(-2, -1) * scale_factor
attn_weight += attn_bias
attn_weight = torch.softmax(attn_weight, dim=-1)
if attn_mask is not None:
if attn_mask.dtype == torch.bool:
attn_weight.masked_fill_(attn_mask, 0)
else:
attn_mask[attn_mask != float("-inf")] = 0
attn_mask[attn_mask == float("-inf")] = 1
attn_weight.masked_fill_(attn_mask, 0)
return attn_weight @ value
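# --- Editor's illustration (hedged sketch, not part of the original file) ---
# Sanity check that the reference implementation above matches torch's fused
# kernel. Note the inverted boolean convention: here True means "masked out",
# while F.scaled_dot_product_attention expects True = "may attend", hence the
# `~mask` (exactly as at the call sites in T2SBlock below):
def _demo_sdpa_equivalence():
    torch.manual_seed(0)
    B, H, L, D = 2, 4, 8, 16
    q, k, v = (torch.randn(B, H, L, D) for _ in range(3))
    mask = torch.triu(torch.ones(L, L, dtype=torch.bool), diagonal=1)  # True = masked (causal)
    ref = scaled_dot_product_attention(q, k, v, mask)
    fused = F.scaled_dot_product_attention(q, k, v, ~mask)
    assert torch.allclose(ref, fused, atol=1e-5)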
@torch.jit.script
class T2SMLP:
def __init__(self, w1, b1, w2, b2):
self.w1 = w1
self.b1 = b1
self.w2 = w2
self.b2 = b2
def forward(self, x):
x = F.relu(F.linear(x, self.w1, self.b1))
x = F.linear(x, self.w2, self.b2)
return x
@torch.jit.script
class T2SBlock:
def __init__(
self,
num_heads,
hidden_dim: int,
mlp: T2SMLP,
qkv_w,
qkv_b,
out_w,
out_b,
norm_w1,
norm_b1,
norm_eps1,
norm_w2,
norm_b2,
norm_eps2,
):
self.num_heads = num_heads
self.mlp = mlp
self.hidden_dim: int = hidden_dim
self.qkv_w = qkv_w
self.qkv_b = qkv_b
self.out_w = out_w
self.out_b = out_b
self.norm_w1 = norm_w1
self.norm_b1 = norm_b1
self.norm_eps1 = norm_eps1
self.norm_w2 = norm_w2
self.norm_b2 = norm_b2
self.norm_eps2 = norm_eps2
self.false = torch.tensor(False, dtype=torch.bool)
@torch.jit.ignore
def to_mask(
self,
x: torch.Tensor,
padding_mask: Optional[torch.Tensor],
):
if padding_mask is None:
return x
if padding_mask.dtype == torch.bool:
return x.masked_fill(padding_mask, 0)
else:
return x * padding_mask
def process_prompt(
self,
x: torch.Tensor,
attn_mask: torch.Tensor,
padding_mask: Optional[torch.Tensor] = None,
torch_sdpa: bool = True,
):
q, k, v = F.linear(self.to_mask(x, padding_mask), self.qkv_w, self.qkv_b).chunk(3, dim=-1)
batch_size = q.shape[0]
q_len = q.shape[1]
kv_len = k.shape[1]
q = self.to_mask(q, padding_mask)
k_cache = self.to_mask(k, padding_mask)
v_cache = self.to_mask(v, padding_mask)
q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2)
k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
if torch_sdpa:
attn = F.scaled_dot_product_attention(q, k, v, ~attn_mask)
else:
attn = scaled_dot_product_attention(q, k, v, attn_mask)
attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1)
attn = F.linear(self.to_mask(attn, padding_mask), self.out_w, self.out_b)
x = x + attn
x = F.layer_norm(x, [self.hidden_dim], self.norm_w1, self.norm_b1, self.norm_eps1)
x = x + self.mlp.forward(x)
x = F.layer_norm(
x,
[self.hidden_dim],
self.norm_w2,
self.norm_b2,
self.norm_eps2,
)
return x, k_cache, v_cache
def decode_next_token(
self,
x: torch.Tensor,
k_cache: torch.Tensor,
v_cache: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
torch_sdpa: bool = True,
):
q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1)
k_cache = torch.cat([k_cache, k], dim=1)
v_cache = torch.cat([v_cache, v], dim=1)
batch_size = q.shape[0]
q_len = q.shape[1]
kv_len = k_cache.shape[1]
q = q.view(batch_size, q_len, self.num_heads, -1).transpose(1, 2)
k = k_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
v = v_cache.view(batch_size, kv_len, self.num_heads, -1).transpose(1, 2)
if torch_sdpa:
attn = F.scaled_dot_product_attention(q, k, v, (~attn_mask) if attn_mask is not None else None)
else:
attn = scaled_dot_product_attention(q, k, v, attn_mask)
attn = attn.transpose(1, 2).reshape(batch_size, q_len, -1)
attn = F.linear(attn, self.out_w, self.out_b)
x = x + attn
x = F.layer_norm(
x,
[self.hidden_dim],
self.norm_w1,
self.norm_b1,
self.norm_eps1,
)
x = x + self.mlp.forward(x)
x = F.layer_norm(
x,
[self.hidden_dim],
self.norm_w2,
self.norm_b2,
self.norm_eps2,
)
return x, k_cache, v_cache
@torch.jit.script
class T2STransformer:
def __init__(self, num_blocks: int, blocks: List[T2SBlock]):
self.num_blocks: int = num_blocks
self.blocks = blocks
def process_prompt(
self,
x: torch.Tensor,
attn_mask: torch.Tensor,
padding_mask: Optional[torch.Tensor] = None,
torch_sdpa: bool = True,
):
k_cache: List[torch.Tensor] = []
v_cache: List[torch.Tensor] = []
for i in range(self.num_blocks):
x, k_cache_, v_cache_ = self.blocks[i].process_prompt(x, attn_mask, padding_mask, torch_sdpa)
k_cache.append(k_cache_)
v_cache.append(v_cache_)
return x, k_cache, v_cache
def decode_next_token(
self,
x: torch.Tensor,
k_cache: List[torch.Tensor],
v_cache: List[torch.Tensor],
        attn_mask: Optional[torch.Tensor] = None,
torch_sdpa: bool = True,
):
for i in range(self.num_blocks):
x, k_cache[i], v_cache[i] = self.blocks[i].decode_next_token(
x, k_cache[i], v_cache[i], attn_mask, torch_sdpa
)
return x, k_cache, v_cache
class Text2SemanticDecoder(nn.Module):
def __init__(self, config, norm_first=False, top_k=3):
super(Text2SemanticDecoder, self).__init__()
self.model_dim = config["model"]["hidden_dim"]
self.embedding_dim = config["model"]["embedding_dim"]
self.num_head = config["model"]["head"]
self.num_layers = config["model"]["n_layer"]
self.norm_first = norm_first
self.vocab_size = config["model"]["vocab_size"]
self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
self.p_dropout = config["model"]["dropout"]
self.EOS = config["model"]["EOS"]
self.norm_first = norm_first
assert self.EOS == self.vocab_size - 1
# should be same as num of kmeans bin
# assert self.EOS == 1024
self.bert_proj = nn.Linear(1024, self.embedding_dim)
self.ar_text_embedding = TokenEmbedding(
self.embedding_dim,
self.phoneme_vocab_size,
self.p_dropout,
)
self.ar_text_position = SinePositionalEmbedding(
self.embedding_dim,
dropout=0.1,
scale=False,
alpha=True,
)
self.ar_audio_embedding = TokenEmbedding(
self.embedding_dim,
self.vocab_size,
self.p_dropout,
)
self.ar_audio_position = SinePositionalEmbedding(
self.embedding_dim,
dropout=0.1,
scale=False,
alpha=True,
)
self.h = TransformerEncoder(
TransformerEncoderLayer(
d_model=self.model_dim,
nhead=self.num_head,
dim_feedforward=self.model_dim * 4,
dropout=0.1,
batch_first=True,
norm_first=norm_first,
),
num_layers=self.num_layers,
norm=LayerNorm(self.model_dim) if norm_first else None,
)
self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False)
self.loss_fct = nn.CrossEntropyLoss(reduction="sum")
self.ar_accuracy_metric = MulticlassAccuracy(
self.vocab_size,
top_k=top_k,
average="micro",
multidim_average="global",
ignore_index=self.EOS,
)
blocks = []
for i in range(self.num_layers):
layer = self.h.layers[i]
t2smlp = T2SMLP(
layer.linear1.weight,
layer.linear1.bias,
layer.linear2.weight,
layer.linear2.bias,
)
block = T2SBlock(
self.num_head,
self.model_dim,
t2smlp,
layer.self_attn.in_proj_weight,
layer.self_attn.in_proj_bias,
layer.self_attn.out_proj.weight,
layer.self_attn.out_proj.bias,
layer.norm1.weight,
layer.norm1.bias,
layer.norm1.eps,
layer.norm2.weight,
layer.norm2.bias,
layer.norm2.eps,
)
blocks.append(block)
self.t2s_transformer = T2STransformer(self.num_layers, blocks)
def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
x_mask = make_pad_mask_left(x_lens)
y_mask = make_pad_mask(y_lens)
y_mask_int = y_mask.type(torch.int64)
codes = y.type(torch.int64) * (1 - y_mask_int)
# Training
# AR Decoder
y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS)
x_len = x_lens.max()
y_len = y_lens.max()
y_emb = self.ar_audio_embedding(y)
y_pos = self.ar_audio_position(y_emb)
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
ar_xy_padding_mask = xy_padding_mask
x_attn_mask = F.pad(
torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
(0, y_len),
value=True,
)
# x_attn_mask[:, x_len]=False
y_attn_mask = F.pad(
torch.triu(
torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
diagonal=1,
),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
bsz, src_len = x.shape[0], x_len + y_len
_xy_padding_mask = (
ar_xy_padding_mask.view(bsz, 1, 1, src_len)
.expand(-1, self.num_head, -1, -1)
.reshape(bsz * self.num_head, 1, src_len)
)
xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
xy_attn_mask = new_attn_mask
        # feed x together with the complete y into the model in a single pass
xy_pos = torch.concat([x, y_pos], dim=1)
return xy_pos, xy_attn_mask, targets
def forward(self, x, x_lens, y, y_lens, bert_feature):
"""
x: phoneme_ids
y: semantic_ids
"""
reject_y, reject_y_lens = make_reject_y(y, y_lens)
xy_pos, xy_attn_mask, targets = self.make_input_data(x, x_lens, y, y_lens, bert_feature)
xy_dec, _ = self.h(
(xy_pos, None),
mask=xy_attn_mask,
)
x_len = x_lens.max()
logits = self.ar_predict_layer(xy_dec[:, x_len-1:])
###### DPO #############
reject_xy_pos, reject_xy_attn_mask, reject_targets = self.make_input_data(
x, x_lens, reject_y, reject_y_lens, bert_feature
)
reject_xy_dec, _ = self.h(
(reject_xy_pos, None),
mask=reject_xy_attn_mask,
)
x_len = x_lens.max()
reject_logits = self.ar_predict_layer(reject_xy_dec[:, x_len-1:])
# loss
        # from feiteng: longer sequences should produce proportionally larger gradient updates, hence reduction="sum"
loss_1 = F.cross_entropy(logits.permute(0, 2, 1), targets, reduction="sum")
acc = self.ar_accuracy_metric(logits.permute(0, 2, 1).detach(), targets).item()
A_logits, R_logits = get_batch_logps(logits, reject_logits, targets, reject_targets)
loss_2, _, _ = dpo_loss(A_logits, R_logits, 0, 0, 0.2, reference_free=True)
loss = loss_1 + loss_2
return loss, acc
def forward_old(self, x, x_lens, y, y_lens, bert_feature):
"""
x: phoneme_ids
y: semantic_ids
"""
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
x_mask = make_pad_mask_left(x_lens)
y_mask = make_pad_mask(y_lens)
y_mask_int = y_mask.type(torch.int64)
codes = y.type(torch.int64) * (1 - y_mask_int)
# Training
# AR Decoder
y, targets = self.pad_y_eos(codes, y_mask_int, eos_id=self.EOS)
x_len = x_lens.max()
y_len = y_lens.max()
y_emb = self.ar_audio_embedding(y)
y_pos = self.ar_audio_position(y_emb)
xy_padding_mask = torch.concat([x_mask, y_mask], dim=1)
ar_xy_padding_mask = xy_padding_mask
x_attn_mask = F.pad(
torch.zeros((x_len, x_len), dtype=torch.bool, device=x.device),
(0, y_len),
value=True,
)
y_attn_mask = F.pad(
torch.triu(
torch.ones(y_len, y_len, dtype=torch.bool, device=x.device),
diagonal=1,
),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask, y_attn_mask], dim=0)
bsz, src_len = x.shape[0], x_len + y_len
_xy_padding_mask = (
ar_xy_padding_mask.view(bsz, 1, 1, src_len)
.expand(-1, self.num_head, -1, -1)
.reshape(bsz * self.num_head, 1, src_len)
)
xy_attn_mask = xy_attn_mask.logical_or(_xy_padding_mask)
new_attn_mask = torch.zeros_like(xy_attn_mask, dtype=x.dtype)
new_attn_mask.masked_fill_(xy_attn_mask, float("-inf"))
xy_attn_mask = new_attn_mask
        # feed x together with the complete y into the model in a single pass
xy_pos = torch.concat([x, y_pos], dim=1)
xy_dec, _ = self.h(
(xy_pos, None),
mask=xy_attn_mask,
)
logits = self.ar_predict_layer(xy_dec[:, x_len-1:]).permute(0, 2, 1)
# loss
        # from feiteng: longer sequences should produce proportionally larger gradient updates, hence reduction="sum"
loss = F.cross_entropy(logits, targets, reduction="sum")
acc = self.ar_accuracy_metric(logits.detach(), targets).item()
return loss, acc
    # TODO: figure out how this function differs from forward, and what to pass as prompts when there is no semantic prompt
def infer(
self,
x,
x_lens,
prompts,
bert_feature,
top_k: int = -100,
early_stop_num: int = -1,
temperature: float = 1.0,
):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
# AR Decoder
y = prompts
prefix_len = y.shape[1]
x_len = x.shape[1]
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
stop = False
for _ in tqdm(range(1500)):
y_emb = self.ar_audio_embedding(y)
y_pos = self.ar_audio_position(y_emb)
            # feed x together with the progressively growing y into the model
xy_pos = torch.concat([x, y_pos], dim=1)
y_len = y.shape[1]
x_attn_mask_pad = F.pad(
x_attn_mask,
(0, y_len),
value=True,
)
y_attn_mask = F.pad(
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0).to(y.device)
xy_dec, _ = self.h(
(xy_pos, None),
mask=xy_attn_mask,
)
logits = self.ar_predict_layer(xy_dec[:, -1])
samples = topk_sampling(logits, top_k=top_k, top_p=1.0, temperature=temperature)
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
print("use early stop num:", early_stop_num)
stop = True
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
# print(torch.argmax(logits, dim=-1)[0] == self.EOS, samples[0, 0] == self.EOS)
stop = True
if stop:
if prompts.shape[1] == y.shape[1]:
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
print("bad zero prediction")
print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
break
            # the newly generated semantic_ids are appended to the previous y to form the new y
            # print(samples.shape)  # [1, 1]; the first 1 is the batch size
            y = torch.concat([y, samples], dim=1)
return y
def pad_y_eos(self, y, y_mask_int, eos_id):
targets = F.pad(y, (0, 1), value=0) + eos_id * F.pad(y_mask_int, (0, 1), value=1)
        # shift by one position: each input token predicts the next target token
return targets[:, :-1], targets
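    # --- Editor's worked example (hedged; the helper below is not part of the original class) ---
    # pad_y_eos replaces padded y positions with EOS and appends one extra EOS,
    # yielding a decoder input one step behind the training targets. With EOS = 1024:
    @staticmethod
    def _demo_pad_y_eos():
        eos = 1024
        codes = torch.tensor([[5, 7, 0]])       # last position is padding, already zeroed
        y_mask_int = torch.tensor([[0, 0, 1]])  # 1 marks padded positions
        targets = F.pad(codes, (0, 1), value=0) + eos * F.pad(y_mask_int, (0, 1), value=1)
        assert targets.tolist() == [[5, 7, 1024, 1024]]
        y_in = targets[:, :-1]  # decoder input: padding replaced by EOS
        assert y_in.tolist() == [[5, 7, 1024]]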
def infer_panel_batch_infer(
self,
        x: List[torch.LongTensor],  ##### all text tokens
        x_lens: torch.LongTensor,
        prompts: torch.LongTensor,  #### reference-audio tokens
        bert_feature: List[torch.LongTensor],
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
repetition_penalty: float = 1.35,
**kwargs,
):
        if prompts is None:
            print("Warning: prompt-free mode is not supported by batch_infer; falling back to naive_infer")
return self.infer_panel_naive_batched(
x,
x_lens,
prompts,
bert_feature,
top_k=top_k,
top_p=top_p,
early_stop_num=early_stop_num,
temperature=temperature,
**kwargs,
)
max_len = kwargs.get("max_len", x_lens.max())
x_list = []
for x_item, bert_item in zip(x, bert_feature):
# max_len = max(max_len, x_item.shape[0], bert_item.shape[1])
x_item = self.ar_text_embedding(x_item.unsqueeze(0))
x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0))
x_item = self.ar_text_position(x_item).squeeze(0)
# x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0]<max_len else x_item ### padding right
x_item = (
F.pad(x_item, (0, 0, max_len - x_item.shape[0], 0), value=0) if x_item.shape[0] < max_len else x_item
) ### padding left
x_list.append(x_item)
x: torch.Tensor = torch.stack(x_list, dim=0)
# AR Decoder
y = prompts
x_len = x.shape[1]
stop = False
k_cache = None
v_cache = None
################### first step ##########################
        assert y is not None, "Error: prompt-free mode is not supported by batch_infer!"
ref_free = False
y_emb = self.ar_audio_embedding(y)
y_len = y_emb.shape[1]
prefix_len = y.shape[1]
y_lens = torch.LongTensor([y_emb.shape[1]] * y_emb.shape[0]).to(x.device)
y_pos = self.ar_audio_position(y_emb)
xy_pos = torch.concat([x, y_pos], dim=1)
##### create mask #####
bsz = x.shape[0]
src_len = x_len + y_len
        y_padding_mask = make_pad_mask_left(y_lens, y_len)
        x_padding_mask = make_pad_mask_left(x_lens, max_len)
        # (bsz, x_len + y_len)
        padding_mask = torch.concat([x_padding_mask, y_padding_mask], dim=1)
x_mask = F.pad(
torch.zeros(x_len, x_len, dtype=torch.bool, device=x.device),
(0, y_len),
value=True,
)
        y_mask = F.pad(  ### the y-y block's upper-triangular 1s are extended leftward over the x-y block with 0s; shape (y, x+y)
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), diagonal=1),
(x_len, 0),
value=False,
)
causal_mask = torch.concat([x_mask, y_mask], dim=0).view(1, src_len, src_len).repeat(bsz, 1, 1).to(x.device)
        # padding_mask = padding_mask.unsqueeze(1) * padding_mask.unsqueeze(2)  ### [b, x+y, x+y]
        ### The line above is wrong: it would let padded tokens be "seen" by attention.
        # The correct padding_mask looks like this:
        # |   pad_len   | x_len | y_len |
        # [[PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],  # in principle the first 3 rows should be masked too, but they are kept so attention does not produce NaNs; this does not change the result
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6]]
padding_mask = padding_mask.view(bsz, 1, src_len).repeat(1, src_len, 1)
attn_mask: torch.Tensor = causal_mask.logical_or(padding_mask)
attn_mask = attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1).bool()
        # The correct attn_mask looks like this:
        # |   pad_len   | x_len | y_len |
        # [[PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],  # in principle the first 3 rows should be masked too, but they are kept so attention does not produce NaNs; this does not change the result
        # [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, 4, EOS, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, EOS],
        # [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6]]
###### decode #####
y_list = [None] * y.shape[0]
batch_idx_map = list(range(y.shape[0]))
idx_list = [None] * y.shape[0]
for idx in tqdm(range(1500)):
if idx == 0:
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, attn_mask, None)
else:
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache, attn_mask)
logits = self.ar_predict_layer(xy_dec[:, -1])
if idx == 0:
attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
else:
attn_mask = F.pad(attn_mask, (0, 1), value=False)
            if idx < 11:  ### require at least 10 predicted tokens before allowing a stop (~0.4 s)
logits = logits[:, :-1]
samples = sample(
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
)[0]
y = torch.concat([y, samples], dim=1)
            ####### drop sequences that have finished generating from the batch, to further cut computation
            tokens = torch.argmax(logits, dim=-1)
            reserved_idx_of_batch_for_y = None
            if (self.EOS in samples[:, 0]) or (self.EOS in tokens):  ### stop a sequence once EOS is generated
                l1 = samples[:, 0] == self.EOS
                l2 = tokens == self.EOS
                l = l1.logical_or(l2)
                removed_idx_of_batch_for_y = torch.where(l)[0].tolist()
                reserved_idx_of_batch_for_y = torch.where(~l)[0]
# batch_indexs = torch.tensor(batch_idx_map, device=y.device)[removed_idx_of_batch_for_y]
for i in removed_idx_of_batch_for_y:
batch_index = batch_idx_map[i]
idx_list[batch_index] = idx
y_list[batch_index] = y[i, :-1]
batch_idx_map = [batch_idx_map[i] for i in reserved_idx_of_batch_for_y.tolist()]
            # keep only the sequences in the batch that are still generating
if reserved_idx_of_batch_for_y is not None:
# index = torch.LongTensor(batch_idx_map).to(y.device)
y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
attn_mask = torch.index_select(attn_mask, dim=0, index=reserved_idx_of_batch_for_y)
if k_cache is not None:
for i in range(len(k_cache)):
k_cache[i] = torch.index_select(k_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
v_cache[i] = torch.index_select(v_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
if (early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num) or idx == 1499:
print("use early stop num:", early_stop_num)
stop = True
                for i, batch_index in enumerate(batch_idx_map):
                    idx_list[batch_index] = idx
                    y_list[batch_index] = y[i, :-1]
if None not in idx_list:
stop = True
if stop:
if y.shape[1] == 0:
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
print("bad zero prediction")
print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
break
####################### update next step ###################################
y_emb = self.ar_audio_embedding(y[:, -1:])
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[
:, y_len + idx
].to(dtype=y_emb.dtype, device=y_emb.device)
if None in idx_list:
for i in range(x.shape[0]):
if idx_list[i] is None:
                    idx_list[i] = 1500 - 1  ### if EOS was never reached, fall back to the maximum length
if ref_free:
return y_list, [0] * x.shape[0]
# print(idx_list)
return y_list, idx_list
def infer_panel_naive_batched(
self,
        x: List[torch.LongTensor],  ##### all text tokens
        x_lens: torch.LongTensor,
        prompts: torch.LongTensor,  #### reference-audio tokens
        bert_feature: List[torch.LongTensor],
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
repetition_penalty: float = 1.35,
**kwargs,
):
y_list = []
idx_list = []
for i in range(len(x)):
y, idx = next(self.infer_panel_naive(
x[i].unsqueeze(0),
x_lens[i],
prompts[i].unsqueeze(0) if prompts is not None else None,
bert_feature[i].unsqueeze(0),
top_k,
top_p,
early_stop_num,
temperature,
repetition_penalty,
**kwargs,
))
y_list.append(y[0])
idx_list.append(idx)
return y_list, idx_list
def infer_panel_naive(
self,
        x: torch.LongTensor,  ##### all text tokens
        x_lens: torch.LongTensor,
        prompts: torch.LongTensor,  #### reference-audio tokens
        bert_feature: torch.LongTensor,
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
repetition_penalty: float = 1.35,
streaming_mode: bool = False,
chunk_length: int = 24,
**kwargs,
):
        mute_emb_sim_matrix = kwargs.get("mute_emb_sim_matrix", None)
        # note: the kwarg key keeps its historical (misspelled) spelling for caller compatibility
        chunk_split_threshold = kwargs.get("chunk_split_thershold", 0.3)
check_token_num = 2
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
x = self.ar_text_position(x)
# AR Decoder
y = prompts
x_len = x.shape[1]
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
stop = False
# print(1111111,self.num_layers)
k_cache = None
v_cache = None
################### first step ##########################
if y is not None:
y_emb = self.ar_audio_embedding(y)
y_len = y_emb.shape[1]
prefix_len = y.shape[1]
y_pos = self.ar_audio_position(y_emb)
xy_pos = torch.concat([x, y_pos], dim=1)
ref_free = False
else:
y_emb = None
y_len = 0
prefix_len = 0
y_pos = None
xy_pos = x
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
ref_free = True
bsz = x.shape[0]
src_len = x_len + y_len
x_attn_mask_pad = F.pad(
x_attn_mask,
            (0, y_len),  ### the all-0 x-x block is extended to x-x 0s plus x-y 1s; shape (x, x+y)
value=True,
)
        y_attn_mask = F.pad(  ### the y-y block's upper-triangular 1s are extended leftward over the x-y block with 0s; shape (y, x+y)
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
(x_len, 0),
value=False,
)
xy_attn_mask = (
torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
.unsqueeze(0)
.expand(bsz * self.num_head, -1, -1)
.view(bsz, self.num_head, src_len, src_len)
.to(device=x.device, dtype=torch.bool)
)
token_counter = 0
curr_ptr = prefix_len
for idx in tqdm(range(1500)):
            token_counter += 1
if xy_attn_mask is not None:
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None)
else:
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache)
logits = self.ar_predict_layer(xy_dec[:, -1])
if idx == 0:
xy_attn_mask = None
            if idx < 11:  ### require at least 10 predicted tokens before allowing a stop (~0.4 s)
logits = logits[:, :-1]
samples = sample(
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
)[0]
y = torch.concat([y, samples], dim=1)
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
print("use early stop num:", early_stop_num)
stop = True
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
stop = True
                y = y[:, :-1]  # drop the EOS token that was just appended
                token_counter -= 1
if idx == 1499:
stop = True
if stop:
if y.shape[1] == 0:
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
print("bad zero prediction")
# print(f"T2S Decoding EOS [{prefix_len} -> {y.shape[1]}]")
                if streaming_mode:
                    yield (y[:, curr_ptr:] if curr_ptr < y.shape[1] else None), True
                break
            if streaming_mode and (mute_emb_sim_matrix is not None) and (token_counter >= chunk_length + check_token_num):
                score = mute_emb_sim_matrix[y[0, curr_ptr:]] - chunk_split_threshold
                score[score < 0] = -1
                score[:-1] = score[:-1] + score[1:]  ## consider two consecutive tokens
                argmax_idx = score.argmax()
                if score[argmax_idx] >= 0 and argmax_idx + 1 >= chunk_length:
                    print(f"\n\ncurr_ptr:{curr_ptr}")
                    yield y[:, curr_ptr:], False
                    token_counter -= argmax_idx + 1
                    curr_ptr += argmax_idx + 1
            elif streaming_mode and (mute_emb_sim_matrix is None) and (token_counter >= chunk_length):
                yield y[:, -token_counter:], False
                curr_ptr += token_counter
                token_counter = 0
####################### update next step ###################################
y_emb = self.ar_audio_embedding(y[:, -1:])
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[
:, y_len + idx
].to(dtype=y_emb.dtype, device=y_emb.device)
if not streaming_mode:
if ref_free:
yield y, 0
yield y, idx
def infer_panel(
self,
        x: torch.LongTensor,  ##### all text tokens
        x_lens: torch.LongTensor,
        prompts: torch.LongTensor,  #### reference-audio tokens
        bert_feature: torch.LongTensor,
top_k: int = -100,
top_p: int = 100,
early_stop_num: int = -1,
temperature: float = 1.0,
repetition_penalty: float = 1.35,
**kwargs,
):
return next(self.infer_panel_naive(
x, x_lens, prompts, bert_feature, top_k, top_p, early_stop_num, temperature, repetition_penalty, **kwargs
))
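# --- Editor's illustration (hedged sketch, not part of the original file) ---
# The streaming split heuristic in infer_panel_naive scores each generated token
# by its similarity to "mute" embeddings minus a threshold, boosts positions
# followed by another quiet token, and cuts the chunk at the best-scoring spot.
# All values below are made up for illustration:
def _demo_chunk_split_score():
    sim = torch.tensor([0.9, 0.1, 0.8])  # hypothetical per-token "muteness" similarity
    tokens = torch.tensor([0, 1, 2, 2, 0])  # tokens generated since the last cut
    score = sim[tokens] - 0.3  # 0.3 plays the role of chunk_split_threshold
    score[score < 0] = -1
    score[:-1] = score[:-1] + score[1:]  # favour two consecutive quiet tokens
    cut = int(score.argmax()) + 1  # number of tokens to emit as the next chunk
    assert cut == 4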
================================================
FILE: GPT_SoVITS/AR/models/t2s_model_onnx.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py
# reference: https://github.com/lifeiteng/vall-e
import torch
from torch import nn
from torch.nn import functional as F
from torchmetrics.classification import MulticlassAccuracy
from AR.modules.embedding_onnx import SinePositionalEmbedding, TokenEmbedding
from AR.modules.transformer_onnx import LayerNorm, TransformerEncoder, TransformerEncoderLayer
default_config = {
"embedding_dim": 512,
"hidden_dim": 512,
"num_head": 8,
"num_layers": 12,
"num_codebook": 8,
"p_dropout": 0.0,
"vocab_size": 1024 + 1,
"phoneme_vocab_size": 512,
"EOS": 1024,
}
inf_tensor_value = torch.FloatTensor([-float("Inf")]).float()
def logits_to_probs(
logits,
previous_tokens=None,
temperature: float = 1.0,
top_k=None,
top_p=None,
repetition_penalty: float = 1.0,
):
    if previous_tokens is not None:
        previous_tokens = previous_tokens.squeeze()
    if previous_tokens is not None and repetition_penalty != 1.0:
previous_tokens = previous_tokens.long()
score = torch.gather(logits, dim=0, index=previous_tokens)
score = torch.where(
score < 0,
score * repetition_penalty,
score / repetition_penalty,
)
logits.scatter_(dim=0, index=previous_tokens, src=score)
if top_p is not None and top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cum_probs = torch.cumsum(
torch.nn.functional.softmax(
sorted_logits,
dim=-1,
),
dim=-1,
)
sorted_indices_to_remove = cum_probs > top_p
sorted_indices_to_remove[0] = False # keep at least one option
indices_to_remove = sorted_indices_to_remove.scatter(
dim=0,
index=sorted_indices,
src=sorted_indices_to_remove,
)
logits = logits.masked_fill(indices_to_remove, -float("Inf"))
logits = logits / max(temperature, 1e-5)
if top_k is not None:
v, _ = torch.topk(logits, top_k)
pivot = v.select(-1, -1).unsqueeze(-1)
logits = torch.where(logits < pivot, inf_tensor_value, logits)
probs = torch.nn.functional.softmax(logits, dim=-1)
return probs
def multinomial_sample_one_no_sync(
probs_sort,
): # Does multinomial sampling without a cuda synchronization
    # use exponential noise (Gumbel-max trick) so argmax(probs / q) is a true multinomial
    # sample, matching AR/models/utils.py; -log(U) keeps the op set ONNX-exportable
    q = -torch.log(torch.rand_like(probs_sort))
return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
def sample(
logits,
previous_tokens,
**sampling_kwargs,
):
probs = logits_to_probs(
logits=logits,
previous_tokens=previous_tokens,
**sampling_kwargs,
)
idx_next = multinomial_sample_one_no_sync(probs)
return idx_next, probs
class OnnxEncoder(nn.Module):
def __init__(self, ar_text_embedding, bert_proj, ar_text_position):
super().__init__()
self.ar_text_embedding = ar_text_embedding
self.bert_proj = bert_proj
self.ar_text_position = ar_text_position
def forward(self, x, bert_feature):
x = self.ar_text_embedding(x)
x = x + self.bert_proj(bert_feature.transpose(1, 2))
return self.ar_text_position(x)
class T2SFirstStageDecoder(nn.Module):
def __init__(
self,
ar_audio_embedding,
ar_audio_position,
h,
ar_predict_layer,
loss_fct,
ar_accuracy_metric,
top_k,
early_stop_num,
num_layers,
):
super().__init__()
self.ar_audio_embedding = ar_audio_embedding
self.ar_audio_position = ar_audio_position
self.h = h
self.ar_predict_layer = ar_predict_layer
self.loss_fct = loss_fct
self.ar_accuracy_metric = ar_accuracy_metric
self.top_k = top_k
self.early_stop_num = early_stop_num
self.num_layers = num_layers
def forward(self, x, prompt):
y = prompt
x_example = x[:, :, 0] * 0.0
# N, 1, 512
cache = {
"all_stage": self.num_layers,
"k": None,
"v": None,
"y_emb": None,
"first_infer": 1,
"stage": 0,
}
y_emb = self.ar_audio_embedding(y)
cache["y_emb"] = y_emb
y_pos = self.ar_audio_position(y_emb)
xy_pos = torch.concat([x, y_pos], dim=1)
y_example = y_pos[:, :, 0] * 0.0
x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example).bool()
y_attn_mask = torch.ones_like(torch.matmul(y_example.transpose(0, 1), y_example), dtype=torch.int64)
y_attn_mask = torch.cumsum(y_attn_mask, dim=1) - torch.cumsum(
torch.ones_like(
y_example.transpose(0, 1),
dtype=torch.int64,
),
dim=0,
)
y_attn_mask = y_attn_mask > 0
x_y_pad = torch.matmul(x_example.transpose(0, 1), y_example).bool()
y_x_pad = torch.matmul(y_example.transpose(0, 1), x_example).bool()
x_attn_mask_pad = torch.cat([x_attn_mask, torch.ones_like(x_y_pad)], dim=1)
y_attn_mask = torch.cat([y_x_pad, y_attn_mask], dim=1)
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
cache["k"] = (
torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))
.unsqueeze(1)
.repeat(self.num_layers, 1, 1, 1)
)
cache["v"] = (
torch.matmul(x_attn_mask_pad[0].float().unsqueeze(-1), torch.zeros((1, 512)))
.unsqueeze(1)
.repeat(self.num_layers, 1, 1, 1)
)
xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
logits = self.ar_predict_layer(xy_dec[:, -1])
samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
y = torch.concat([y, samples], dim=1)
return y, cache["k"], cache["v"], cache["y_emb"], x_example
class T2SStageDecoder(nn.Module):
def __init__(
self,
ar_audio_embedding,
ar_audio_position,
h,
ar_predict_layer,
loss_fct,
ar_accuracy_metric,
top_k,
early_stop_num,
num_layers,
):
super().__init__()
self.ar_audio_embedding = ar_audio_embedding
self.ar_audio_position = ar_audio_position
self.h = h
self.ar_predict_layer = ar_predict_layer
self.loss_fct = loss_fct
self.ar_accuracy_metric = ar_accuracy_metric
self.top_k = top_k
self.early_stop_num = early_stop_num
self.num_layers = num_layers
def forward(self, y, k, v, y_emb, x_example):
cache = {
"all_stage": self.num_layers,
"k": torch.nn.functional.pad(k, (0, 0, 0, 0, 0, 1)),
"v": torch.nn.functional.pad(v, (0, 0, 0, 0, 0, 1)),
"y_emb": y_emb,
"first_infer": 0,
"stage": 0,
}
y_emb = torch.cat(
[
cache["y_emb"],
self.ar_audio_embedding(y[:, -1:]),
],
1,
)
cache["y_emb"] = y_emb
y_pos = self.ar_audio_position(y_emb)
xy_pos = y_pos[:, -1:]
y_example = y_pos[:, :, 0] * 0.0
xy_attn_mask = torch.cat([x_example, y_example], dim=1)
xy_attn_mask = torch.zeros_like(xy_attn_mask, dtype=torch.bool)
xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
logits = self.ar_predict_layer(xy_dec[:, -1])
samples = sample(logits[0], y, top_k=self.top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
y = torch.concat([y, samples], dim=1)
return y, cache["k"], cache["v"], cache["y_emb"], logits, samples
class Text2SemanticDecoder(nn.Module):
def __init__(self, config, norm_first=False, top_k=3):
super(Text2SemanticDecoder, self).__init__()
self.model_dim = config["model"]["hidden_dim"]
self.embedding_dim = config["model"]["embedding_dim"]
self.num_head = config["model"]["head"]
self.num_layers = config["model"]["n_layer"]
self.norm_first = norm_first
self.vocab_size = config["model"]["vocab_size"]
self.phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
self.p_dropout = float(config["model"]["dropout"])
self.EOS = config["model"]["EOS"]
self.norm_first = norm_first
assert self.EOS == self.vocab_size - 1
self.bert_proj = nn.Linear(1024, self.embedding_dim)
self.ar_text_embedding = TokenEmbedding(self.embedding_dim, self.phoneme_vocab_size, self.p_dropout)
self.ar_text_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
self.ar_audio_embedding = TokenEmbedding(self.embedding_dim, self.vocab_size, self.p_dropout)
self.ar_audio_position = SinePositionalEmbedding(self.embedding_dim, dropout=0.1, scale=False, alpha=True)
self.h = TransformerEncoder(
TransformerEncoderLayer(
d_model=self.model_dim,
nhead=self.num_head,
dim_feedforward=self.model_dim * 4,
dropout=0.1,
batch_first=True,
norm_first=norm_first,
),
num_layers=self.num_layers,
norm=LayerNorm(self.model_dim) if norm_first else None,
)
self.ar_predict_layer = nn.Linear(self.model_dim, self.vocab_size, bias=False)
self.loss_fct = nn.CrossEntropyLoss(reduction="sum")
self.ar_accuracy_metric = MulticlassAccuracy(
self.vocab_size,
top_k=top_k,
average="micro",
multidim_average="global",
ignore_index=self.EOS,
)
self.top_k = torch.LongTensor([1])
self.early_stop_num = torch.LongTensor([-1])
def init_onnx(self):
self.onnx_encoder = OnnxEncoder(self.ar_text_embedding, self.bert_proj, self.ar_text_position)
self.first_stage_decoder = T2SFirstStageDecoder(
self.ar_audio_embedding,
self.ar_audio_position,
self.h,
self.ar_predict_layer,
self.loss_fct,
self.ar_accuracy_metric,
self.top_k,
self.early_stop_num,
self.num_layers,
)
self.stage_decoder = T2SStageDecoder(
self.ar_audio_embedding,
self.ar_audio_position,
self.h,
self.ar_predict_layer,
self.loss_fct,
self.ar_accuracy_metric,
self.top_k,
self.early_stop_num,
self.num_layers,
)
def forward(self, x, prompts, bert_feature):
early_stop_num = self.early_stop_num
prefix_len = prompts.shape[1]
x = self.onnx_encoder(x, bert_feature)
        y, k, v, y_emb, x_example = self.first_stage_decoder(x, prompts)
        stop = False
        for idx in range(1, 1500):
            enco = self.stage_decoder(y, k, v, y_emb, x_example)
            y, k, v, y_emb, logits, samples = enco
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
stop = True
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
stop = True
if stop:
break
y[0, -1] = 0
return y, idx
def infer(self, x, prompts, bert_feature):
top_k = self.top_k
early_stop_num = self.early_stop_num
x = self.onnx_encoder(x, bert_feature)
y = prompts
prefix_len = y.shape[1]
x_len = x.shape[1]
x_example = x[:, :, 0] * 0.0
x_attn_mask = torch.matmul(x_example.transpose(0, 1), x_example)
x_attn_mask = torch.zeros_like(x_attn_mask, dtype=torch.bool)
stop = False
cache = {
"all_stage": self.num_layers,
"k": [None] * self.num_layers,
"v": [None] * self.num_layers,
"y_emb": None,
"first_infer": 1,
"stage": 0,
}
for idx in range(1500):
if cache["first_infer"] == 1:
y_emb = self.ar_audio_embedding(y)
else:
y_emb = torch.cat([cache["y_emb"], self.ar_audio_embedding(y[:, -1:])], 1)
cache["y_emb"] = y_emb
y_pos = self.ar_audio_position(y_emb)
if cache["first_infer"] == 1:
xy_pos = torch.concat([x, y_pos], dim=1)
else:
xy_pos = y_pos[:, -1:]
y_len = y_pos.shape[1]
if cache["first_infer"] == 1:
x_attn_mask_pad = F.pad(x_attn_mask, (0, y_len), value=True)
y_attn_mask = F.pad(
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
(x_len, 0),
value=False,
)
xy_attn_mask = torch.concat([x_attn_mask_pad, y_attn_mask], dim=0)
else:
xy_attn_mask = torch.zeros((1, x_len + y_len), dtype=torch.bool)
xy_dec = self.h(xy_pos, mask=xy_attn_mask, cache=cache)
logits = self.ar_predict_layer(xy_dec[:, -1])
samples = sample(logits[0], y, top_k=top_k, top_p=1.0, repetition_penalty=1.35)[0].unsqueeze(0)
if early_stop_num != -1 and (y.shape[1] - prefix_len) > early_stop_num:
stop = True
if torch.argmax(logits, dim=-1)[0] == self.EOS or samples[0, 0] == self.EOS:
stop = True
if stop:
if prompts.shape[1] == y.shape[1]:
y = torch.concat([y, torch.zeros_like(samples)], dim=1)
break
y = torch.concat([y, samples], dim=1)
cache["first_infer"] = 0
return y, idx
================================================
FILE: GPT_SoVITS/AR/models/utils.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py
# reference: https://github.com/lifeiteng/vall-e
from typing import Optional, Tuple
import torch
import torch.nn.functional as F
def sequence_mask(length, max_length=None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
"""
Args:
lengths:
A 1-D tensor containing sentence lengths.
max_len:
The length of masks.
Returns:
Return a 2-D bool tensor, where masked positions
are filled with `True` and non-masked positions are
filled with `False`.
#>>> lengths = torch.tensor([1, 3, 2, 5])
#>>> make_pad_mask(lengths)
tensor([[False, True, True, True, True],
[False, False, False, True, True],
[False, False, True, True, True],
[False, False, False, False, False]])
"""
assert lengths.ndim == 1, lengths.ndim
max_len = max(max_len, lengths.max())
n = lengths.size(0)
seq_range = torch.arange(0, max_len, device=lengths.device)
    expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
    return expanded_lengths >= lengths.unsqueeze(-1)
def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
"""
Args:
lengths:
A 1-D tensor containing sentence lengths.
max_len:
The length of masks.
Returns:
Return a 2-D bool tensor, where masked positions
are filled with `True` and non-masked positions are
filled with `False`.
    #>>> lengths = torch.tensor([1, 3, 2, 5])
    #>>> make_pad_mask_left(lengths)
    tensor([[ True,  True,  True,  True, False],
            [ True,  True, False, False, False],
            [ True,  True,  True, False, False],
            [False, False, False, False, False]])
"""
assert lengths.ndim == 1, lengths.ndim
max_len = max(max_len, lengths.max())
n = lengths.size(0)
seq_range = torch.arange(0, max_len, device=lengths.device)
    expanded_lengths = seq_range.unsqueeze(0).repeat(n, 1)
    expanded_lengths -= (max_len - lengths).unsqueeze(-1)
    return expanded_lengths < 0
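# --- Editor's illustration (hedged sketch, not part of the original file) ---
# Side-by-side behavior of the two mask helpers above: True always marks a
# padded position, but the padding sits at the tail vs. the head.
def _demo_pad_masks():
    lengths = torch.tensor([1, 3])
    assert make_pad_mask(lengths).tolist() == [
        [False, True, True],   # right padding: content first, pad after
        [False, False, False],
    ]
    assert make_pad_mask_left(lengths).tolist() == [
        [True, True, False],   # left padding: pad first, content after (used for batched prompts)
        [False, False, False],
    ]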
# https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
def top_k_top_p_filtering(
logits,
top_k=0,
top_p=1.0,
filter_value=-float("Inf"),
min_tokens_to_keep=1,
):
"""Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
Make sure we keep at least min_tokens_to_keep per batch example in the output
From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
"""
if top_k > 0:
top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits[indices_to_remove] = filter_value
if top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
# Remove tokens with cumulative probability above the threshold (token with 0 are kept)
sorted_indices_to_remove = cumulative_probs > top_p
if min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# scatter sorted tensors to original indexing
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
logits[indices_to_remove] = filter_value
return logits
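# --- Editor's illustration (hedged sketch, not part of the original file) ---
# With top_k=2, only the two largest logits survive; everything else is pushed
# to -inf so it can never be sampled:
def _demo_top_k_top_p_filtering():
    logits = torch.tensor([[3.0, 2.0, 1.0, 0.0]])
    out = top_k_top_p_filtering(logits.clone(), top_k=2)
    assert out[0, 0] == 3.0 and out[0, 1] == 2.0
    assert out[0, 2] == -float("Inf") and out[0, 3] == -float("Inf")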
def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
# temperature: (`optional`) float
# The value used to module the next token probabilities. Must be strictly positive. Default to 1.0.
# top_k: (`optional`) int
# The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
# top_p: (`optional`) float
# The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
# Temperature (higher temperature => more likely to sample low probability tokens)
if temperature != 1.0:
logits = logits / temperature
# Top-p/top-k filtering
logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
# Sample
token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
return token
def multinomial_sample_one_no_sync(
probs_sort,
): # Does multinomial sampling without a cuda synchronization
q = torch.empty_like(probs_sort).exponential_(1)
return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
def logits_to_probs(
logits,
previous_tokens: Optional[torch.Tensor] = None,
temperature: float = 1.0,
top_k: Optional[int] = None,
top_p: Optional[int] = None,
repetition_penalty: float = 1.0,
):
# if previous_tokens is not None:
# previous_tokens = previous_tokens.squeeze()
# print(logits.shape,previous_tokens.shape)
# pdb.set_trace()
if previous_tokens is not None and repetition_penalty != 1.0:
previous_tokens = previous_tokens.long()
score = torch.gather(logits, dim=1, index=previous_tokens)
score = torch.where(
score < 0,
score * repetition_penalty,
score / repetition_penalty,
)
logits.scatter_(dim=1, index=previous_tokens, src=score)
if top_p is not None and top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
sorted_indices_to_remove = cum_probs > top_p
sorted_indices_to_remove[:, 0] = False # keep at least one option
indices_to_remove = sorted_indices_to_remove.scatter(
dim=1,
index=sorted_indices,
src=sorted_indices_to_remove,
)
logits = logits.masked_fill(indices_to_remove, -float("Inf"))
logits = logits / max(temperature, 1e-5)
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
pivot = v[:, -1].unsqueeze(-1)
logits = torch.where(logits < pivot, -float("Inf"), logits)
probs = torch.nn.functional.softmax(logits, dim=-1)
return probs
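# --- Editor's illustration (hedged sketch, not part of the original file) ---
# The batched sampling path above first penalizes already-generated tokens,
# then applies top-k/top-p before the softmax. A tiny worked case:
def _demo_logits_to_probs():
    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
    prev = torch.tensor([[0]])  # token 0 was generated earlier
    probs = logits_to_probs(logits.clone(), prev, top_k=2, repetition_penalty=2.0)
    # token 0's logit is halved (2.0 -> 1.0), then only the top-2 logits survive
    assert torch.allclose(probs, torch.tensor([[0.5, 0.5, 0.0, 0.0]]))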
def sample(
logits,
previous_tokens: Optional[torch.Tensor] = None,
**sampling_kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
probs = logits_to_probs(logits=logits, previous_tokens=previous_tokens, **sampling_kwargs)
idx_next = multinomial_sample_one_no_sync(probs)
return idx_next, probs
def dpo_loss(
policy_chosen_logps: torch.FloatTensor,
policy_rejected_logps: torch.FloatTensor,
reference_chosen_logps: torch.FloatTensor,
reference_rejected_logps: torch.FloatTensor,
beta: float,
reference_free: bool = False,
) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
pi_logratios = policy_chosen_logps - policy_rejected_logps
ref_logratios = reference_chosen_logps - reference_rejected_logps
if reference_free:
ref_logratios = 0
logits = pi_logratios - ref_logratios
losses = -F.logsigmoid(beta * logits)
chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach()
rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach()
return losses.mean(), chosen_rewards, rejected_rewards
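# --- Editor's worked example (hedged sketch, not part of the original file) ---
# forward() calls dpo_loss(..., 0, 0, 0.2, reference_free=True), so the loss
# reduces to -logsigmoid(beta * (chosen_logps - rejected_logps)):
def _demo_dpo_loss():
    chosen = torch.tensor([-5.0])    # summed log-prob of the real continuation
    rejected = torch.tensor([-9.0])  # summed log-prob of the corrupted one
    loss, _, _ = dpo_loss(chosen, rejected, 0, 0, beta=0.2, reference_free=True)
    assert torch.allclose(loss, -F.logsigmoid(torch.tensor(0.2 * 4.0)))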
def get_batch_logps(
logits_target: torch.FloatTensor,
logits_reject: torch.FloatTensor,
labels_target: torch.LongTensor,
labels_reject: torch.LongTensor,
average_log_prob: bool = False,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
# dummy token; we'll ignore the losses on these tokens later
per_token_logps_target = torch.gather(
logits_target.log_softmax(-1), dim=2, index=labels_target.unsqueeze(2)
).squeeze(2)
per_token_logps_reject = torch.gather(
logits_reject.log_softmax(-1), dim=2, index=labels_reject.unsqueeze(2)
).squeeze(2)
return per_token_logps_target.sum(-1), per_token_logps_reject.sum(-1)
def make_reject_y(y_o, y_lens):
def repeat_P(y):
range_idx, _ = torch.randint(0, len(y), size=(2,)).sort()
pre = y[: range_idx[0]]
shf = y[range_idx[1] :]
range_text = y[range_idx[0] : range_idx[1]]
new_y = torch.cat([pre, range_text, range_text, shf])
return new_y
    def lost_P(y):
        range_idx, _ = torch.randint(0, len(y), size=(2,)).sort()
        pre = y[: range_idx[0]]
        shf = y[range_idx[1] :]
        new_y = torch.cat([pre, shf])  # drop the middle span entirely
        return new_y
bs = len(y_lens)
reject_y = []
reject_y_lens = []
    for b in range(bs):
        # NOTE: torch.randint(0, 1, ...) can only return 0, so only repeat_P is ever
        # applied here; widen the range to randint(0, 2, ...) to also sample lost_P
        process_item_idx = torch.randint(0, 1, size=(1,))[0]
if process_item_idx == 0:
new_y = repeat_P(y_o[b])
reject_y.append(new_y)
reject_y_lens.append(len(new_y))
elif process_item_idx == 1:
new_y = lost_P(y_o[b])
reject_y.append(new_y)
reject_y_lens.append(len(new_y))
max_length = max(reject_y_lens)
for b in range(bs):
pad_length = max_length - reject_y_lens[b]
reject_y[b] = torch.cat([reject_y[b], torch.zeros(pad_length, dtype=y_o.dtype, device=y_o.device)], dim=0)
reject_y = torch.stack(reject_y, dim=0)
reject_y_lens = torch.tensor(reject_y_lens, device=y_lens.device)
return reject_y, reject_y_lens
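# --- Editor's illustration (hedged sketch, not part of the original file) ---
# make_reject_y corrupts each sequence (currently always via repeat_P, see the
# NOTE above), so the rejected sample duplicates a random span and can only
# grow; the assertions hold for any random seed:
def _demo_make_reject_y():
    y = torch.tensor([[3, 4, 5, 6]])
    y_lens = torch.tensor([4])
    reject_y, reject_y_lens = make_reject_y(y, y_lens)
    assert reject_y_lens[0] >= y_lens[0]
    assert reject_y.shape[1] == reject_y_lens[0]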
================================================
FILE: GPT_SoVITS/AR/modules/__init__.py
================================================
================================================
FILE: GPT_SoVITS/AR/modules/activation.py
================================================
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
from typing import Optional, Tuple
import torch
from torch import Tensor
from torch.nn import Linear, Module
from torch.nn import functional as F
from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
from torch.nn.parameter import Parameter
from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched
F.multi_head_attention_forward = multi_head_attention_forward_patched
class MultiheadAttention(Module):
r"""Allows the model to jointly attend to information
from different representation subspaces as described in the paper:
`Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
Multi-Head Attention is defined as:
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
``forward()`` will use a special optimized implementation if all of the following
conditions are met:
- self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
restriction will be loosened in the future.)
- Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
- training is disabled (using ``.eval()``)
- dropout is 0
- ``add_bias_kv`` is ``False``
- ``add_zero_attn`` is ``False``
- ``batch_first`` is ``True`` and the input is batched
- ``kdim`` and ``vdim`` are equal to ``embed_dim``
- at most one of ``key_padding_mask`` or ``attn_mask`` is passed
- if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
nor ``attn_mask`` is passed
If the optimized implementation is in use, a
`NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
``query``/``key``/``value`` to represent padding more efficiently than using a
padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
will be returned, and an additional speedup proportional to the fraction of the input
that is padding can be expected.
Args:
embed_dim: Total dimension of the model.
num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
bias: If specified, adds bias to input / output projection layers. Default: ``True``.
add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
Default: ``False``.
kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
batch_first: If ``True``, then the input and output tensors are provided
as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
Examples::
>>> # xdoctest: +SKIP
>>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)
"""
__constants__ = ["batch_first"]
bias_k: Optional[torch.Tensor]
bias_v: Optional[torch.Tensor]
def __init__(
self,
embed_dim,
num_heads,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
kdim=None,
vdim=None,
batch_first=False,
linear1_cls=Linear,
linear2_cls=Linear,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.batch_first = batch_first
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
if add_bias_kv:
self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
else:
self.bias_k = self.bias_v = None
if linear1_cls == Linear:
if not self._qkv_same_embed_dim:
self.q_proj_weight = Parameter(
torch.empty((embed_dim, embed_dim), **factory_kwargs),
)
self.k_proj_weight = Parameter(
torch.empty((embed_dim, self.kdim), **factory_kwargs),
)
self.v_proj_weight = Parameter(
torch.empty((embed_dim, self.vdim), **factory_kwargs),
)
self.register_parameter("in_proj_weight", None)
else:
self.in_proj_weight = Parameter(
torch.empty((3 * embed_dim, embed_dim), **factory_kwargs),
)
self.register_parameter("q_proj_weight", None)
self.register_parameter("k_proj_weight", None)
self.register_parameter("v_proj_weight", None)
if bias:
self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
else:
self.register_parameter("in_proj_bias", None)
self.out_proj = NonDynamicallyQuantizableLinear(
embed_dim,
embed_dim,
bias=bias,
**factory_kwargs,
)
self._reset_parameters()
else:
if not self._qkv_same_embed_dim:
raise NotImplementedError
else:
self.in_proj_linear = linear1_cls(
embed_dim,
3 * embed_dim,
bias=bias,
**factory_kwargs,
)
self.in_proj_weight = self.in_proj_linear.weight
self.register_parameter("q_proj_weight", None)
self.register_parameter("k_proj_weight", None)
self.register_parameter("v_proj_weight", None)
if bias:
self.in_proj_bias = self.in_proj_linear.bias
else:
self.register_parameter("in_proj_bias", None)
self.out_proj = linear2_cls(
embed_dim,
embed_dim,
bias=bias,
**factory_kwargs,
)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
self.add_zero_attn = add_zero_attn
def _reset_parameters(self):
if self._qkv_same_embed_dim:
xavier_uniform_(self.in_proj_weight)
else:
xavier_uniform_(self.q_proj_weight)
xavier_uniform_(self.k_proj_weight)
xavier_uniform_(self.v_proj_weight)
if self.in_proj_bias is not None:
constant_(self.in_proj_bias, 0.0)
constant_(self.out_proj.bias, 0.0)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
def __setstate__(self, state):
# Support loading old MultiheadAttention checkpoints generated by v1.1.0
if "_qkv_same_embed_dim" not in state:
state["_qkv_same_embed_dim"] = True
super(MultiheadAttention, self).__setstate__(state)
def forward(
self,
query: Tensor,
key: Tensor,
value: Tensor,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
average_attn_weights: bool = True,
cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:
r"""
Args:
query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
:math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
Queries are compared against key-value pairs to produce the output.
See "Attention Is All You Need" for more details.
key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
:math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
See "Attention Is All You Need" for more details.
value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
See "Attention Is All You Need" for more details.
key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
Binary and byte masks are supported.
For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
Default: ``True``.
attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
:math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
:math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
corresponding position is not allowed to attend. For a float mask, the mask values will be added to
the attention weight.
average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
Outputs:
- **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
:math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
embedding dimension ``embed_dim``.
- **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
:math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
:math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
.. note::
`batch_first` argument is ignored for unbatched inputs.
"""
is_batched = query.dim() == 3
if key_padding_mask is not None:
_kpm_dtype = key_padding_mask.dtype
if _kpm_dtype != torch.bool and not torch.is_floating_point(
key_padding_mask,
):
raise AssertionError("only bool and floating types of key_padding_mask are supported")
why_not_fast_path = ""
if not is_batched:
why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
elif query is not key or key is not value:
# When lifting this restriction, don't forget to either
# enforce that the dtypes all match or test cases where
# they don't!
why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
why_not_fast_path = (
f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
)
elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype:
# this case will fail anyway, but at least they'll get a useful error message.
why_not_fast_path = (
f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
)
elif self.training:
why_not_fast_path = "training is enabled"
elif not self.batch_first:
why_not_fast_path = "batch_first was not True"
elif self.bias_k is not None:
why_not_fast_path = "self.bias_k was not None"
elif self.bias_v is not None:
why_not_fast_path = "self.bias_v was not None"
elif self.dropout:
why_not_fast_path = f"dropout was {self.dropout}, required zero"
elif self.add_zero_attn:
why_not_fast_path = "add_zero_attn was enabled"
elif not self._qkv_same_embed_dim:
why_not_fast_path = "_qkv_same_embed_dim was not True"
elif attn_mask is not None:
why_not_fast_path = "attn_mask was not None"
elif query.is_nested and key_padding_mask is not None:
why_not_fast_path = "key_padding_mask is not supported with NestedTensor input"
elif self.num_heads % 2 == 1:
why_not_fast_path = "num_heads is odd"
elif torch.is_autocast_enabled():
why_not_fast_path = "autocast is enabled"
if not why_not_fast_path:
tensor_args = (
query,
key,
value,
self.in_proj_weight,
self.in_proj_bias,
self.out_proj.weight,
self.out_proj.bias,
)
# We have to use list comprehensions below because TorchScript does not support
# generator expressions.
if torch.overrides.has_torch_function(tensor_args):
why_not_fast_path = "some Tensor argument has_torch_function"
elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args]):
why_not_fast_path = "some Tensor argument is neither CUDA nor CPU"
elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]):
why_not_fast_path = "grad is enabled and at least one of query or the input/output projection weights or biases requires_grad"
if not why_not_fast_path:
return torch._native_multi_head_attention(
query,
key,
value,
self.embed_dim,
self.num_heads,
self.in_proj_weight,
self.in_proj_bias,
self.out_proj.weight,
self.out_proj.bias,
key_padding_mask if key_padding_mask is not None else attn_mask,
need_weights,
average_attn_weights,
1 if key_padding_mask is not None else 0 if attn_mask is not None else None,
)
any_nested = query.is_nested or key.is_nested or value.is_nested
assert not any_nested, (
"MultiheadAttention does not support NestedTensor outside of its fast path. "
+ f"The fast path was not hit because {why_not_fast_path}"
)
if self.batch_first and is_batched:
# make sure that the transpose op does not affect the "is" property
if key is value:
if query is key:
query = key = value = query.transpose(1, 0)
else:
query, key = [x.transpose(1, 0) for x in (query, key)]
value = key
else:
query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
if not self._qkv_same_embed_dim:
attn_output, attn_output_weights = F.multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.num_heads,
self.in_proj_weight,
self.in_proj_bias,
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout,
self.out_proj.weight,
self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj_weight,
k_proj_weight=self.k_proj_weight,
v_proj_weight=self.v_proj_weight,
average_attn_weights=average_attn_weights,
cache=cache,
)
else:
attn_output, attn_output_weights = F.multi_head_attention_forward(
query,
key,
value,
self.embed_dim,
self.num_heads,
self.in_proj_weight,
self.in_proj_bias,
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout,
self.out_proj.weight,
self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
average_attn_weights=average_attn_weights,
cache=cache,
)
if self.batch_first and is_batched:
return attn_output.transpose(1, 0), attn_output_weights
else:
return attn_output, attn_output_weights
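As a quick sanity check of the class above: in training mode the fast path is skipped (``why_not_fast_path = "training is enabled"``) and the call routes through the patched ``F.multi_head_attention_forward``. A minimal sketch, assuming the repo's ``GPT_SoVITS`` directory is on ``sys.path``; the shapes and sizes are illustrative only:

import torch
from AR.modules.activation import MultiheadAttention

mha = MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)
x = torch.randn(2, 10, 512)           # (batch, seq, embed_dim)
out, attn = mha(x, x, x, cache=None)  # self-attention; cache=None disables the k/v cache
print(out.shape)                      # torch.Size([2, 10, 512]); attn holds the averaged weights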
================================================
FILE: GPT_SoVITS/AR/modules/activation_onnx.py
================================================
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
from typing import Optional, Tuple
import torch
from torch import Tensor
from torch.nn import Linear, Module
from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
from torch.nn.parameter import Parameter
from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched
class MultiheadAttention(Module):
__constants__ = ["batch_first"]
bias_k: Optional[torch.Tensor]
bias_v: Optional[torch.Tensor]
def __init__(
self,
embed_dim,
num_heads,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
kdim=None,
vdim=None,
batch_first=False,
linear1_cls=Linear,
linear2_cls=Linear,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.batch_first = batch_first
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
if add_bias_kv:
self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
else:
self.bias_k = self.bias_v = None
if linear1_cls == Linear:
if not self._qkv_same_embed_dim:
self.q_proj_weight = Parameter(
torch.empty(
(embed_dim, embed_dim),
**factory_kwargs,
)
)
self.k_proj_weight = Parameter(
torch.empty(
(embed_dim, self.kdim),
**factory_kwargs,
)
)
self.v_proj_weight = Parameter(
torch.empty(
(embed_dim, self.vdim),
**factory_kwargs,
)
)
self.register_parameter("in_proj_weight", None)
else:
self.in_proj_weight = Parameter(
torch.empty(
(3 * embed_dim, embed_dim),
**factory_kwargs,
)
)
self.register_parameter("q_proj_weight", None)
self.register_parameter("k_proj_weight", None)
self.register_parameter("v_proj_weight", None)
if bias:
self.in_proj_bias = Parameter(
torch.empty(3 * embed_dim, **factory_kwargs),
)
else:
self.register_parameter("in_proj_bias", None)
self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
self._reset_parameters()
else:
if not self._qkv_same_embed_dim:
raise NotImplementedError
else:
self.in_proj_linear = linear1_cls(
embed_dim,
3 * embed_dim,
bias=bias,
**factory_kwargs,
)
self.in_proj_weight = self.in_proj_linear.weight
self.register_parameter("q_proj_weight", None)
self.register_parameter("k_proj_weight", None)
self.register_parameter("v_proj_weight", None)
if bias:
self.in_proj_bias = self.in_proj_linear.bias
else:
self.register_parameter("in_proj_bias", None)
self.out_proj = linear2_cls(
embed_dim,
embed_dim,
bias=bias,
**factory_kwargs,
)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
self.add_zero_attn = add_zero_attn
def _reset_parameters(self):
if self._qkv_same_embed_dim:
xavier_uniform_(self.in_proj_weight)
else:
xavier_uniform_(self.q_proj_weight)
xavier_uniform_(self.k_proj_weight)
xavier_uniform_(self.v_proj_weight)
if self.in_proj_bias is not None:
constant_(self.in_proj_bias, 0.0)
constant_(self.out_proj.bias, 0.0)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
def __setstate__(self, state):
# Support loading old MultiheadAttention checkpoints generated by v1.1.0
if "_qkv_same_embed_dim" not in state:
state["_qkv_same_embed_dim"] = True
super(MultiheadAttention, self).__setstate__(state)
def forward(
self,
query: Tensor,
key: Tensor,
value: Tensor,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
average_attn_weights: bool = True,
cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:
any_nested = query.is_nested or key.is_nested or value.is_nested
query = key = value = query.transpose(1, 0)
attn_output = multi_head_attention_forward_patched(
query,
key,
value,
self.embed_dim,
self.num_heads,
self.in_proj_weight,
self.in_proj_bias,
self.bias_k,
self.bias_v,
self.add_zero_attn,
self.dropout,
self.out_proj.weight,
self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
average_attn_weights=average_attn_weights,
cache=cache,
)
return attn_output.transpose(1, 0)
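Compared with the eager implementation above, this ONNX-export variant drops the fast-path dispatch and all input validation, always transposes and treats the call as self-attention (the ``key`` and ``value`` arguments are ignored; ``query`` is used for all three), and returns a single ``attn_output`` tensor instead of an ``(output, weights)`` tuple, so the traced graph carries no data-dependent control flow.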
================================================
FILE: GPT_SoVITS/AR/modules/embedding.py
================================================
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
import math
import torch
from torch import nn
class TokenEmbedding(nn.Module):
def __init__(
self,
embedding_dim: int,
vocab_size: int,
dropout: float = 0.0,
):
super().__init__()
self.vocab_size = vocab_size
self.embedding_dim = embedding_dim
self.dropout = torch.nn.Dropout(p=dropout)
self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
@property
def weight(self) -> torch.Tensor:
return self.word_embeddings.weight
def embedding(self, index: int) -> torch.Tensor:
return self.word_embeddings.weight[index : index + 1]
def forward(self, x: torch.Tensor):
x = self.word_embeddings(x)
x = self.dropout(x)
return x
class SinePositionalEmbedding(nn.Module):
def __init__(
self,
embedding_dim: int,
dropout: float = 0.0,
scale: bool = False,
alpha: bool = False,
):
super().__init__()
self.embedding_dim = embedding_dim
self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
self.dropout = torch.nn.Dropout(p=dropout)
self.reverse = False
self.pe = None
self.extend_pe(torch.tensor(0.0).expand(1, 4000))
def extend_pe(self, x):
"""Reset the positional encodings."""
if self.pe is not None:
if self.pe.size(1) >= x.size(1):
if self.pe.dtype != x.dtype or self.pe.device != x.device:
self.pe = self.pe.to(dtype=x.dtype, device=x.device)
return
pe = torch.zeros(x.size(1), self.embedding_dim)
if self.reverse:
position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
else:
position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
def forward(self, x: torch.Tensor) -> torch.Tensor:
self.extend_pe(x)
output = x.unsqueeze(-1) if x.ndim == 2 else x
output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
return self.dropout(output)
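A minimal usage sketch for the two modules above (the vocabulary size and dimensions are made up for illustration):

import torch
from AR.modules.embedding import SinePositionalEmbedding, TokenEmbedding

tok = TokenEmbedding(embedding_dim=512, vocab_size=1025)
pos = SinePositionalEmbedding(embedding_dim=512, scale=True, alpha=True)
ids = torch.randint(0, 1025, (2, 100))  # (batch, seq) of token ids
x = pos(tok(ids))                       # (2, 100, 512): token embeddings plus sinusoidal positions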
================================================
FILE: GPT_SoVITS/AR/modules/embedding_onnx.py
================================================
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py
import math
import torch
from torch import nn
class TokenEmbedding(nn.Module):
def __init__(
self,
embedding_dim: int,
vocab_size: int,
dropout: float = 0.0,
):
super().__init__()
self.vocab_size = vocab_size
self.embedding_dim = embedding_dim
self.dropout = torch.nn.Dropout(p=dropout)
self.word_embeddings = nn.Embedding(self.vocab_size, self.embedding_dim)
@property
def weight(self) -> torch.Tensor:
return self.word_embeddings.weight
def embedding(self, index: int) -> torch.Tensor:
return self.word_embeddings.weight[index : index + 1]
def forward(self, x: torch.Tensor):
x = self.word_embeddings(x)
x = self.dropout(x)
return x
class SinePositionalEmbedding(nn.Module):
def __init__(
self,
embedding_dim: int,
dropout: float = 0.0,
scale: bool = False,
alpha: bool = False,
):
super().__init__()
self.embedding_dim = embedding_dim
self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
self.dropout = torch.nn.Dropout(p=dropout)
self.reverse = False
self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
def extend_pe(self, x):
position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1)
scpe = (position * self.div_term).unsqueeze(0)
pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
pe = pe.contiguous().view(1, -1, self.embedding_dim)
return pe
def forward(self, x: torch.Tensor) -> torch.Tensor:
pe = self.extend_pe(x)
output = x.unsqueeze(-1) if x.ndim == 2 else x
output = output * self.x_scale + self.alpha * pe
return self.dropout(output)
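Unlike the eager version, this variant computes positions with ``cumsum`` so the traced graph handles dynamic sequence lengths. Note that the broadcast in ``extend_pe`` (a ``(T, B)`` position matrix against a ``(D/2,)`` ``div_term``) only lines up for batch size 1, which is the export setting. A minimal sketch under that assumption:

import torch
from AR.modules.embedding_onnx import SinePositionalEmbedding

pos = SinePositionalEmbedding(embedding_dim=512)
x = torch.randn(1, 77, 512)  # batch must be 1 for the extend_pe broadcast to work
y = pos(x)                   # (1, 77, 512)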
================================================
FILE: GPT_SoVITS/AR/modules/lr_schedulers.py
================================================
# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py
# reference: https://github.com/lifeiteng/vall-e
import math
import torch
from matplotlib import pyplot as plt
from torch import nn
from torch.optim import Adam
class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
"""
Implements a warmup learning-rate schedule: the LR ramps linearly from 'init_lr' to 'peak_lr' over 'warmup_steps', then follows a cosine decay towards 'end_lr' at 'total_steps'. Note that step() below currently overrides the computed value and hard-locks the LR to 0.002.
"""
def __init__(
self,
optimizer,
init_lr,
peak_lr,
end_lr,
warmup_steps=10000,
total_steps=400000,
current_step=0,
):
self.init_lr = init_lr
self.peak_lr = peak_lr
self.end_lr = end_lr
self.optimizer = optimizer
self._warmup_rate = (peak_lr - init_lr) / warmup_steps
self._decay_rate = (end_lr - peak_lr) / (total_steps - warmup_steps)
self._current_step = current_step
self.lr = init_lr
self.warmup_steps = warmup_steps
self.total_steps = total_steps
self._last_lr = [self.lr]
def set_lr(self, lr):
self._last_lr = [g["lr"] for g in self.optimizer.param_groups]
for g in self.optimizer.param_groups:
# g['lr'] = lr
g["lr"] = self.end_lr ###锁定用线性
def step(self):
if self._current_step < self.warmup_steps:
lr = self.init_lr + self._warmup_rate * self._current_step
elif self._current_step > self.total_steps:
lr = self.end_lr
else:
decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
if decay_ratio < 0.0 or decay_ratio > 1.0:
raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.")
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
self.lr = lr = self.end_lr = 0.002  ### lock to a linear (constant) LR ### the schedule misbehaves, so just hard-lock it!
self.set_lr(lr)
self.lr = lr
self._current_step += 1
return self.lr
if __name__ == "__main__":
m = nn.Linear(10, 10)
opt = Adam(m.parameters(), lr=1e-4)
s = WarmupCosineLRSchedule(
opt,
1e-6,
2e-4,
1e-6,
warmup_steps=2000,
total_steps=20000,
current_step=0,
)
lrs = []
for i in range(25000):
s.step()
lrs.append(s.lr)
print(s.lr)
plt.plot(range(25000), lrs)
plt.show()
================================================
FILE: GPT_SoVITS/AR/modules/optim.py
================================================
# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)
#
# See ../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import logging
from collections import defaultdict
from typing import List, Tuple
import torch
from torch import Tensor
from torch.optim import Optimizer
class BatchedOptimizer(Optimizer):
"""
This class adds to class Optimizer the capability to optimize parameters in batches:
it will stack the parameters and their grads for you so the optimizer can work
on tensors with an extra leading dimension. This is intended for speed with GPUs,
as it reduces the number of kernels launched in the optimizer.
Args:
    params: the parameters or param_groups to optimize, as for any torch.optim.Optimizer
    defaults: a dict of default hyperparameter values, passed through to the base class
"""
def __init__(self, params, defaults):
super(BatchedOptimizer, self).__init__(params, defaults)
@contextlib.contextmanager
def batched_params(self, param_group, group_params_names):
"""
This function returns (technically, yields) a list of
tuples (p, state, p_names), where
p is a `fake` parameter that is stacked (over axis 0) from real parameters
that share the same shape, and its gradient is also stacked;
`state` is the state corresponding to this batch of parameters
(it will be physically located in the "state" for one of the real
parameters, the last one that has any particular shape and dtype).
This function is decorated as a context manager so that it can
write parameters back to their "real" locations.
The idea is, instead of doing:
<code>
for p in group["params"]:
state = self.state[p]
...
</code>
you can do:
<code>
with self.batched_params(group["params"]) as batches:
for p, state, p_names in batches:
...
</code>
Args:
group: a parameter group, which is a list of parameters; should be
one of self.param_groups.
group_params_names: name for each parameter in group,
which is List[str].
"""
batches = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
batches_names = defaultdict(list)  # `batches_names` maps from tuple (dtype_as_str,*shape) to list of str
assert len(param_group) == len(group_params_names)
for p, named_p in zip(param_group, group_params_names):
key = (str(p.dtype), *p.shape)
batches[key].append(p)
batches_names[key].append(named_p)
batches_names_keys = list(batches_names.keys())
sorted_idx = sorted(range(len(batches_names)), key=lambda i: batches_names_keys[i])
batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
stacked_params_dict = dict()
# turn batches into a list, in deterministic order.
# tuples will contain tuples of (stacked_param, state, stacked_params_names),
# one for each batch in `batches`.
tuples = []
for batch, batch_names in zip(batches, batches_names):
p = batch[0]
# we arbitrarily store the state in the
# state corresponding to the 1st parameter in the
# group. class Optimizer will take care of saving/loading state.
state = self.state[p]
p_stacked = torch.stack(batch)
grad = torch.stack([torch.zeros_like(p) if p.grad is None else p.grad for p in batch])
p_stacked.grad = grad
stacked_params_dict[(str(p.dtype), *p.shape)] = p_stacked  # keyed the same way as the batching above
tuples.append((p_stacked, state, batch_names))
yield tuples # <-- calling code will do the actual optimization here!
for (stacked_params, _state, _names), batch in zip(tuples, batches):
for i, p in enumerate(batch): # batch is list of Parameter
p.copy_(stacked_params[i])
class ScaledAdam(BatchedOptimizer):
"""
Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update
proportional to the norm of that parameter; and also learn the scale of the parameter,
in log space, subject to upper and lower limits (as if we had factored each parameter as
param = underlying_param * log_scale.exp())
Args:
params: The parameters or param_groups to optimize (like other Optimizer subclasses)
lr: The learning rate. We will typically use a learning rate schedule that starts
at 0.03 and decreases over time, i.e. much higher than other common
optimizers.
clipping_scale: (e.g. 2.0)
A scale for gradient-clipping: if specified, the normalized gradients
over the whole model will be clipped to have 2-norm equal to
`clipping_scale` times the median 2-norm over the most recent period
of `clipping_update_period` minibatches. By "normalized gradients",
we mean after multiplying by the rms parameter value for this tensor
[for non-scalars]; this is appropriate because our update is scaled
by this quantity.
betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad.
Must satisfy 0 < beta1 <= beta2 < 1.
scalar_lr_scale: A scaling factor on the learning rate that we use to update the
scale of each parameter tensor and scalar parameters of the model.
If each parameter were decomposed
as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale
would be the scaling factor on the learning rate of p_scale.
eps: A general-purpose epsilon to prevent division by zero
param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
learning the scale on the parameters (we'll constrain the rms of each non-scalar
parameter tensor to be >= this value)
param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
learning the scale on the parameters (we'll constrain the rms of each non-scalar
parameter tensor to be <= this value)
scalar_max: Maximum absolute value for scalar parameters (applicable if your
model has any parameters with numel() == 1).
size_update_period: The periodicity, in steps, with which we update the size (scale)
of the parameter tensor. This is provided to save a little time
in the update.
clipping_update_period: if clipping_scale is specified, this is the period, in minibatches, over which the median gradient norm used for clipping is re-estimated
"""
def __init__(
self,
params,
lr=3e-02,
clipping_scale=None,
betas=(0.9, 0.98),
scalar_lr_scale=0.1,
eps=1.0e-08,
param_min_rms=1.0e-05,
param_max_rms=3.0,
scalar_max=10.0,
size_update_period=4,
clipping_update_period=100,
parameters_names=None,
show_dominant_parameters=True,
):
assert parameters_names is not None, (
    "Please prepare parameters_names, which is a List[List[str]]. Each List[str] is for a group, and each str is for a parameter"
)
defaults = dict(
lr=lr,
clipping_scale=clipping_scale,
betas=betas,
scalar_lr_scale=scalar_lr_scale,
eps=eps,
param_min_rms=param_min_rms,
param_max_rms=param_max_rms,
scalar_max=scalar_max,
size_update_period=size_update_period,
clipping_update_period=clipping_update_period,
)
super(ScaledAdam, self).__init__(params, defaults)
assert len(self.param_groups) == len(parameters_names)
self.parameters_names = parameters_names
self.show_dominant_parameters = show_dominant_parameters
def __setstate__(self, state):
super(ScaledAdam, self).__setstate__(state)
@torch.no_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
batch = True
for group, group_params_names in zip(self.param_groups, self.parameters_names):
with self.batched_params(group["params"], group_params_names) as batches:
# batches is list of pairs (stacked_param, state). stacked_param is like
# a regular parameter, and will have a .grad, but the 1st dim corresponds to
# a stacking dim, it is not a real dim.
if len(batches[0][1]) == 0: # if len(first state) == 0: not yet initialized
clipping_scale = 1
else:
clipping_scale = self._get_clipping_scale(group, batches)
for p, state, _ in batches:
# Perform optimization step.
# grad is not going to be None, we handled that when creating the batches.
grad = p.grad
if grad.is_sparse:
raise RuntimeError("ScaledAdam optimizer does not support sparse gradients")
# State initialization
if len(state) == 0:
self._init_state(group, p, state)
self._step_one_batch(group, p, state, clipping_scale)
return loss
def _init_state(self, group: dict, p: Tensor, state: dict):
"""
Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p
is actually the batch dimension, corresponding to batched-together
parameters of a given shape.
Args:
group: Dict to look up configuration values.
p: The parameter that we are initializing the state for
state: Dict from string to whatever state we are initializing
"""
size_update_period = group["size_update_period"]
state["step"] = 0
kwargs = {"device": p.device, "dtype": p.dtype}
# 'delta' implements conventional momentum. There are
# several different kinds of update going on, so rather than
# compute "exp_avg" like in Adam, we store and decay a
# parameter-change "delta", which combines all forms of
# update. this is equivalent to how it's done in Adam,
# except for the first few steps.
state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
batch_size = p.shape[0]
numel = p.numel() // batch_size  # number of elements per real parameter; dim 0 is the stacking dim
if numel > 1:
# "param_rms" just periodically records the scalar root-mean-square value of
# the parameter tensor.
# it has a shape like (batch_size, 1, 1, 1, 1)
param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
state["param_rms"] = param_rms
state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
state["scale_grads"] = torch.zeros(size_update_period, *param_rms.shape, **kwargs)
# exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]) -> float:
"""
Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
by this amount before applying the rest of the update.
Args:
group: the parameter group, an item in self.param_groups
tuples: a list of tuples of (param, state, param_names)
where param is a batched set of parameters,
with a .grad (1st dim is batch dim)
and state is the state-dict where optimization parameters are kept.
param_names is a List[str] while each str is name for a parameter
in batched set of parameters "param".
"""
assert len(tuples) >= 1
clipping_scale = group["clipping_scale"]
(first_p, first_state, _) = tuples[0]
step = first_state["step"]
if clipping_scale is None or step == 0:
# no clipping. return early on step == 0 because the other
# parameters' state won't have been initialized yet.
return 1.0
clipping_update_period = group["clipping_update_period"]
tot_sumsq = torch.tensor(0.0, device=first_p.device)
for p, state, param_names in tuples:
grad = p.grad
if grad.is_sparse:
raise RuntimeError("ScaledAdam optimizer does not support sparse gradients")
if p.numel() == p.shape[0]: # a batch of scalars
tot_sumsq += (grad**2).sum() # sum() to change shape [1] to []
else:
tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
tot_norm = tot_sumsq.sqrt()
if "model_norms" not in first_state:
first_state["model_norms"] = torch.zeros(clipping_update_period, device=p.device)
first_state["model_norms"][step % clipping_update_period] = tot_norm
if step % clipping_update_period == 0:
# Print some stats.
# We don't reach here if step == 0 because we would have returned
# above.
sorted_norms = first_state["model_norms"].sort()[0].to("cpu")
quartiles = []
for n in range(0, 5):
index = min(
clipping_update_period - 1,
(clipping_update_period // 4) * n,
)
quartiles.append(sorted_norms[index].item())
median = quartiles[2]
threshold = clipping_scale * median
first_state["model_norm_threshold"] = threshold
percent_clipped = (
first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0
)
first_state["num_clipped"] = 0
quartiles = " ".join(["%.3e" % x for x in quartiles])
logging.info(
f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
)
if step < clipping_update_period:
return 1.0 # We have not yet estimated a norm to clip to.
else:
try:
model_norm_threshold = first_state["model_norm_threshold"]
except KeyError:
logging.info(
"Warning: model_norm_threshold not in state: possibly you changed config when restarting, adding clipping_scale option?"
)
return 1.0
ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
if ans < 1.0:
first_state["num_clipped"] += 1
if ans < 0.1:
logging.warning(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}")
if self.show_dominant_parameters:
assert p.shape[0] == len(param_names)
self._show_gradient_dominating_parameter(tuples, tot_sumsq)
return ans
def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor):
"""
Show information about the parameter that dominates tot_sumsq.
Args:
tuples: a list of tuples of (param, state, param_names)
where param is a batched set of parameters,
with a .grad (1st dim is batch dim)
and state is the state-dict where optimization parameters are kept.
param_names is a List[str] while each str is name for a parameter
in batched set of parameters "param".
tot_sumsq: sumsq of all parameters. Though it could be calculated
    from tuples, we pass it in to save some time.
"""
all_sumsq_orig = {}
for p, state, batch_param_names in tuples:
# p is a stacked batch parameters.
batch_grad = p.grad
if p.numel() == p.shape[0]: # a batch of scalars
batch_sumsq_orig = batch_grad**2
# Dummy values used by the following `zip` statement.
batch_rms_orig = torch.ones(p.shape[0])
else:
batch_rms_orig = state["param_rms"]
batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(dim=list(range(1, batch_grad.ndim)))
for name, sumsq_orig, rms, grad in zip(
batch_param_names,
batch_sumsq_orig,
batch_rms_orig,
batch_grad,
):
proportion_orig = sumsq_orig / tot_sumsq
all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
assert torch.isclose(
sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
torch.tensor(1.0),
)
sorted_by_proportion = {
k: v
for k, v in sorted(
all_sumsq_orig.items(),
key=lambda item: item[1][0],
reverse=True,
)
}
dominant_param_name = next(iter(sorted_by_proportion))
(
dominant_proportion,
dominant_sumsq,
dominant_rms,
dominant_grad,
) = sorted_by_proportion[dominant_param_name]
logging.info(
f"Parameter Dominating tot_sumsq {dominant_param_name}"
f" with proportion {dominant_proportion:.2f},"
f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
f"={dominant_sumsq:.3e},"
f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
f" orig_rms_sq={(dominant_rms**2).item():.3e}"
)
def _step_one_batch(self, group: dict, p: Tensor, state: dict, clipping_scale: float):
"""
Do the step for one parameter, which is actually going to be a batch of
`real` parameters, with dim 0 as the batch dim.
Args:
group: dict to look up configuration values
p: parameter to update (actually multiple parameters stacked together
as a batch)
state: state-dict for p, to look up the optimizer state
"""
lr = group["lr"]
size_update_period = group["size_update_period"]
beta1 = group["betas"][0]
grad = p.grad
if clipping_scale != 1.0:
grad = grad * clipping_scale
step = state["step"]
delta = state["delta"]
delta.mul_(beta1)
batch_size = p.shape[0]
numel = p.numel() // batch_size
if numel > 1:
# Update the size/scale of p, and set param_rms
scale_grads = state["scale_grads"]
scale_grads[step % size_update_period] = (p * grad).sum(dim=list(range(1, p.ndim)), keepdim=True)
if step % size_update_period == size_update_period - 1:
param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
param_rms.copy_((p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt())
if step > 0:
# self._size_update() learns the overall scale on the
# parameter, by shrinking or expanding it.
self._size_update(group, scale_grads, p, state)
if numel == 1:
# For parameters with 1 element we just use regular Adam.
# Updates delta.
self._step_scalar(group, p, state)
else:
self._step(group, p, state)
state["step"] = step + 1
def _size_update(
self,
group: dict,
scale_grads: Tensor,
p: Tensor,
state: dict,
) -> None:
"""
Called only where p.numel() > 1, this updates the scale of the parameter.
If we imagine: p = underlying_param * scale.exp(), and we are doing
gradient descent on underlying param and on scale, this function does the update
on `scale`.
Args:
group: dict to look up configuration values
scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing
grads w.r.t. the scales.
p: The parameter to update
state: The state-dict of p
"""
param_rms = state["param_rms"]
beta1, beta2 = group["betas"]
size_lr = group["lr"] * group["scalar_lr_scale"]
param_min_rms = group["param_min_rms"]
param_max_rms = group["param_max_rms"]
eps = group["eps"]
step = state["step"]
batch_size = p.shape[0]
size_update_period = scale_grads.shape[0]
# correct beta2 for the size update period: we will have
# faster decay at this level.
beta2_corr = beta2**size_update_period
scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
scale_exp_avg_sq.mul_(beta2_corr).add_(
(scale_grads**2).mean(dim=0), # mean over dim `size_update_period`
alpha=1 - beta2_corr,
) # shape is (batch_size, 1, 1, ...)
# The 1st time we reach here is when size_step == 1.
size_step = (step + 1) // size_update_period
bias_correction2 = 1 - beta2_corr**size_step
# we don't bother with bias_correction1; this will help prevent divergence
# at the start of training.
denom = scale_exp_avg_sq.sqrt() + eps
scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
is_too_small = param_rms < param_min_rms
is_too_large = param_rms > param_max_rms
# when the param gets too small, just don't shrink it any further.
scale_step.masked_fill_(is_too_small, 0.0)
# when it gets too large, stop it from getting any larger.
scale_step.masked_fill_(is_too_large, -size_lr * size_update_period)
delta = state["delta"]
# the factor of (1-beta1) relates to momentum.
delta.add_(p * scale_step, alpha=(1 - beta1))
def _step(self, group: dict, p: Tensor, state: dict):
"""
This function does the core update of self.step(), in the case where the members of
the batch have more than 1 element.
Args:
    group: A dict which will be used to look up configuration values
    p: The parameter to be updated (its .grad supplies the gradient)
    state: The state-dict corresponding to parameter p
This function modifies p.
"""
grad = p.grad
lr = group["lr"]
beta1, beta2 = group["betas"]
eps = group["eps"]
param_min_rms = group["param_min_rms"]
step = state["step"]
exp_avg_sq = state["exp_avg_sq"]
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
bias_correction2 = 1 - beta2 ** (this_step + 1)
if bias_correction2 < 0.99:
# note: not in-place.
exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
denom = exp_avg_sq.sqrt()
denom += eps
grad = grad / denom
alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms)
delta = state["delta"]
delta.add_(grad * alpha)
p.add_(delta)
def _step_scalar(self, group: dict, p: Tensor, state: dict):
"""
A simplified form of the core update for scalar tensors, where we cannot get a good
estimate of the parameter rms.
"""
beta1, beta2 = group["betas"]
scalar_max = group["scalar_max"]
eps = group["eps"]
lr = group["lr"] * group["scalar_lr_scale"]
grad = p.grad
exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
# bias_correction2 is like in Adam. Don't bother with bias_correction1;
# slower update at the start will help stability anyway.
bias_correction2 = 1 - beta2 ** (state["step"] + 1)
denom = (exp_avg_sq / bias_correction2).sqrt() + eps
delta = state["delta"]
delta.add_(grad / denom, alpha=-lr * (1 - beta1))
p.clamp_(min=-scalar_max, max=scalar_max)
p.add_(delta)
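A minimal smoke-test sketch for ScaledAdam, assuming the repo's GPT_SoVITS directory is on sys.path; the model and hyperparameters here are hypothetical. parameters_names must be a List[List[str]] aligned with the param groups:

import torch
from torch import nn
from AR.modules.optim import ScaledAdam

model = nn.Linear(10, 10)
names = [[n for n, _ in model.named_parameters()]]  # one List[str] per param group
opt = ScaledAdam(
    model.parameters(),
    lr=0.03,              # note: much higher than typical Adam learning rates
    betas=(0.9, 0.98),
    clipping_scale=2.0,
    parameters_names=names,
)
loss = model(torch.randn(4, 10)).pow(2).mean()
loss.backward()
opt.step()  # stacks same-shaped params and applies the scaled update in one go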
================================================
FILE: GPT_SoVITS/AR/modules/patched_mha_with_cache.py
================================================
from torch.nn.functional import *
from torch.nn.functional import (
    _mha_shape_check,
    _canonical_mask,
    _none_or_dtype,
    _in_projection_packed,
    _in_projection,  # needed by the use_separate_proj_weight branch below
)
import torch
# Tensor = torch.Tensor
# from typing import Callable, List, Optional, Tuple, Union
def multi_head_attention_forward_patched(
query,
key,
value,
embed_dim_to_check,
num_heads,
in_proj_weight,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p: float,
out_proj_weight,
out_proj_bias,
training=True,
key_padding_mask=None,
need_weights=True,
attn_mask=None,
use_separate_proj_weight=False,
q_proj_weight=None,
k_proj_weight=None,
v_proj_weight=None,
static_k=None,
static_v=None,
average_attn_weights=True,
is_causal=False,
cache=None,
):
r"""
Args:
query, key, value: map a query and a set of key-value pairs to an output.
See "Attention Is All You Need" for more details.
embed_dim_to_check: total dimension of the model.
num_heads: parallel attention heads.
in_proj_weight, in_proj_bias: input projection weight and bias.
bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
add_zero_attn: add a new batch of zeros to the key and
value sequences at dim=1.
dropout_p: probability of an element to be zeroed.
out_proj_weight, out_proj_bias: the output projection weight and bias.
training: apply dropout if is ``True``.
key_padding_mask: if provided, specified padding elements in the key will
be ignored by the attention. This is a binary mask. When the value is True,
the corresponding value on the attention layer will be filled with -inf.
need_weights: output attn_output_weights.
Default: `True`
Note: `need_weights` defaults to `True`, but should be set to `False`
for best performance when attention weights are not needed.
*Setting need_weights to `True`
leads to a significant performance degradation.*
attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
the batches while a 3D mask allows to specify a different mask for the entries of each batch.
is_causal: If specified, applies a causal mask as attention mask, and ignores
attn_mask for computing scaled dot product attention.
Default: ``False``.
.. warning::
is_causal provides a hint that the attn_mask is the
causal mask. Providing incorrect hints can result in
incorrect execution, as well as forward and backward
compatibility issues.
use_separate_proj_weight: the function accepts the projection weights for query, key,
and value in different forms. If false, in_proj_weight will be used, which is
a combination of q_proj_weight, k_proj_weight, v_proj_weight.
q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
static_k, static_v: static key and value used for attention operators.
average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads.
Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect
when ``need_weights=True``. Default: True
Shape:
Inputs:
- query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
the embedding dimension.
- key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length.
If a FloatTensor is provided, it will be directly added to the value.
If a BoolTensor is provided, the positions with the
value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
- attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
positions. If a BoolTensor is provided, positions with ``True``
are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
is provided, it will be added to the attention weight.
- static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
- static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
Outputs:
- attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
E is the embedding dimension.
- attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns
attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
:math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
:math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`.
"""
tens_ops = (
query,
key,
value,
in_proj_weight,
in_proj_bias,
bias_k,
bias_v,
out_proj_weight,
out_proj_bias,
)
if has_torch_function(tens_ops):
return handle_torch_function(
multi_head_attention_forward,
tens_ops,
query,
key,
value,
embed_dim_to_check,
num_heads,
in_proj_weight,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p,
out_proj_weight,
out_proj_bias,
training=training,
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
is_causal=is_causal,
use_separate_proj_weight=use_separate_proj_weight,
q_proj_weight=q_proj_weight,
k_proj_weight=k_proj_weight,
v_proj_weight=v_proj_weight,
static_k=static_k,
static_v=static_v,
average_attn_weights=average_attn_weights,
cache=cache,
)
is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
# For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
# is batched, run the computation and before returning squeeze the
# batch dimension so that the output doesn't carry this temporary batch dimension.
if not is_batched:
# unsqueeze if the input is unbatched
query = query.unsqueeze(1)
key = key.unsqueeze(1)
value = value.unsqueeze(1)
if key_padding_mask is not None:
key_padding_mask = key_padding_mask.unsqueeze(0)
# set up shape vars
tgt_len, bsz, embed_dim = query.shape
src_len, _, _ = key.shape
key_padding_mask = _canonical_mask(
mask=key_padding_mask,
mask_name="key_padding_mask",
other_type=_none_or_dtype(attn_mask),
other_name="attn_mask",
target_type=query.dtype,
)
if is_causal and attn_mask is None:
raise RuntimeError(
"Need attn_mask if specifying the is_causal hint. "
"You may use the Transformer module method "
"`generate_square_subsequent_mask` to create this mask."
)
if is_causal and key_padding_mask is None and not need_weights:
# when we have a kpm or need weights, we need attn_mask
# Otherwise, we pass the is_causal hint through as the
# is_causal indicator to SDPA.
attn_mask = None
else:
attn_mask = _canonical_mask(
mask=attn_mask,
mask_name="attn_mask",
other_type=None,
other_name="",
target_type=query.dtype,
check_other=False,
)
if key_padding_mask is not None:
# We have the attn_mask, and use that to merge kpm into it.
# Turn off use of is_causal hint, as the merged mask is no
# longer causal.
is_causal = False
assert embed_dim == embed_dim_to_check, (
f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
)
if isinstance(embed_dim, torch.Tensor):
# embed_dim can be a tensor when JIT tracing
head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
else:
head_dim = embed_dim // num_heads
assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
if use_separate_proj_weight:
# allow MHA to have different embedding dimensions when separate projection weights are used
assert key.shape[:2] == value.shape[:2], (
f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
)
else:
assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
#
# compute in-projection
#
if not use_separate_proj_weight:
assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
else:
assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
if in_proj_bias is None:
b_q = b_k = b_v = None
else:
b_q, b_k, b_v = in_proj_bias.chunk(3)
q, k, v = _in_projection(
query,
key,
value,
q_proj_weight,
k_proj_weight,
v_proj_weight,
b_q,
b_k,
b_v,
)
if cache is not None:
    if cache["first_infer"] == 1:
        # prompt pass: write this layer's k/v into its cache slot
        cache["k"][cache["stage"]] = k
        cache["v"][cache["stage"]] = v
    else:  ### each of the 12 layers keeps its own cached k/v
        # the time axis is normally dim 1, but the in-projection may have transposed it to dim 0
        cache["k"][cache["stage"]] = torch.cat([cache["k"][cache["stage"]][:-1], k], 0)
        cache["v"][cache["stage"]] = torch.cat([cache["v"][cache["stage"]][:-1], v], 0)
        src_len = cache["k"][cache["stage"]].shape[0]
        k = cache["k"][cache["stage"]]
        v = cache["v"][cache["stage"]]
    cache["stage"] = (cache["stage"] + 1) % cache["all_stage"]
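For reference, a sketch of the cache layout this function reads and writes, inferred from the code above; the layer count is hypothetical, and the real dict is built by the caller in the AR decoder:

num_layers = 24  # hypothetical; one k/v slot per attention layer
cache = {
    "all_stage": num_layers,   # total number of slots; "stage" rotates modulo this
    "stage": 0,                # index of the layer whose slot the next call uses
    "first_infer": 1,          # 1 on the prompt pass (slots written), 0 while decoding (slots appended to)
    "k": [None] * num_layers,  # cached key projections, one tensor per layer
    "v": [None] * num_layers,  # cached value projections, one tensor per layer
}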
SYMBOL INDEX (1694 symbols across 136 files)
FILE: GPT_SoVITS/AR/data/bucket_sampler.py
class DistributedBucketSampler (line 20) | class DistributedBucketSampler(Sampler[T_co]):
method __init__ (line 29) | def __init__(
method _get_sample_lengths (line 76) | def _get_sample_lengths(self):
method make_buckets (line 83) | def make_buckets(self, bucket_width: float = 2.0):
method __iter__ (line 98) | def __iter__(self) -> Iterator[T_co]:
method __len__ (line 137) | def __len__(self) -> int:
method set_epoch (line 140) | def set_epoch(self, epoch: int) -> None:
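
DistributedBucketSampler, per the index above, batches samples of similar length to cut padding waste. A minimal sketch of the bucketing idea; the names sample_lengths and bucket_width below are illustrative assumptions, not the repo's exact logic:

import random
from typing import Dict, List

def make_buckets(sample_lengths: List[float], bucket_width: float = 2.0) -> List[List[int]]:
    # Bin sample indices by length so every batch drawn from one bucket
    # needs little padding.
    buckets: Dict[int, List[int]] = {}
    for idx, length in enumerate(sample_lengths):
        buckets.setdefault(int(length // bucket_width), []).append(idx)
    return [buckets[k] for k in sorted(buckets)]

# Usage: shuffle within each bucket, then draw fixed-size batches from it.
for bucket in make_buckets([1.2, 1.4, 3.1, 3.3, 7.8]):
    random.shuffle(bucket)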
FILE: GPT_SoVITS/AR/data/data_module.py
class Text2SemanticDataModule (line 10) | class Text2SemanticDataModule(LightningDataModule):
method __init__ (line 11) | def __init__(
method prepare_data (line 27) | def prepare_data(self):
method setup (line 30) | def setup(self, stage=None, output_logs=False):
method train_dataloader (line 45) | def train_dataloader(self):
method val_dataloader (line 63) | def val_dataloader(self):
method test_dataloader (line 75) | def test_dataloader(self):
FILE: GPT_SoVITS/AR/data/dataset.py
function batch_sequences (line 21) | def batch_sequences(sequences: List[np.array], axis: int = 0, pad_value:...
class Text2SemanticDataset (line 40) | class Text2SemanticDataset(Dataset):
method __init__ (line 43) | def __init__(
method init_batch (line 115) | def init_batch(self):
method __get_item_names__ (line 208) | def __get_item_names__(self) -> List[str]:
method __len__ (line 211) | def __len__(self) -> int:
method __getitem__ (line 214) | def __getitem__(self, idx: int) -> Dict:
method get_sample_length (line 241) | def get_sample_length(self, idx: int):
method collate (line 246) | def collate(self, examples: List[Dict]) -> Dict:
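
batch_sequences in dataset.py pads variable-length arrays into one batch array; a hedged sketch of that padding pattern (right-padding along the first axis is an assumption here, while the real function takes an axis argument):

import numpy as np
from typing import List

def pad_batch(sequences: List[np.ndarray], pad_value: float = 0.0) -> np.ndarray:
    # Right-pad every sequence to the longest length in the batch.
    max_len = max(seq.shape[0] for seq in sequences)
    out = np.full((len(sequences), max_len) + sequences[0].shape[1:],
                  pad_value, dtype=sequences[0].dtype)
    for i, seq in enumerate(sequences):
        out[i, : seq.shape[0]] = seq
    return out

pad_batch([np.ones(3), np.ones(5)]).shape  # -> (2, 5)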
FILE: GPT_SoVITS/AR/models/t2s_lightning_module.py
class Text2SemanticLightningModule (line 18) | class Text2SemanticLightningModule(LightningModule):
method __init__ (line 19) | def __init__(self, config, output_dir, is_train=True):
method training_step (line 42) | def training_step(self, batch: Dict, batch_idx: int):
method validation_step (line 83) | def validation_step(self, batch: Dict, batch_idx: int):
method configure_optimizers (line 120) | def configure_optimizers(self):
FILE: GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py
class Text2SemanticLightningModule (line 18) | class Text2SemanticLightningModule(LightningModule):
method __init__ (line 19) | def __init__(self, config, output_dir, is_train=True):
method training_step (line 41) | def training_step(self, batch: Dict, batch_idx: int):
method validation_step (line 81) | def validation_step(self, batch: Dict, batch_idx: int):
method configure_optimizers (line 84) | def configure_optimizers(self):
FILE: GPT_SoVITS/AR/models/t2s_model.py
function scaled_dot_product_attention (line 39) | def scaled_dot_product_attention(
class T2SMLP (line 74) | class T2SMLP:
method __init__ (line 75) | def __init__(self, w1, b1, w2, b2):
method forward (line 81) | def forward(self, x):
class T2SBlock (line 88) | class T2SBlock:
method __init__ (line 89) | def __init__(
method to_mask (line 122) | def to_mask(
method process_prompt (line 135) | def process_prompt(
method decode_next_token (line 176) | def decode_next_token(
class T2STransformer (line 225) | class T2STransformer:
method __init__ (line 226) | def __init__(self, num_blocks: int, blocks: List[T2SBlock]):
method process_prompt (line 230) | def process_prompt(
method decode_next_token (line 245) | def decode_next_token(
class Text2SemanticDecoder (line 260) | class Text2SemanticDecoder(nn.Module):
method __init__ (line 261) | def __init__(self, config, norm_first=False, top_k=3):
method make_input_data (line 355) | def make_input_data(self, x, x_lens, y, y_lens, bert_feature):
method forward (line 408) | def forward(self, x, x_lens, y, y_lens, bert_feature):
method forward_old (line 450) | def forward_old(self, x, x_lens, y, y_lens, bert_feature):
method infer (line 513) | def infer(
method pad_y_eos (line 578) | def pad_y_eos(self, y, y_mask_int, eos_id):
method infer_panel_batch_infer (line 583) | def infer_panel_batch_infer(
method infer_panel_naive_batched (line 783) | def infer_panel_naive_batched(
method infer_panel_naive (line 816) | def infer_panel_naive(
method infer_panel (line 966) | def infer_panel(
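
t2s_model.py opens with a scaled_dot_product_attention helper and, per the T2SBlock/T2STransformer entries, builds its decode path (process_prompt once, then repeated decode_next_token against a KV cache) on top of it. A sketch of the standard SDPA formulation, which may differ from the repo's exact masking conventions:

import math
import torch

def sdpa(q, k, v, mask=None):
    # softmax(Q K^T / sqrt(d)) V, with masked positions forced to -inf
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v

out = sdpa(torch.randn(1, 4, 8), torch.randn(1, 4, 8), torch.randn(1, 4, 8))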
FILE: GPT_SoVITS/AR/models/t2s_model_onnx.py
function logits_to_probs (line 26) | def logits_to_probs(
function multinomial_sample_one_no_sync (line 74) | def multinomial_sample_one_no_sync(
function sample (line 81) | def sample(
class OnnxEncoder (line 95) | class OnnxEncoder(nn.Module):
method __init__ (line 96) | def __init__(self, ar_text_embedding, bert_proj, ar_text_position):
method forward (line 102) | def forward(self, x, bert_feature):
class T2SFirstStageDecoder (line 108) | class T2SFirstStageDecoder(nn.Module):
method __init__ (line 109) | def __init__(
method forward (line 132) | def forward(self, x, prompt):
class T2SStageDecoder (line 189) | class T2SStageDecoder(nn.Module):
method __init__ (line 190) | def __init__(
method forward (line 213) | def forward(self, y, k, v, y_emb, x_example):
class Text2SemanticDecoder (line 249) | class Text2SemanticDecoder(nn.Module):
method __init__ (line 250) | def __init__(self, config, norm_first=False, top_k=3):
method init_onnx (line 292) | def init_onnx(self):
method forward (line 317) | def forward(self, x, prompts, bert_feature):
method infer (line 337) | def infer(self, x, prompts, bert_feature):
FILE: GPT_SoVITS/AR/models/utils.py
function sequence_mask (line 9) | def sequence_mask(length, max_length=None):
function make_pad_mask (line 16) | def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
function make_pad_mask_left (line 44) | def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch...
function top_k_top_p_filtering (line 78) | def top_k_top_p_filtering(
function topk_sampling (line 119) | def topk_sampling(logits, top_k=10, top_p=1.0, temperature=1.0):
function multinomial_sample_one_no_sync (line 140) | def multinomial_sample_one_no_sync(
function logits_to_probs (line 147) | def logits_to_probs(
function sample (line 192) | def sample(
function dpo_loss (line 202) | def dpo_loss(
function get_batch_logps (line 225) | def get_batch_logps(
function make_reject_y (line 244) | def make_reject_y(y_o, y_lens):
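
top_k_top_p_filtering and topk_sampling above implement the usual nucleus-sampling pipeline; a hedged sketch of the standard filter (the repo's defaults and in-place details may differ):

import torch

def top_k_top_p(logits, top_k=0, top_p=1.0, filter_value=float("-inf")):
    if top_k > 0:
        kth = torch.topk(logits, top_k).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, filter_value)   # keep top-k only
    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()   # always keep the top token
        remove[..., 0] = False
        logits = logits.masked_fill(remove.scatter(-1, sorted_idx, remove), filter_value)
    return logits

probs = torch.softmax(top_k_top_p(torch.randn(1, 100), top_k=10, top_p=0.9), dim=-1)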
FILE: GPT_SoVITS/AR/modules/activation.py
class MultiheadAttention (line 17) | class MultiheadAttention(Module):
method __init__ (line 78) | def __init__(
method _reset_parameters (line 180) | def _reset_parameters(self):
method __setstate__ (line 197) | def __setstate__(self, state):
method forward (line 204) | def forward(
FILE: GPT_SoVITS/AR/modules/activation_onnx.py
class MultiheadAttention (line 14) | class MultiheadAttention(Module):
method __init__ (line 19) | def __init__(
method _reset_parameters (line 130) | def _reset_parameters(self):
method __setstate__ (line 147) | def __setstate__(self, state):
method forward (line 154) | def forward(
FILE: GPT_SoVITS/AR/modules/embedding.py
class TokenEmbedding (line 8) | class TokenEmbedding(nn.Module):
method __init__ (line 9) | def __init__(
method weight (line 24) | def weight(self) -> torch.Tensor:
method embedding (line 27) | def embedding(self, index: int) -> torch.Tensor:
method forward (line 30) | def forward(self, x: torch.Tensor):
class SinePositionalEmbedding (line 36) | class SinePositionalEmbedding(nn.Module):
method __init__ (line 37) | def __init__(
method extend_pe (line 54) | def extend_pe(self, x):
method forward (line 74) | def forward(self, x: torch.Tensor) -> torch.Tensor:
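
SinePositionalEmbedding follows the fixed sinusoidal scheme from the original Transformer; a minimal sketch (any scale/alpha/dropout handling in the repo's class is omitted here):

import math
import torch

def sine_pe(max_len: int, d_model: int) -> torch.Tensor:
    pos = torch.arange(max_len).float().unsqueeze(1)                  # (T, 1)
    div = torch.exp(torch.arange(0, d_model, 2).float()
                    * (-math.log(10000.0) / d_model))                 # (D/2,)
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(pos * div)
    pe[:, 1::2] = torch.cos(pos * div)
    return pe

x = torch.randn(2, 50, 64) + sine_pe(50, 64)   # broadcast over the batch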
FILE: GPT_SoVITS/AR/modules/embedding_onnx.py
class TokenEmbedding (line 8) | class TokenEmbedding(nn.Module):
method __init__ (line 9) | def __init__(
method weight (line 24) | def weight(self) -> torch.Tensor:
method embedding (line 27) | def embedding(self, index: int) -> torch.Tensor:
method forward (line 30) | def forward(self, x: torch.Tensor):
class SinePositionalEmbedding (line 36) | class SinePositionalEmbedding(nn.Module):
method __init__ (line 37) | def __init__(
method extend_pe (line 52) | def extend_pe(self, x):
method forward (line 59) | def forward(self, x: torch.Tensor) -> torch.Tensor:
FILE: GPT_SoVITS/AR/modules/lr_schedulers.py
class WarmupCosineLRSchedule (line 11) | class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
method __init__ (line 16) | def __init__(
method set_lr (line 38) | def set_lr(self, lr):
method step (line 44) | def step(self):
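
WarmupCosineLRSchedule, by its name, ramps the learning rate up linearly and then decays it along a cosine curve; a hedged functional sketch where the peak/end parameters are illustrative:

import math

def warmup_cosine_lr(step, warmup_steps, total_steps, peak_lr, end_lr=0.0):
    if step < warmup_steps:
        return peak_lr * step / max(1, warmup_steps)       # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(progress, 1.0)))
    return end_lr + (peak_lr - end_lr) * cosine            # cosine decay

lrs = [warmup_cosine_lr(s, 100, 1000, 1e-3) for s in range(1000)]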
FILE: GPT_SoVITS/AR/modules/optim.py
class BatchedOptimizer (line 26) | class BatchedOptimizer(Optimizer):
method __init__ (line 37) | def __init__(self, params, defaults):
method batched_params (line 41) | def batched_params(self, param_group, group_params_names):
class ScaledAdam (line 113) | class ScaledAdam(BatchedOptimizer):
method __init__ (line 156) | def __init__(
method __setstate__ (line 193) | def __setstate__(self, state):
method step (line 197) | def step(self, closure=None):
method _init_state (line 236) | def _init_state(self, group: dict, p: Tensor, state: dict):
method _get_clipping_scale (line 279) | def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, ...
method _show_gradient_dominating_parameter (line 363) | def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tenso...
method _step_one_batch (line 426) | def _step_one_batch(self, group: dict, p: Tensor, state: dict, clippin...
method _size_update (line 470) | def _size_update(
method _step (line 532) | def _step(self, group: dict, p: Tensor, state: dict):
method _step_scalar (line 571) | def _step_scalar(self, group: dict, p: Tensor, state: dict):
FILE: GPT_SoVITS/AR/modules/patched_mha_with_cache.py
function multi_head_attention_forward_patched (line 13) | def multi_head_attention_forward_patched(
FILE: GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py
function multi_head_attention_forward_patched (line 7) | def multi_head_attention_forward_patched(
FILE: GPT_SoVITS/AR/modules/scaling.py
class DoubleSwishFunction (line 25) | class DoubleSwishFunction(torch.autograd.Function):
method forward (line 42) | def forward(ctx, x: Tensor) -> Tensor:
method backward (line 73) | def backward(ctx, y_grad: Tensor) -> Tensor:
class DoubleSwish (line 82) | class DoubleSwish(torch.nn.Module):
method forward (line 83) | def forward(self, x: Tensor) -> Tensor:
class ActivationBalancerFunction (line 92) | class ActivationBalancerFunction(torch.autograd.Function):
method forward (line 94) | def forward(
method backward (line 112) | def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None]:
function _compute_scale_factor (line 133) | def _compute_scale_factor(
function _compute_sign_factor (line 158) | def _compute_sign_factor(
class ActivationBalancer (line 191) | class ActivationBalancer(torch.nn.Module):
method __init__ (line 231) | def __init__(
method forward (line 262) | def forward(self, x: Tensor) -> Tensor:
function BalancedDoubleSwish (line 312) | def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, min_prob=...
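
DoubleSwish comes from the k2/icefall scaling utilities, where the activation is x * sigmoid(x - 1); DoubleSwishFunction only adds a memory-saving custom backward. A plain sketch:

import torch

def double_swish(x: torch.Tensor) -> torch.Tensor:
    # numerically the same activation as the autograd.Function version,
    # just without the recomputation trick
    return x * torch.sigmoid(x - 1.0)

y = double_swish(torch.linspace(-3, 3, 7))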
FILE: GPT_SoVITS/AR/modules/transformer.py
class LayerNorm (line 22) | class LayerNorm(nn.Module):
method __init__ (line 28) | def __init__(
method reset_parameters (line 53) | def reset_parameters(self) -> None:
method forward (line 58) | def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
method extra_repr (line 75) | def extra_repr(self) -> str:
class IdentityNorm (line 79) | class IdentityNorm(nn.Module):
method __init__ (line 80) | def __init__(
method forward (line 89) | def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
class TransformerEncoder (line 97) | class TransformerEncoder(nn.Module):
method __init__ (line 118) | def __init__(self, encoder_layer, num_layers, norm=None):
method forward (line 124) | def forward(
class TransformerEncoderLayer (line 175) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 178) | def __init__(
method __setstate__ (line 252) | def __setstate__(self, state):
method forward (line 257) | def forward(
method _sa_block (line 305) | def _sa_block(
method _ff_block (line 328) | def _ff_block(self, x: Tensor) -> Tensor:
class AdaptiveLayerNorm (line 333) | class AdaptiveLayerNorm(nn.Module):
method __init__ (line 336) | def __init__(self, d_model, norm) -> None:
method forward (line 343) | def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
function _get_clones (line 361) | def _get_clones(module, N):
FILE: GPT_SoVITS/AR/modules/transformer_onnx.py
class LayerNorm (line 22) | class LayerNorm(nn.Module):
method __init__ (line 28) | def __init__(
method reset_parameters (line 53) | def reset_parameters(self) -> None:
method forward (line 58) | def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
method extra_repr (line 75) | def extra_repr(self) -> str:
class IdentityNorm (line 79) | class IdentityNorm(nn.Module):
method __init__ (line 80) | def __init__(
method forward (line 89) | def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
class TransformerEncoder (line 97) | class TransformerEncoder(nn.Module):
method __init__ (line 118) | def __init__(self, encoder_layer, num_layers, norm=None):
method forward (line 124) | def forward(
class TransformerEncoderLayer (line 147) | class TransformerEncoderLayer(nn.Module):
method __init__ (line 150) | def __init__(
method __setstate__ (line 207) | def __setstate__(self, state):
method forward (line 212) | def forward(
method _sa_block (line 229) | def _sa_block(
method _ff_block (line 247) | def _ff_block(self, x: Tensor) -> Tensor:
class AdaptiveLayerNorm (line 252) | class AdaptiveLayerNorm(nn.Module):
method __init__ (line 255) | def __init__(self, d_model, norm) -> None:
method forward (line 262) | def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
function _get_clones (line 280) | def _get_clones(module, N):
FILE: GPT_SoVITS/AR/text_processing/phonemizer.py
class GruutPhonemizer (line 15) | class GruutPhonemizer:
method __init__ (line 16) | def __init__(self, language: str):
method _normalize_punctuation (line 35) | def _normalize_punctuation(self, text: str) -> str:
method _convert_punctuation (line 41) | def _convert_punctuation(self, word: Word) -> str:
method phonemize (line 52) | def phonemize(self, text: str, espeak: bool = False) -> str:
method transform (line 58) | def transform(self, phonemes):
FILE: GPT_SoVITS/AR/utils/__init__.py
function str2bool (line 4) | def str2bool(str):
function get_newest_ckpt (line 8) | def get_newest_ckpt(string_list):
function check_txt_file (line 28) | def check_txt_file(file_path):
FILE: GPT_SoVITS/AR/utils/initialize.py
function initialize (line 8) | def initialize(model: torch.nn.Module, init: str):
FILE: GPT_SoVITS/AR/utils/io.py
function load_yaml_config (line 7) | def load_yaml_config(path):
function save_config_to_yaml (line 13) | def save_config_to_yaml(config, path):
function write_args (line 20) | def write_args(args, path):
FILE: GPT_SoVITS/BigVGAN/activations.py
class Snake (line 9) | class Snake(nn.Module):
method __init__ (line 26) | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha...
method forward (line 49) | def forward(self, x):
class SnakeBeta (line 63) | class SnakeBeta(nn.Module):
method __init__ (line 81) | def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha...
method forward (line 109) | def forward(self, x):
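
Snake is BigVGAN's periodic activation, usually written x + (1/alpha) * sin^2(alpha * x); SnakeBeta gives the sin^2 term its own magnitude parameter. A functional sketch (per-channel alpha handling and any log-scale option are omitted):

import torch

def snake(x: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
    # alpha sets the frequency of the periodic component; a small epsilon
    # guards the division as alpha -> 0 (which recovers the identity).
    return x + (1.0 / (alpha + 1e-9)) * torch.sin(alpha * x) ** 2

y = snake(torch.randn(8), torch.tensor(1.0))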
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py
class FusedAntiAliasActivation (line 14) | class FusedAntiAliasActivation(torch.autograd.Function):
method forward (line 22) | def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
method backward (line 28) | def backward(ctx, output_grads):
class Activation1d (line 33) | class Activation1d(nn.Module):
method __init__ (line 34) | def __init__(
method forward (line 52) | def forward(self, x):
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp
function PYBIND11_MODULE (line 21) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py
function load (line 17) | def load():
function _get_cuda_bare_metal_version (line 66) | def _get_cuda_bare_metal_version(cuda_dir):
function _create_build_dir (line 77) | def _create_build_dir(buildpath):
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py
class Activation1d (line 8) | class Activation1d(nn.Module):
method __init__ (line 9) | def __init__(
method forward (line 25) | def forward(self, x):
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py
function sinc (line 15) | def sinc(x: torch.Tensor):
function kaiser_sinc_filter1d (line 30) | def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return fil...
class LowPassFilter1d (line 63) | class LowPassFilter1d(nn.Module):
method __init__ (line 64) | def __init__(
method forward (line 92) | def forward(self, x):
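
kaiser_sinc_filter1d builds a windowed-sinc low-pass kernel, so the module defines its own normalized sinc; a sketch with the x = 0 singularity patched:

import math
import torch

def sinc(x: torch.Tensor) -> torch.Tensor:
    # sin(pi x) / (pi x), with sinc(0) = 1 selected explicitly
    return torch.where(x == 0, torch.ones_like(x),
                       torch.sin(math.pi * x) / (math.pi * x))

# The low-pass kernel is then roughly 2*cutoff*sinc(2*cutoff*n) * kaiser_window(n).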
FILE: GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py
class UpSample1d (line 10) | class UpSample1d(nn.Module):
method __init__ (line 11) | def __init__(self, ratio=2, kernel_size=None):
method forward (line 23) | def forward(self, x):
class DownSample1d (line 33) | class DownSample1d(nn.Module):
method __init__ (line 34) | def __init__(self, ratio=2, kernel_size=None):
method forward (line 45) | def forward(self, x):
FILE: GPT_SoVITS/BigVGAN/bigvgan.py
function load_hparams_from_json (line 25) | def load_hparams_from_json(path) -> AttrDict:
class AMPBlock1 (line 31) | class AMPBlock1(torch.nn.Module):
method __init__ (line 44) | def __init__(
method forward (line 122) | def forward(self, x):
method remove_weight_norm (line 133) | def remove_weight_norm(self):
class AMPBlock2 (line 140) | class AMPBlock2(torch.nn.Module):
method __init__ (line 153) | def __init__(
method forward (line 214) | def forward(self, x):
method remove_weight_norm (line 221) | def remove_weight_norm(self):
class BigVGAN (line 226) | class BigVGAN(
method __init__ (line 249) | def __init__(self, h: AttrDict, use_cuda_kernel: bool = False):
method forward (line 329) | def forward(self, x):
method remove_weight_norm (line 357) | def remove_weight_norm(self):
method _save_pretrained (line 372) | def _save_pretrained(self, save_directory: Path) -> None:
method _from_pretrained (line 383) | def _from_pretrained(
FILE: GPT_SoVITS/BigVGAN/discriminators.py
class DiscriminatorP (line 21) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 22) | def __init__(
method forward (line 86) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.T...
class MultiPeriodDiscriminator (line 108) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 109) | def __init__(self, h: AttrDict):
method forward (line 117) | def forward(
class DiscriminatorR (line 140) | class DiscriminatorR(nn.Module):
method __init__ (line 141) | def __init__(self, cfg: AttrDict, resolution: List[List[int]]):
method forward (line 199) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.T...
method spectrogram (line 214) | def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
class MultiResolutionDiscriminator (line 236) | class MultiResolutionDiscriminator(nn.Module):
method __init__ (line 237) | def __init__(self, cfg, debug=False):
method forward (line 245) | def forward(
class DiscriminatorB (line 272) | class DiscriminatorB(nn.Module):
method __init__ (line 273) | def __init__(
method spectrogram (line 311) | def spectrogram(self, x: torch.Tensor) -> List[torch.Tensor]:
method forward (line 323) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.T...
class MultiBandDiscriminator (line 346) | class MultiBandDiscriminator(nn.Module):
method __init__ (line 347) | def __init__(
method forward (line 360) | def forward(
class DiscriminatorCQT (line 386) | class DiscriminatorCQT(nn.Module):
method __init__ (line 387) | def __init__(self, cfg: AttrDict, hop_length: int, n_octaves: int, bin...
method get_2d_padding (line 488) | def get_2d_padding(
method forward (line 498) | def forward(self, x: torch.tensor) -> Tuple[torch.Tensor, List[torch.T...
class MultiScaleSubbandCQTDiscriminator (line 542) | class MultiScaleSubbandCQTDiscriminator(nn.Module):
method __init__ (line 543) | def __init__(self, cfg: AttrDict):
method forward (line 571) | def forward(
class CombinedDiscriminator (line 595) | class CombinedDiscriminator(nn.Module):
method __init__ (line 601) | def __init__(self, list_discriminator: List[nn.Module]):
method forward (line 605) | def forward(
FILE: GPT_SoVITS/BigVGAN/env.py
class AttrDict (line 8) | class AttrDict(dict):
method __init__ (line 9) | def __init__(self, *args, **kwargs):
function build_env (line 14) | def build_env(config, config_name, path):
FILE: GPT_SoVITS/BigVGAN/inference.py
function inference (line 23) | def inference(a, h):
function main (line 54) | def main():
FILE: GPT_SoVITS/BigVGAN/inference_e2e.py
function load_checkpoint (line 22) | def load_checkpoint(filepath, device):
function scan_checkpoint (line 30) | def scan_checkpoint(cp_dir, prefix):
function inference (line 38) | def inference(a, h):
function main (line 69) | def main():
FILE: GPT_SoVITS/BigVGAN/loss.py
class MultiScaleMelSpectrogramLoss (line 22) | class MultiScaleMelSpectrogramLoss(nn.Module):
method __init__ (line 51) | def __init__(
method get_window (line 96) | def get_window(
method get_mel_filters (line 104) | def get_mel_filters(sr, n_fft, n_mels, fmin, fmax):
method mel_spectrogram (line 107) | def mel_spectrogram(
method forward (line 162) | def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
function feature_loss (line 203) | def feature_loss(fmap_r: List[List[torch.Tensor]], fmap_g: List[List[tor...
function discriminator_loss (line 212) | def discriminator_loss(
function generator_loss (line 228) | def generator_loss(
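
discriminator_loss and generator_loss above follow the HiFi-GAN least-squares GAN convention (real -> 1, generated -> 0); a hedged sketch of that convention, not necessarily the repo's exact return values:

import torch

def discriminator_loss(real_outputs, generated_outputs):
    loss = 0.0
    for d_real, d_gen in zip(real_outputs, generated_outputs):
        loss = loss + torch.mean((1.0 - d_real) ** 2) + torch.mean(d_gen ** 2)
    return loss

def generator_loss(generated_outputs):
    # the generator wants the discriminator to score its output as real (1)
    return sum(torch.mean((1.0 - d_gen) ** 2) for d_gen in generated_outputs)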
FILE: GPT_SoVITS/BigVGAN/meldataset.py
function dynamic_range_compression (line 23) | def dynamic_range_compression(x, C=1, clip_val=1e-5):
function dynamic_range_decompression (line 27) | def dynamic_range_decompression(x, C=1):
function dynamic_range_compression_torch (line 31) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
function dynamic_range_decompression_torch (line 35) | def dynamic_range_decompression_torch(x, C=1):
function spectral_normalize_torch (line 39) | def spectral_normalize_torch(magnitudes):
function spectral_de_normalize_torch (line 43) | def spectral_de_normalize_torch(magnitudes):
function mel_spectrogram (line 51) | def mel_spectrogram(
function get_mel_spectrogram (line 119) | def get_mel_spectrogram(wav, h):
function get_dataset_filelist (line 142) | def get_dataset_filelist(a):
class MelDataset (line 172) | class MelDataset(torch.utils.data.Dataset):
method __init__ (line 173) | def __init__(
method __getitem__ (line 222) | def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor,...
method __len__ (line 369) | def __len__(self):
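
The dynamic-range helpers here are the standard HiFi-GAN log-compression pair; the torch variants are conventionally exactly this:

import torch

def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)   # log-magnitude mel

def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C                              # exact inverse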
FILE: GPT_SoVITS/BigVGAN/tests/test_activation.py
function test_load_fused_kernels (line 16) | def test_load_fused_kernels():
function test_anti_alias_activation (line 24) | def test_anti_alias_activation():
FILE: GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py
function test_load_fused_kernels (line 16) | def test_load_fused_kernels():
function test_anti_alias_activation (line 24) | def test_anti_alias_activation():
FILE: GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py
function generate_soundwave (line 29) | def generate_soundwave(duration=5.0, sr=24000):
function get_mel (line 44) | def get_mel(x, h):
function load_checkpoint (line 48) | def load_checkpoint(filepath, device):
FILE: GPT_SoVITS/BigVGAN/train.py
function train (line 56) | def train(rank, a, h):
function main (line 611) | def main():
FILE: GPT_SoVITS/BigVGAN/utils0.py
function plot_spectrogram (line 16) | def plot_spectrogram(spectrogram):
function plot_spectrogram_clipped (line 27) | def plot_spectrogram_clipped(spectrogram, clip_max=2.0):
function init_weights (line 45) | def init_weights(m, mean=0.0, std=0.01):
function apply_weight_norm (line 51) | def apply_weight_norm(m):
function get_padding (line 57) | def get_padding(kernel_size, dilation=1):
function load_checkpoint (line 61) | def load_checkpoint(filepath, device):
function save_checkpoint (line 69) | def save_checkpoint(filepath, obj):
function scan_checkpoint (line 75) | def scan_checkpoint(cp_dir, prefix, renamed_file=None):
function save_audio (line 95) | def save_audio(audio, path, sr):
FILE: GPT_SoVITS/TTS_infer_pack/TTS.py
function resample (line 42) | def resample(audio_tensor, sr0, sr1, device):
function norm_spec (line 59) | def norm_spec(x):
function denorm_spec (line 63) | def denorm_spec(x):
function speed_change (line 96) | def speed_change(input_audio: np.ndarray, speed: float, sr: int):
class DictToAttrRecursive (line 117) | class DictToAttrRecursive(dict):
method __init__ (line 118) | def __init__(self, input_dict):
method __getattr__ (line 126) | def __getattr__(self, item):
method __setattr__ (line 132) | def __setattr__(self, key, value):
method __delattr__ (line 138) | def __delattr__(self, item):
class NO_PROMPT_ERROR (line 145) | class NO_PROMPT_ERROR(Exception):
function set_seed (line 194) | def set_seed(seed: int):
class TTS_Config (line 217) | class TTS_Config:
method __init__ (line 299) | def __init__(self, configs: Union[dict, str] = None):
method _load_configs (line 366) | def _load_configs(self, configs_path: str) -> dict:
method save_configs (line 377) | def save_configs(self, configs_path: str = None) -> None:
method update_configs (line 387) | def update_configs(self):
method update_version (line 399) | def update_version(self, version: str) -> None:
method __str__ (line 403) | def __str__(self):
method __repr__ (line 411) | def __repr__(self):
method __hash__ (line 414) | def __hash__(self):
method __eq__ (line 417) | def __eq__(self, other):
class TTS (line 421) | class TTS:
method __init__ (line 422) | def __init__(self, configs: Union[dict, str, TTS_Config]):
method _init_models (line 467) | def _init_models(
method init_cnhuhbert_weights (line 476) | def init_cnhuhbert_weights(self, base_path: str):
method init_bert_weights (line 484) | def init_bert_weights(self, base_path: str):
method init_vits_weights (line 493) | def init_vits_weights(self, weights_path: str):
method init_t2s_weights (line 594) | def init_t2s_weights(self, weights_path: str):
method init_vocoder (line 615) | def init_vocoder(self, version: str):
method init_sr_model (line 676) | def init_sr_model(self):
method init_sv_model (line 686) | def init_sv_model(self):
method enable_half_precision (line 691) | def enable_half_precision(self, enable: bool = True, save: bool = True):
method set_device (line 729) | def set_device(self, device: torch.device, save: bool = True):
method set_ref_audio (line 751) | def set_ref_audio(self, ref_audio_path: str):
method _set_ref_audio_path (line 762) | def _set_ref_audio_path(self, ref_audio_path):
method _set_ref_spec (line 765) | def _set_ref_spec(self, ref_audio_path):
method _get_ref_spec (line 772) | def _get_ref_spec(self, ref_audio_path):
method _set_prompt_semantic (line 809) | def _set_prompt_semantic(self, ref_wav_path: str):
method batch_sequences (line 835) | def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0...
method to_batch (line 856) | def to_batch(
method recovery_order (line 971) | def recovery_order(self, data: list, batch_index_list: list) -> list:
method stop (line 989) | def stop(
method run (line 998) | def run(self, inputs: dict):
method empty_cache (line 1531) | def empty_cache(self):
method audio_postprocess (line 1541) | def audio_postprocess(
method using_vocoder_synthesis (line 1601) | def using_vocoder_synthesis(
method using_vocoder_synthesis_batched_infer (line 1666) | def using_vocoder_synthesis_batched_infer(
method sola_algorithm (line 1781) | def sola_algorithm(
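
TTS_Config plus TTS is the programmatic inference entry point, with run() implemented as a generator. A hedged usage sketch; the config path and request keys below are assumptions drawn from this index, so treat api_v2.py as the authoritative reference:

from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config

config = TTS_Config("GPT_SoVITS/configs/tts_infer.yaml")  # assumed config location
pipeline = TTS(config)

request = {                      # illustrative subset of run()'s input dict
    "text": "Hello world.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",
    "prompt_text": "reference transcript",
    "prompt_lang": "en",
}
for sample_rate, audio in pipeline.run(request):
    ...  # each yielded chunk is (sr, numpy audio)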
FILE: GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
function get_first (line 28) | def get_first(text: str) -> str:
function merge_short_text_in_array (line 34) | def merge_short_text_in_array(texts: str, threshold: int) -> list:
class TextPreprocessor (line 52) | class TextPreprocessor:
method __init__ (line 53) | def __init__(self, bert_model: AutoModelForMaskedLM, tokenizer: AutoTo...
method preprocess (line 59) | def preprocess(self, text: str, lang: str, text_split_method: str, ver...
method pre_seg_text (line 77) | def pre_seg_text(self, text: str, lang: str, text_split_method: str):
method segment_and_extract_feature_for_text (line 117) | def segment_and_extract_feature_for_text(
method get_phones_and_bert (line 122) | def get_phones_and_bert(self, text: str, language: str, version: str, ...
method get_bert_feature (line 191) | def get_bert_feature(self, text: str, word2ph: list) -> torch.Tensor:
method clean_text_inf (line 206) | def clean_text_inf(self, text: str, language: str, version: str = "v2"):
method get_bert_inf (line 212) | def get_bert_inf(self, phones: list, word2ph: list, norm_text: str, la...
method filter_text (line 224) | def filter_text(self, texts):
method replace_consecutive_punctuation (line 235) | def replace_consecutive_punctuation(self, text):
FILE: GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
function get_method (line 8) | def get_method(name: str) -> Callable:
function get_method_names (line 15) | def get_method_names() -> list:
function register_method (line 19) | def register_method(name):
function split_big_text (line 44) | def split_big_text(text, max_len=510):
function split (line 70) | def split(todo_text):
function cut0 (line 91) | def cut0(inp):
function cut1 (line 100) | def cut1(inp):
function cut2 (line 117) | def cut2(inp):
function cut3 (line 144) | def cut3(inp):
function cut4 (line 153) | def cut4(inp):
function cut5 (line 163) | def cut5(inp):
FILE: GPT_SoVITS/eres2net/ERes2Net.py
class ReLU (line 19) | class ReLU(nn.Hardtanh):
method __init__ (line 20) | def __init__(self, inplace=False):
method __repr__ (line 23) | def __repr__(self):
class BasicBlockERes2Net (line 28) | class BasicBlockERes2Net(nn.Module):
method __init__ (line 31) | def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
method forward (line 59) | def forward(self, x):
class BasicBlockERes2Net_diff_AFF (line 88) | class BasicBlockERes2Net_diff_AFF(nn.Module):
method __init__ (line 91) | def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
method forward (line 124) | def forward(self, x):
class ERes2Net (line 154) | class ERes2Net(nn.Module):
method __init__ (line 155) | def __init__(
method _make_layer (line 206) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 214) | def forward(self, x):
method forward3 (line 239) | def forward3(self, x):
FILE: GPT_SoVITS/eres2net/ERes2NetV2.py
class ReLU (line 19) | class ReLU(nn.Hardtanh):
method __init__ (line 20) | def __init__(self, inplace=False):
method __repr__ (line 23) | def __repr__(self):
class BasicBlockERes2NetV2 (line 28) | class BasicBlockERes2NetV2(nn.Module):
method __init__ (line 29) | def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2,...
method forward (line 58) | def forward(self, x):
class BasicBlockERes2NetV2AFF (line 87) | class BasicBlockERes2NetV2AFF(nn.Module):
method __init__ (line 88) | def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2,...
method forward (line 122) | def forward(self, x):
class ERes2NetV2 (line 152) | class ERes2NetV2(nn.Module):
method __init__ (line 153) | def __init__(
method _make_layer (line 207) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 219) | def forward(self, x):
method forward3 (line 240) | def forward3(self, x):
FILE: GPT_SoVITS/eres2net/ERes2Net_huge.py
class ReLU (line 20) | class ReLU(nn.Hardtanh):
method __init__ (line 21) | def __init__(self, inplace=False):
method __repr__ (line 24) | def __repr__(self):
class BasicBlockERes2Net (line 29) | class BasicBlockERes2Net(nn.Module):
method __init__ (line 32) | def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
method forward (line 60) | def forward(self, x):
class BasicBlockERes2Net_diff_AFF (line 89) | class BasicBlockERes2Net_diff_AFF(nn.Module):
method __init__ (line 92) | def __init__(self, in_planes, planes, stride=1, baseWidth=24, scale=3):
method forward (line 125) | def forward(self, x):
class ERes2Net (line 155) | class ERes2Net(nn.Module):
method __init__ (line 156) | def __init__(
method _make_layer (line 206) | def _make_layer(self, block, planes, num_blocks, stride):
method forward (line 214) | def forward(self, x):
method forward2 (line 240) | def forward2(self, x, if_mean):
method forward3 (line 271) | def forward3(self, x):
FILE: GPT_SoVITS/eres2net/fusion.py
class AFF (line 8) | class AFF(nn.Module):
method __init__ (line 9) | def __init__(self, channels=64, r=4):
method forward (line 21) | def forward(self, x, ds_y):
FILE: GPT_SoVITS/eres2net/kaldi.py
function _get_epsilon (line 35) | def _get_epsilon(device, dtype):
function _next_power_of_2 (line 39) | def _next_power_of_2(x: int) -> int:
function _get_strided (line 44) | def _get_strided(waveform: Tensor, window_size: int, window_shift: int, ...
function _feature_window_function (line 86) | def _feature_window_function(
function _get_log_energy (line 116) | def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor...
function _get_waveform_and_window_properties (line 125) | def _get_waveform_and_window_properties(
function _get_window (line 154) | def _get_window(
function _subtract_column_mean (line 220) | def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
function spectrogram (line 229) | def spectrogram(
function inverse_mel_scale_scalar (line 318) | def inverse_mel_scale_scalar(mel_freq: float) -> float:
function inverse_mel_scale (line 322) | def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
function mel_scale_scalar (line 326) | def mel_scale_scalar(freq: float) -> float:
function mel_scale (line 330) | def mel_scale(freq: Tensor) -> Tensor:
function vtln_warp_freq (line 334) | def vtln_warp_freq(
function vtln_warp_mel_freq (line 409) | def vtln_warp_mel_freq(
function get_mel_banks (line 436) | def get_mel_banks(
function fbank (line 519) | def fbank(
function _get_dct_matrix (line 679) | def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
function _get_lifter_coeffs (line 692) | def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
function mfcc (line 700) | def mfcc(
FILE: GPT_SoVITS/eres2net/pooling_layers.py
class TAP (line 10) | class TAP(nn.Module):
method __init__ (line 15) | def __init__(self, **kwargs):
method forward (line 18) | def forward(self, x):
class TSDP (line 25) | class TSDP(nn.Module):
method __init__ (line 30) | def __init__(self, **kwargs):
method forward (line 33) | def forward(self, x):
class TSTP (line 40) | class TSTP(nn.Module):
method __init__ (line 47) | def __init__(self, **kwargs):
method forward (line 50) | def forward(self, x):
class ASTP (line 61) | class ASTP(nn.Module):
method __init__ (line 66) | def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
method forward (line 78) | def forward(self, x):
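
TSTP-style pooling collapses frame-level speaker features into a fixed-size embedding by concatenating the temporal mean and standard deviation; a hedged sketch where the axis convention is an assumption:

import torch

def stats_pool(x: torch.Tensor) -> torch.Tensor:
    # x: (batch, channels, time) frame features
    mean = x.mean(dim=-1)
    std = x.var(dim=-1, unbiased=False).clamp(min=1e-7).sqrt()
    return torch.cat([mean, std], dim=-1)     # (batch, 2 * channels)

emb = stats_pool(torch.randn(4, 128, 200))    # -> (4, 256)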
FILE: GPT_SoVITS/export_torch_script.py
function init_sv_cn (line 42) | def init_sv_cn(device, is_half):
function load_sovits_new (line 47) | def load_sovits_new(sovits_path):
function get_raw_t2s_model (line 59) | def get_raw_t2s_model(dict_s1) -> Text2SemanticLightningModule:
function logits_to_probs (line 69) | def logits_to_probs(
function multinomial_sample_one_no_sync (line 107) | def multinomial_sample_one_no_sync(probs_sort):
function sample (line 114) | def sample(
function spectrogram_torch (line 135) | def spectrogram_torch(
class DictToAttrRecursive (line 161) | class DictToAttrRecursive(dict):
method __init__ (line 162) | def __init__(self, input_dict):
method __getattr__ (line 170) | def __getattr__(self, item):
method __setattr__ (line 176) | def __setattr__(self, key, value):
method __delattr__ (line 182) | def __delattr__(self, item):
class T2SMLP (line 190) | class T2SMLP:
method __init__ (line 191) | def __init__(self, w1, b1, w2, b2):
method forward (line 197) | def forward(self, x):
class T2SBlock (line 204) | class T2SBlock:
method __init__ (line 205) | def __init__(
method to_mask (line 238) | def to_mask(self, x: torch.Tensor, padding_mask: Optional[torch.Tensor]):
method process_prompt (line 247) | def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, pad...
method decode_next_token (line 281) | def decode_next_token(self, x: torch.Tensor, k_cache: torch.Tensor, v_...
class T2STransformer (line 316) | class T2STransformer:
method __init__ (line 317) | def __init__(self, num_blocks: int, blocks: list[T2SBlock]):
method process_prompt (line 321) | def process_prompt(self, x: torch.Tensor, attn_mask: torch.Tensor, pad...
method decode_next_token (line 330) | def decode_next_token(self, x: torch.Tensor, k_cache: list[torch.Tenso...
class VitsModel (line 336) | class VitsModel(nn.Module):
method __init__ (line 337) | def __init__(self, vits_path, version=None, is_half=True, device="cpu"):
method forward (line 372) | def forward(self, text_seq, pred_semantic, ref_audio, speed=1.0, sv_em...
class T2SModel (line 385) | class T2SModel(nn.Module):
method __init__ (line 386) | def __init__(self, raw_t2s: Text2SemanticLightningModule):
method forward (line 443) | def forward(
function build_phone_level_feature (line 554) | def build_phone_level_feature(res: Tensor, word2ph: IntTensor):
class MyBertModel (line 564) | class MyBertModel(torch.nn.Module):
method __init__ (line 565) | def __init__(self, bert_model):
method forward (line 569) | def forward(
class SSLModel (line 578) | class SSLModel(torch.nn.Module):
method __init__ (line 579) | def __init__(self):
method forward (line 583) | def forward(self, ref_audio_16k) -> torch.Tensor:
class ExportSSLModel (line 588) | class ExportSSLModel(torch.nn.Module):
method __init__ (line 589) | def __init__(self, ssl: SSLModel):
method forward (line 593) | def forward(self, ref_audio: torch.Tensor):
method resample (line 597) | def resample(self, ref_audio: torch.Tensor, src_sr: int, dst_sr: int) ...
function export_bert (line 602) | def export_bert(output_path):
function export (line 636) | def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, e...
function export_prov2 (line 709) | def export_prov2(
function parse_audio (line 828) | def parse_audio(ref_audio):
function resamplex (line 835) | def resamplex(ref_audio: torch.Tensor, src_sr: int, dst_sr: int) -> torc...
class GPT_SoVITS (line 839) | class GPT_SoVITS(nn.Module):
method __init__ (line 840) | def __init__(self, t2s: T2SModel, vits: VitsModel):
method forward (line 845) | def forward(
class ExportERes2NetV2 (line 865) | class ExportERes2NetV2(nn.Module):
method __init__ (line 866) | def __init__(self, sv_cn_model: SV):
method forward (line 878) | def forward(self, audio_16k):
class GPT_SoVITS_V2Pro (line 896) | class GPT_SoVITS_V2Pro(nn.Module):
method __init__ (line 897) | def __init__(self, t2s: T2SModel, vits: VitsModel, sv_model: ExportERe...
method forward (line 903) | def forward(
function test (line 926) | def test():
function export_symbel (line 1028) | def export_symbel(version="v2"):
function main (line 1039) | def main():
FILE: GPT_SoVITS/export_torch_script_v3v4.py
class MelSpectrgram (line 34) | class MelSpectrgram(torch.nn.Module):
method __init__ (line 35) | def __init__(
method forward (line 57) | def forward(self, y):
class ExportDitBlocks (line 86) | class ExportDitBlocks(torch.nn.Module):
method __init__ (line 87) | def __init__(self, dit: DiT):
method forward (line 94) | def forward(self, x, t, mask, rope):
class ExportDitEmbed (line 102) | class ExportDitEmbed(torch.nn.Module):
method __init__ (line 103) | def __init__(self, dit: DiT):
method forward (line 112) | def forward(
class ExportDiT (line 134) | class ExportDiT(torch.nn.Module):
method __init__ (line 135) | def __init__(self, dit: DiT):
method forward (line 144) | def forward( # x, prompt_x, x_lens, t, style,cond
class ExportCFM (line 158) | class ExportCFM(torch.nn.Module):
method __init__ (line 159) | def __init__(self, cfm: CFM):
method forward (line 163) | def forward(
function norm_spec (line 211) | def norm_spec(x):
function denorm_spec (line 217) | def denorm_spec(x):
class ExportGPTSovitsHalf (line 223) | class ExportGPTSovitsHalf(torch.nn.Module):
method __init__ (line 224) | def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3):
method forward (line 248) | def forward(
class ExportGPTSovitsV4Half (line 303) | class ExportGPTSovitsV4Half(torch.nn.Module):
method __init__ (line 304) | def __init__(self, hps, t2s_m: T2SModel, vq_model: SynthesizerTrnV3):
method forward (line 328) | def forward(
class GPTSoVITSV3 (line 383) | class GPTSoVITSV3(torch.nn.Module):
method __init__ (line 384) | def __init__(self, gpt_sovits_half, cfm, bigvgan):
method forward (line 390) | def forward(
class GPTSoVITSV4 (line 443) | class GPTSoVITSV4(torch.nn.Module):
method __init__ (line 444) | def __init__(self, gpt_sovits_half, cfm, hifigan):
method forward (line 450) | def forward(
function init_bigvgan (line 503) | def init_bigvgan():
function init_hifigan (line 520) | def init_hifigan():
class Sovits (line 545) | class Sovits:
method __init__ (line 546) | def __init__(self, vq_model: SynthesizerTrnV3, cfm: CFM, hps):
class DictToAttrRecursive (line 553) | class DictToAttrRecursive(dict):
method __init__ (line 554) | def __init__(self, input_dict):
method __getattr__ (line 562) | def __getattr__(self, item):
method __setattr__ (line 568) | def __setattr__(self, key, value):
method __delattr__ (line 574) | def __delattr__(self, item):
function get_sovits_weights (line 586) | def get_sovits_weights(sovits_path):
function export_cfm (line 642) | def export_cfm(
function export_1 (line 708) | def export_1(ref_wav_path, ref_wav_text, version="v3"):
function test_export (line 931) | def test_export(
function test_export (line 1050) | def test_export(
function export_2 (line 1125) | def export_2(version="v3"):
function test_export_gpt_sovits_v3 (line 1236) | def test_export_gpt_sovits_v3():
FILE: GPT_SoVITS/f5_tts/model/backbones/dit.py
class TextEmbedding (line 31) | class TextEmbedding(nn.Module):
method __init__ (line 32) | def __init__(self, text_dim, conv_layers=0, conv_mult=2):
method forward (line 44) | def forward(self, text: int["b nt"], seq_len, drop_text=False): # noq...
class InputEmbedding (line 70) | class InputEmbedding(nn.Module):
method __init__ (line 71) | def __init__(self, mel_dim, text_dim, out_dim):
method forward (line 76) | def forward(self, x: float["b n d"], cond: float["b n d"], text_embed:...
class DiT (line 88) | class DiT(nn.Module):
method __init__ (line 89) | def __init__(
method ckpt_wrapper (line 125) | def ckpt_wrapper(self, module):
method forward (line 133) | def forward( # x, prompt_x, x_lens, t, style,cond
FILE: GPT_SoVITS/f5_tts/model/backbones/mmdit.py
class TextEmbedding (line 30) | class TextEmbedding(nn.Module):
method __init__ (line 31) | def __init__(self, out_dim, text_num_embeds):
method forward (line 38) | def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]...
class AudioEmbedding (line 58) | class AudioEmbedding(nn.Module):
method __init__ (line 59) | def __init__(self, in_dim, out_dim):
method forward (line 64) | def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_...
class MMDiT (line 76) | class MMDiT(nn.Module):
method __init__ (line 77) | def __init__(
method forward (line 116) | def forward(
FILE: GPT_SoVITS/f5_tts/model/backbones/unett.py
class TextEmbedding (line 35) | class TextEmbedding(nn.Module):
method __init__ (line 36) | def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult...
method forward (line 50) | def forward(self, text: int["b nt"], seq_len, drop_text=False): # noq...
class InputEmbedding (line 78) | class InputEmbedding(nn.Module):
method __init__ (line 79) | def __init__(self, mel_dim, text_dim, out_dim):
method forward (line 84) | def forward(self, x: float["b n d"], cond: float["b n d"], text_embed:...
class UNetT (line 96) | class UNetT(nn.Module):
method __init__ (line 97) | def __init__(
method forward (line 164) | def forward(
FILE: GPT_SoVITS/f5_tts/model/modules.py
function get_bigvgan_mel_spectrogram (line 30) | def get_bigvgan_mel_spectrogram(
function get_vocos_mel_spectrogram (line 75) | def get_vocos_mel_spectrogram(
class MelSpec (line 104) | class MelSpec(nn.Module):
method __init__ (line 105) | def __init__(
method forward (line 130) | def forward(self, wav):
class SinusPositionEmbedding (line 149) | class SinusPositionEmbedding(nn.Module):
method __init__ (line 150) | def __init__(self, dim):
method forward (line 154) | def forward(self, x, scale=1000):
class ConvPositionEmbedding (line 167) | class ConvPositionEmbedding(nn.Module):
method __init__ (line 168) | def __init__(self, dim, kernel_size=31, groups=16):
method forward (line 178) | def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):...
function precompute_freqs_cis (line 196) | def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, the...
function get_pos_embed_indices (line 210) | def get_pos_embed_indices(start, length, max_pos, scale=1.0):
class GRN (line 225) | class GRN(nn.Module):
method __init__ (line 226) | def __init__(self, dim):
method forward (line 231) | def forward(self, x):
class ConvNeXtV2Block (line 241) | class ConvNeXtV2Block(nn.Module):
method __init__ (line 242) | def __init__(
method forward (line 259) | def forward(self, x: torch.Tensor) -> torch.Tensor:
class AdaLayerNormZero (line 276) | class AdaLayerNormZero(nn.Module):
method __init__ (line 277) | def __init__(self, dim):
method forward (line 285) | def forward(self, x, emb=None):
class AdaLayerNormZero_Final (line 297) | class AdaLayerNormZero_Final(nn.Module):
method __init__ (line 298) | def __init__(self, dim):
method forward (line 306) | def forward(self, x, emb):
class FeedForward (line 317) | class FeedForward(nn.Module):
method __init__ (line 318) | def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate...
method forward (line 327) | def forward(self, x):
class Attention (line 335) | class Attention(nn.Module):
method __init__ (line 336) | def __init__(
method forward (line 378) | def forward(
class AttnProcessor (line 397) | class AttnProcessor:
method __init__ (line 398) | def __init__(self):
method __call__ (line 401) | def __call__(
class JointAttnProcessor (line 464) | class JointAttnProcessor:
method __init__ (line 465) | def __init__(self):
method __call__ (line 468) | def __call__(
class DiTBlock (line 550) | class DiTBlock(nn.Module):
method __init__ (line 551) | def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1):
method forward (line 566) | def forward(self, x, t, mask=None, rope=None): # x: noised input, t: ...
class MMDiTBlock (line 586) | class MMDiTBlock(nn.Module):
method __init__ (line 596) | def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, conte...
method forward (line 622) | def forward(self, x, c, t, mask=None, rope=None, c_rope=None): # x: n...
class TimestepEmbedding (line 656) | class TimestepEmbedding(nn.Module):
method __init__ (line 657) | def __init__(self, dim, freq_embed_dim=256):
method forward (line 662) | def forward(self, timestep: float["b"]): # noqa: F821
FILE: GPT_SoVITS/feature_extractor/cnhubert.py
class CNHubert (line 22) | class CNHubert(nn.Module):
method __init__ (line 23) | def __init__(self, base_path: str = None):
method forward (line 34) | def forward(self, x):
function get_model (line 71) | def get_model():
function get_content (line 93) | def get_content(hmodel, wav_16k_tensor):
FILE: GPT_SoVITS/feature_extractor/whisper_enc.py
function get_model (line 4) | def get_model():
function get_content (line 12) | def get_content(model=None, wav_16k_tensor=None):
FILE: GPT_SoVITS/inference_cli.py
function synthesize (line 11) | def synthesize(
function main (line 53) | def main():
FILE: GPT_SoVITS/inference_gui.py
class GPTSoVITSGUI (line 15) | class GPTSoVITSGUI(QMainWindow):
method __init__ (line 19) | def __init__(self):
method dragEnterEvent (line 199) | def dragEnterEvent(self, event):
method dropEvent (line 203) | def dropEvent(self, event):
method add_drag_drop_events (line 211) | def add_drag_drop_events(self, widgets):
method eventFilter (line 216) | def eventFilter(self, obj, event):
method select_GPT_model (line 224) | def select_GPT_model(self):
method select_SoVITS_model (line 229) | def select_SoVITS_model(self):
method select_ref_audio (line 234) | def select_ref_audio(self):
method upload_ref_text (line 239) | def upload_ref_text(self):
method upload_target_text (line 246) | def upload_target_text(self):
method select_output_path (line 253) | def select_output_path(self):
method update_ref_audio (line 266) | def update_ref_audio(self, file_path):
method clear_output (line 269) | def clear_output(self):
method synthesize (line 272) | def synthesize(self):
FILE: GPT_SoVITS/inference_webui.py
function set_high_priority (line 12) | def set_high_priority():
function set_seed (line 106) | def set_seed(seed):
function get_bert_feature (line 171) | def get_bert_feature(text, word2ph):
class DictToAttrRecursive (line 187) | class DictToAttrRecursive(dict):
method __init__ (line 188) | def __init__(self, input_dict):
method __getattr__ (line 196) | def __getattr__(self, item):
method __setattr__ (line 202) | def __setattr__(self, key, value):
method __delattr__ (line 208) | def __delattr__(self, item):
function change_sovits_weights (line 229) | def change_sovits_weights(sovits_path, prompt_language=None, text_langua...
function change_gpt_weights (line 376) | def change_gpt_weights(gpt_path):
function clean_hifigan_model (line 407) | def clean_hifigan_model():
function clean_bigvgan_model (line 418) | def clean_bigvgan_model():
function clean_sv_cn_model (line 429) | def clean_sv_cn_model():
function init_bigvgan (line 440) | def init_bigvgan():
function init_hifigan (line 459) | def init_hifigan():
function init_sv_cn (line 491) | def init_sv_cn():
function resample (line 509) | def resample(audio_tensor, sr0, sr1, device):
function get_spepc (line 517) | def get_spepc(hps, filename, dtype, device, is_v2pro=False):
function clean_text_inf (line 552) | def clean_text_inf(text, language, version):
function get_bert_inf (line 562) | def get_bert_inf(phones, word2ph, norm_text, language):
function get_first (line 592) | def get_first(text):
function get_phones_and_bert (line 601) | def get_phones_and_bert(text, language, version, final=False):
function norm_spec (line 676) | def norm_spec(x):
function denorm_spec (line 680) | def denorm_spec(x):
function merge_short_text_in_array (line 712) | def merge_short_text_in_array(texts, threshold):
function audio_sr (line 733) | def audio_sr(audio, sr):
function get_tts_wav (line 751) | def get_tts_wav(
function split (line 1004) | def split(todo_text):
function cut1 (line 1023) | def cut1(inp):
function cut2 (line 1038) | def cut2(inp):
function cut3 (line 1063) | def cut3(inp):
function cut4 (line 1070) | def cut4(inp):
function cut5 (line 1078) | def cut5(inp):
function custom_sort_key (line 1102) | def custom_sort_key(s):
function process_text (line 1110) | def process_text(texts):
function html_center (line 1122) | def html_center(text, label="p"):
function html_left (line 1128) | def html_left(text, label="p"):
FILE: GPT_SoVITS/inference_webui_fast.py
function set_high_priority (line 12) | def set_high_priority():
function inference (line 150) | def inference(
function custom_sort_key (line 204) | def custom_sort_key(s):
function change_sovits_weights (line 233) | def change_sovits_weights(sovits_path, prompt_language=None, text_langua...
function change_gpt_weights (line 300) | def change_gpt_weights(gpt_path):
function to_cut (line 506) | def to_cut(text_inp, how_to_cut):
FILE: GPT_SoVITS/module/attentions.py
class Encoder (line 10) | class Encoder(nn.Module):
method __init__ (line 11) | def __init__(
method forward (line 64) | def forward(self, x, x_mask, g=None):
class Decoder (line 87) | class Decoder(nn.Module):
method __init__ (line 88) | def __init__(
method forward (line 145) | def forward(self, x, x_mask, h, h_mask):
class MultiHeadAttention (line 169) | class MultiHeadAttention(nn.Module):
method __init__ (line 170) | def __init__(
method forward (line 217) | def forward(self, x, c, attn_mask=None):
method attention (line 227) | def attention(self, query, key, value, mask=None):
method _matmul_with_relative_values (line 260) | def _matmul_with_relative_values(self, x, y):
method _matmul_with_relative_keys (line 269) | def _matmul_with_relative_keys(self, x, y):
method _get_relative_embeddings (line 278) | def _get_relative_embeddings(self, relative_embeddings, length):
method _relative_position_to_absolute_position (line 294) | def _relative_position_to_absolute_position(self, x):
method _absolute_position_to_relative_position (line 311) | def _absolute_position_to_relative_position(self, x):
method _attention_bias_proximal (line 325) | def _attention_bias_proximal(self, length):
class FFN (line 337) | class FFN(nn.Module):
method __init__ (line 338) | def __init__(
method forward (line 366) | def forward(self, x, x_mask):
method _causal_padding (line 376) | def _causal_padding(self, x):
method _same_padding (line 385) | def _same_padding(self, x):
class Depthwise_Separable_Conv1D (line 399) | class Depthwise_Separable_Conv1D(nn.Module):
method __init__ (line 400) | def __init__(
method forward (line 436) | def forward(self, input):
method weight_norm (line 439) | def weight_norm(self):
method remove_weight_norm (line 443) | def remove_weight_norm(self):
class Depthwise_Separable_TransposeConv1D (line 448) | class Depthwise_Separable_TransposeConv1D(nn.Module):
method __init__ (line 449) | def __init__(
method forward (line 487) | def forward(self, input):
method weight_norm (line 490) | def weight_norm(self):
method remove_weight_norm (line 494) | def remove_weight_norm(self):
function weight_norm_modules (line 499) | def weight_norm_modules(module, name="weight", dim=0):
function remove_weight_norm_modules (line 507) | def remove_weight_norm_modules(module, name="weight"):
class FFT (line 514) | class FFT(nn.Module):
method __init__ (line 515) | def __init__(
method forward (line 571) | def forward(self, x, x_mask, g=None):
class TransformerCouplingLayer (line 598) | class TransformerCouplingLayer(nn.Module):
method __init__ (line 599) | def __init__(
method forward (line 640) | def forward(self, x, x_mask, g=None, reverse=False):
FILE: GPT_SoVITS/module/attentions_onnx.py
class LayerNorm (line 11) | class LayerNorm(nn.Module):
method __init__ (line 12) | def __init__(self, channels, eps=1e-5):
method forward (line 20) | def forward(self, x):
function fused_add_tanh_sigmoid_multiply (line 27) | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
class Encoder (line 36) | class Encoder(nn.Module):
method __init__ (line 37) | def __init__(
method forward (line 118) | def forward(self, x, x_mask):
class MultiHeadAttention (line 135) | class MultiHeadAttention(nn.Module):
method __init__ (line 136) | def __init__(
method forward (line 183) | def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None):
method attention (line 194) | def attention(self, query, key, value, mask: Optional[torch.Tensor] = ...
method _matmul_with_relative_values (line 223) | def _matmul_with_relative_values(self, x, y):
method _matmul_with_relative_keys (line 232) | def _matmul_with_relative_keys(self, x, y):
method _get_relative_embeddings (line 241) | def _get_relative_embeddings(self, relative_embeddings, length):
method _relative_position_to_absolute_position (line 257) | def _relative_position_to_absolute_position(self, x):
method _absolute_position_to_relative_position (line 274) | def _absolute_position_to_relative_position(self, x):
method _attention_bias_proximal (line 288) | def _attention_bias_proximal(self, length):
class FFN (line 300) | class FFN(nn.Module):
method __init__ (line 301) | def __init__(
method forward (line 330) | def forward(self, x, x_mask):
method padding (line 340) | def padding(self, x):
method _causal_padding (line 343) | def _causal_padding(self, x):
method _same_padding (line 352) | def _same_padding(self, x):
class MRTE (line 362) | class MRTE(nn.Module):
method __init__ (line 363) | def __init__(
method forward (line 378) | def forward(self, ssl_enc, ssl_mask, text, text_mask, ge):
FILE: GPT_SoVITS/module/commons.py
function init_weights (line 6) | def init_weights(m, mean=0.0, std=0.01):
function get_padding (line 12) | def get_padding(kernel_size, dilation=1):
function intersperse (line 22) | def intersperse(lst, item):
function kl_divergence (line 28) | def kl_divergence(m_p, logs_p, m_q, logs_q):
function rand_gumbel (line 35) | def rand_gumbel(shape):
function rand_gumbel_like (line 41) | def rand_gumbel_like(x):
function slice_segments (line 46) | def slice_segments(x, ids_str, segment_size=4):
function rand_slice_segments (line 55) | def rand_slice_segments(x, x_lengths=None, segment_size=4):
function get_timing_signal_1d (line 65) | def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timesc...
function add_timing_signal_1d (line 79) | def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
function cat_timing_signal_1d (line 85) | def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis...
function subsequent_mask (line 91) | def subsequent_mask(length):
function fused_add_tanh_sigmoid_multiply (line 97) | def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
function convert_pad_shape (line 106) | def convert_pad_shape(pad_shape):
function shift_1d (line 112) | def shift_1d(x):
function sequence_mask (line 117) | def sequence_mask(length, max_length=None):
function generate_path (line 124) | def generate_path(duration, mask):
function clip_grad_value_ (line 142) | def clip_grad_value_(parameters, clip_value, norm_type=2):
function squeeze (line 160) | def squeeze(x, x_mask=None, n_sqz=2):
function unsqueeze (line 175) | def unsqueeze(x, x_mask=None, n_sqz=2):
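Several of these helpers are the standard VITS utilities. For example, `sequence_mask` (a minimal sketch, assuming the usual implementation) builds a boolean padding mask from per-example lengths:

```python
import torch

def sequence_mask(length: torch.Tensor, max_length=None) -> torch.Tensor:
    # True where position < length; shape (batch, max_length).
    if max_length is None:
        max_length = int(length.max())
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)

print(sequence_mask(torch.tensor([2, 4]), 5))
# tensor([[ True,  True, False, False, False],
#         [ True,  True,  True,  True, False]])
```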
FILE: GPT_SoVITS/module/core_vq.py
function default (line 47) | def default(val: tp.Any, d: tp.Any) -> tp.Any:
function ema_inplace (line 51) | def ema_inplace(moving_avg, new, decay: float):
function laplace_smoothing (line 55) | def laplace_smoothing(x, n_categories: int, epsilon: float = 1e-5):
function uniform_init (line 59) | def uniform_init(*shape: int):
function sample_vectors (line 65) | def sample_vectors(samples, num: int):
function kmeans (line 76) | def kmeans(samples, num_clusters: int, num_iters: int = 10, frames_to_us...
class EuclideanCodebook (line 114) | class EuclideanCodebook(nn.Module):
method __init__ (line 130) | def __init__(
method init_embed_ (line 157) | def init_embed_(self, data):
method replace_ (line 180) | def replace_(self, samples, mask):
method expire_codes_ (line 184) | def expire_codes_(self, batch_samples):
method preprocess (line 204) | def preprocess(self, x):
method quantize (line 208) | def quantize(self, x):
method postprocess_emb (line 214) | def postprocess_emb(self, embed_ind, shape):
method dequantize (line 217) | def dequantize(self, embed_ind):
method encode (line 221) | def encode(self, x):
method decode (line 231) | def decode(self, embed_ind):
method forward (line 235) | def forward(self, x):
class VectorQuantization (line 271) | class VectorQuantization(nn.Module):
method __init__ (line 288) | def __init__(
method codebook (line 322) | def codebook(self):
method encode (line 325) | def encode(self, x):
method decode (line 331) | def decode(self, embed_ind):
method forward (line 337) | def forward(self, x):
class ResidualVectorQuantization (line 359) | class ResidualVectorQuantization(nn.Module):
method __init__ (line 364) | def __init__(self, *, num_quantizers, **kwargs):
method forward (line 368) | def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional...
method encode (line 391) | def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp...
method decode (line 404) | def decode(self, q_indices: torch.Tensor, st: int = 0) -> torch.Tensor:
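`ResidualVectorQuantization` stacks several `VectorQuantization` layers, each quantizing the residual left over by the previous ones, so the decoded codes sum back toward the input. A self-contained sketch of that idea (not the file's EMA-codebook implementation):

```python
import torch

def residual_quantize(x: torch.Tensor, codebooks: list[torch.Tensor]):
    """x: (N, D); each codebook: (K, D). Returns (quantized, indices)."""
    residual, quantized, indices = x, torch.zeros_like(x), []
    for codebook in codebooks:
        idx = torch.cdist(residual, codebook).argmin(dim=-1)  # nearest code
        q = codebook[idx]
        residual = residual - q   # the next quantizer sees what is left
        quantized = quantized + q  # decoded vectors accumulate
        indices.append(idx)
    return quantized, torch.stack(indices)  # indices: (n_q, N)
```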
FILE: GPT_SoVITS/module/data_utils.py
class TextAudioSpeakerLoader (line 17) | class TextAudioSpeakerLoader(torch.utils.data.Dataset):
method __init__ (line 24) | def __init__(self, hparams, version=None, val=False):
method get_audio_text_speaker_pair (line 109) | def get_audio_text_speaker_pair(self, audiopath_sid_text):
method get_audio (line 136) | def get_audio(self, filename):
method get_sid (line 147) | def get_sid(self, sid):
method __getitem__ (line 151) | def __getitem__(self, index):
method __len__ (line 155) | def __len__(self):
method random_slice (line 158) | def random_slice(self, ssl, wav, mel):
class TextAudioSpeakerCollate (line 192) | class TextAudioSpeakerCollate:
method __init__ (line 195) | def __init__(self, return_ids=False, version=None):
method __call__ (line 199) | def __call__(self, batch):
class TextAudioSpeakerLoaderV3 (line 279) | class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
method __init__ (line 286) | def __init__(self, hparams, val=False):
method norm_spec (line 372) | def norm_spec(self, x):
method get_audio_text_speaker_pair (line 375) | def get_audio_text_speaker_pair(self, audiopath_sid_text):
method get_audio (line 396) | def get_audio(self, filename):
method get_sid (line 429) | def get_sid(self, sid):
method __getitem__ (line 433) | def __getitem__(self, index):
method __len__ (line 437) | def __len__(self):
class TextAudioSpeakerCollateV3 (line 441) | class TextAudioSpeakerCollateV3:
method __init__ (line 444) | def __init__(self, return_ids=False):
method __call__ (line 447) | def __call__(self, batch):
class TextAudioSpeakerLoaderV4 (line 517) | class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
method __init__ (line 524) | def __init__(self, hparams, val=False):
method norm_spec (line 610) | def norm_spec(self, x):
method get_audio_text_speaker_pair (line 613) | def get_audio_text_speaker_pair(self, audiopath_sid_text):
method get_audio (line 634) | def get_audio(self, filename):
method get_sid (line 648) | def get_sid(self, sid):
method __getitem__ (line 652) | def __getitem__(self, index):
method __len__ (line 656) | def __len__(self):
class TextAudioSpeakerCollateV4 (line 660) | class TextAudioSpeakerCollateV4:
method __init__ (line 663) | def __init__(self, return_ids=False):
method __call__ (line 666) | def __call__(self, batch):
class TextAudioSpeakerLoaderV3b (line 728) | class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
method __init__ (line 735) | def __init__(self, hparams, val=False):
method norm_spec (line 821) | def norm_spec(self, x):
method get_audio_text_speaker_pair (line 824) | def get_audio_text_speaker_pair(self, audiopath_sid_text):
method get_audio (line 845) | def get_audio(self, filename):
method get_sid (line 878) | def get_sid(self, sid):
method __getitem__ (line 882) | def __getitem__(self, index):
method __len__ (line 886) | def __len__(self):
class TextAudioSpeakerCollateV3b (line 890) | class TextAudioSpeakerCollateV3b:
method __init__ (line 893) | def __init__(self, return_ids=False):
method __call__ (line 896) | def __call__(self, batch):
class DistributedBucketSampler (line 976) | class DistributedBucketSampler(torch.utils.data.distributed.DistributedS...
method __init__ (line 986) | def __init__(self, dataset, batch_size, boundaries, num_replicas=None,...
method _create_buckets (line 996) | def _create_buckets(self):
method __iter__ (line 1019) | def __iter__(self):
method _bisect (line 1055) | def _bisect(self, x, lo=0, hi=None):
method __len__ (line 1070) | def __len__(self):
FILE: GPT_SoVITS/module/ddp_utils.py
class SyncFunction (line 8) | class SyncFunction(torch.autograd.Function):
method forward (line 11) | def forward(ctx, tensor):
method backward (line 44) | def backward(ctx, grad_output):
class DDP (line 53) | class DDP(DistributedDataParallel):
method forward (line 58) | def forward(self, *inputs, **kwargs): # pragma: no cover
FILE: GPT_SoVITS/module/distrib.py
function rank (line 14) | def rank():
function world_size (line 21) | def world_size():
function is_distributed (line 28) | def is_distributed():
function all_reduce (line 32) | def all_reduce(tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM):
function _is_complex_or_float (line 37) | def _is_complex_or_float(tensor):
function _check_number_of_params (line 41) | def _check_number_of_params(params: tp.List[torch.Tensor]):
function broadcast_tensors (line 57) | def broadcast_tensors(tensors: tp.Iterable[torch.Tensor], src: int = 0):
function sync_buffer (line 73) | def sync_buffer(buffers, average=True):
function sync_grad (line 93) | def sync_grad(params):
function average_metrics (line 111) | def average_metrics(metrics: tp.Dict[str, float], count=1.0):
FILE: GPT_SoVITS/module/losses.py
function feature_loss (line 6) | def feature_loss(fmap_r, fmap_g):
function discriminator_loss (line 17) | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
function generator_loss (line 33) | def generator_loss(disc_outputs):
function kl_loss (line 45) | def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
function mle_loss (line 63) | def mle_loss(z, m, logs, logdet, mask):
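`discriminator_loss` and `generator_loss` follow the least-squares GAN objective used by VITS/HiFi-GAN, while `feature_loss` is an L1 feature-matching term over discriminator activations. A sketch of the LSGAN pair (assuming the standard formulation):

```python
import torch

def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    # Real outputs are pushed toward 1, generated outputs toward 0.
    loss = 0.0
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        loss = loss + torch.mean((1 - dr) ** 2) + torch.mean(dg**2)
    return loss

def generator_loss(disc_outputs):
    # The generator tries to push discriminator outputs toward 1.
    return sum(torch.mean((1 - dg) ** 2) for dg in disc_outputs)
```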
FILE: GPT_SoVITS/module/mel_processing.py
function dynamic_range_compression_torch (line 8) | def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
function dynamic_range_decompression_torch (line 17) | def dynamic_range_decompression_torch(x, C=1):
function spectral_normalize_torch (line 26) | def spectral_normalize_torch(magnitudes):
function spectral_de_normalize_torch (line 31) | def spectral_de_normalize_torch(magnitudes):
function spectrogram_torch (line 40) | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, cente...
function spec_to_mel_torch (line 77) | def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
function mel_spectrogram_torch (line 93) | def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, w...
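`spec_to_mel_torch` projects a linear-frequency magnitude spectrogram onto a mel basis and log-compresses it. A condensed sketch (assuming the usual VITS pipeline built on `librosa` mel filters; the source additionally caches the basis per configuration):

```python
import torch
from librosa.filters import mel as librosa_mel_fn

def spec_to_mel(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    # spec: (batch, n_fft // 2 + 1, frames) magnitude spectrogram.
    basis = torch.from_numpy(
        librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                       fmin=fmin, fmax=fmax)
    ).to(dtype=spec.dtype, device=spec.device)
    mel = torch.matmul(basis, spec)
    # dynamic_range_compression_torch: log with a floor to avoid log(0).
    return torch.log(torch.clamp(mel, min=1e-5))
```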
FILE: GPT_SoVITS/module/models.py
class StochasticDurationPredictor (line 28) | class StochasticDurationPredictor(nn.Module):
method __init__ (line 29) | def __init__(
method forward (line 69) | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scal...
class DurationPredictor (line 117) | class DurationPredictor(nn.Module):
method __init__ (line 118) | def __init__(self, in_channels, filter_channels, kernel_size, p_dropou...
method forward (line 137) | def forward(self, x, x_mask, g=None):
class TextEncoder (line 156) | class TextEncoder(nn.Module):
method __init__ (line 157) | def __init__(
method forward (line 214) | def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=...
method extract_latent (line 266) | def extract_latent(self, x):
method decode_latent (line 271) | def decode_latent(self, codes, y_mask, refer, refer_mask, ge):
class ResidualCouplingBlock (line 286) | class ResidualCouplingBlock(nn.Module):
method __init__ (line 287) | def __init__(
method forward (line 321) | def forward(self, x, x_mask, g=None, reverse=False):
class PosteriorEncoder (line 331) | class PosteriorEncoder(nn.Module):
method __init__ (line 332) | def __init__(
method forward (line 361) | def forward(self, x, x_lengths, g=None):
class Encoder (line 373) | class Encoder(nn.Module):
method __init__ (line 374) | def __init__(
method forward (line 390) | def forward(self, x, x_lengths, g=None):
class WNEncoder (line 400) | class WNEncoder(nn.Module):
method __init__ (line 401) | def __init__(
method forward (line 431) | def forward(self, x, x_lengths, g=None):
class Generator (line 440) | class Generator(torch.nn.Module):
method __init__ (line 441) | def __init__(
method forward (line 485) | def forward(self, x, g=None):
method remove_weight_norm (line 506) | def remove_weight_norm(self):
class DiscriminatorP (line 514) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 515) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
method forward (line 571) | def forward(self, x):
class DiscriminatorS (line 593) | class DiscriminatorS(torch.nn.Module):
method __init__ (line 594) | def __init__(self, use_spectral_norm=False):
method forward (line 609) | def forward(self, x):
class MultiPeriodDiscriminator (line 626) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 627) | def __init__(self, use_spectral_norm=False, version=None):
method forward (line 638) | def forward(self, y, y_hat):
class ReferenceEncoder (line 654) | class ReferenceEncoder(nn.Module):
method __init__ (line 660) | def __init__(self, spec_channels, gin_channels=0):
method forward (line 689) | def forward(self, inputs):
method calculate_channels (line 707) | def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
class Quantizer_module (line 713) | class Quantizer_module(torch.nn.Module):
method __init__ (line 714) | def __init__(self, n_e, e_dim):
method forward (line 719) | def forward(self, x):
class Quantizer (line 730) | class Quantizer(torch.nn.Module):
method __init__ (line 731) | def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160):
method forward (line 740) | def forward(self, xin):
method embed (line 759) | def embed(self, x):
class CodePredictor (line 771) | class CodePredictor(nn.Module):
method __init__ (line 772) | def __init__(
method forward (line 801) | def forward(self, x, x_mask, refer, codes, infer=False):
class SynthesizerTrn (line 829) | class SynthesizerTrn(nn.Module):
method __init__ (line 834) | def __init__(
method forward (line 934) | def forward(self, ssl, y, y_lengths, text, text_lengths, sv_emb=None):
method infer (line 973) | def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, nois...
method decode (line 995) | def decode(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb=...
method decode_streaming (line 1043) | def decode_streaming(self, codes, text, refer, noise_scale=0.5, speed=...
method extract_latent (line 1094) | def extract_latent(self, x):
class CFM (line 1100) | class CFM(torch.nn.Module):
method __init__ (line 1101) | def __init__(self, in_channels, dit):
method inference (line 1114) | def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, ...
method forward (line 1174) | def forward(self, x1, x_lens, prompt_lens, mu, use_grad_ckpt):
function set_no_grad (line 1210) | def set_no_grad(net_g):
class SynthesizerTrnV3 (line 1215) | class SynthesizerTrnV3(nn.Module):
method __init__ (line 1220) | def __init__(
method forward (line 1301) | def forward(
method decode_encp (line 1332) | def decode_encp(self, codes, text, refer, ge=None, speed=1):
method extract_latent (line 1357) | def extract_latent(self, x):
class SynthesizerTrnV3b (line 1363) | class SynthesizerTrnV3b(nn.Module):
method __init__ (line 1368) | def __init__(
method forward (line 1453) | def forward(self, ssl, y, mel, ssl_lengths, y_lengths, text, text_leng...
method decode_encp (line 1496) | def decode_encp(self, codes, text, refer, ge=None):
method extract_latent (line 1517) | def extract_latent(self, x):
FILE: GPT_SoVITS/module/models_onnx.py
class StochasticDurationPredictor (line 23) | class StochasticDurationPredictor(nn.Module):
method __init__ (line 24) | def __init__(
method forward (line 64) | def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scal...
class DurationPredictor (line 112) | class DurationPredictor(nn.Module):
method __init__ (line 113) | def __init__(self, in_channels, filter_channels, kernel_size, p_dropou...
method forward (line 132) | def forward(self, x, x_mask, g=None):
class TextEncoder (line 149) | class TextEncoder(nn.Module):
method __init__ (line 150) | def __init__(
method forward (line 207) | def forward(self, y, text, ge, speed=1):
class ResidualCouplingBlock (line 229) | class ResidualCouplingBlock(nn.Module):
method __init__ (line 230) | def __init__(
method forward (line 264) | def forward(self, x, x_mask, g=None, reverse=False):
class PosteriorEncoder (line 274) | class PosteriorEncoder(nn.Module):
method __init__ (line 275) | def __init__(
method forward (line 304) | def forward(self, x, x_lengths, g=None):
class Encoder (line 316) | class Encoder(nn.Module):
method __init__ (line 317) | def __init__(
method forward (line 333) | def forward(self, x, x_lengths, g=None):
class WNEncoder (line 343) | class WNEncoder(nn.Module):
method __init__ (line 344) | def __init__(
method forward (line 374) | def forward(self, x, x_lengths, g=None):
class Generator (line 383) | class Generator(torch.nn.Module):
method __init__ (line 384) | def __init__(
method forward (line 428) | def forward(self, x, g: Optional[torch.Tensor] = None):
method remove_weight_norm (line 449) | def remove_weight_norm(self):
class DiscriminatorP (line 457) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 458) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
method forward (line 514) | def forward(self, x):
class DiscriminatorS (line 536) | class DiscriminatorS(torch.nn.Module):
method __init__ (line 537) | def __init__(self, use_spectral_norm=False):
method forward (line 552) | def forward(self, x):
class MultiPeriodDiscriminator (line 566) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 567) | def __init__(self, use_spectral_norm=False):
method forward (line 575) | def forward(self, y, y_hat):
class ReferenceEncoder (line 591) | class ReferenceEncoder(nn.Module):
method __init__ (line 597) | def __init__(self, spec_channels, gin_channels=0):
method forward (line 626) | def forward(self, inputs):
method calculate_channels (line 644) | def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
class Quantizer_module (line 650) | class Quantizer_module(torch.nn.Module):
method __init__ (line 651) | def __init__(self, n_e, e_dim):
method forward (line 656) | def forward(self, x):
class Quantizer (line 667) | class Quantizer(torch.nn.Module):
method __init__ (line 668) | def __init__(self, embed_dim=512, n_code_groups=4, n_codes=160):
method forward (line 677) | def forward(self, xin):
method embed (line 696) | def embed(self, x):
class CodePredictor (line 708) | class CodePredictor(nn.Module):
method __init__ (line 709) | def __init__(
method forward (line 738) | def forward(self, x, x_mask, refer, codes, infer=False):
class SynthesizerTrn (line 769) | class SynthesizerTrn(nn.Module):
method __init__ (line 774) | def __init__(
method forward (line 879) | def forward(self, codes, text, refer, noise_scale=0.5, speed=1, sv_emb...
method extract_latent (line 908) | def extract_latent(self, x):
class CFM (line 914) | class CFM(torch.nn.Module):
method __init__ (line 915) | def __init__(self, in_channels, dit):
method forward (line 925) | def forward(
function set_no_grad (line 962) | def set_no_grad(net_g):
function compile_codes_length (line 968) | def compile_codes_length(codes):
function compile_ref_length (line 974) | def compile_ref_length(refer):
class SynthesizerTrnV3 (line 979) | class SynthesizerTrnV3(nn.Module):
method __init__ (line 984) | def __init__(
method create_ge (line 1065) | def create_ge(self, refer):
method forward (line 1071) | def forward(self, codes, text, ge, speed=1):
method extract_latent (line 1084) | def extract_latent(self, x):
FILE: GPT_SoVITS/module/modules.py
class LayerNorm (line 20) | class LayerNorm(nn.Module):
method __init__ (line 21) | def __init__(self, channels, eps=1e-5):
method forward (line 29) | def forward(self, x):
class ConvReluNorm (line 35) | class ConvReluNorm(nn.Module):
method __init__ (line 36) | def __init__(
method forward (line 73) | def forward(self, x, x_mask):
class DDSConv (line 83) | class DDSConv(nn.Module):
method __init__ (line 88) | def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
method forward (line 117) | def forward(self, x, x_mask, g=None):
class WN (line 132) | class WN(torch.nn.Module):
method __init__ (line 133) | def __init__(
method forward (line 182) | def forward(self, x, x_mask, g=None, **kwargs):
method remove_weight_norm (line 209) | def remove_weight_norm(self):
class ResBlock1 (line 218) | class ResBlock1(torch.nn.Module):
method __init__ (line 219) | def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
method forward (line 293) | def forward(self, x, x_mask=None):
method remove_weight_norm (line 308) | def remove_weight_norm(self):
class ResBlock2 (line 315) | class ResBlock2(torch.nn.Module):
method __init__ (line 316) | def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
method forward (line 344) | def forward(self, x, x_mask=None):
method remove_weight_norm (line 355) | def remove_weight_norm(self):
class Log (line 360) | class Log(nn.Module):
method forward (line 361) | def forward(self, x, x_mask, reverse=False, **kwargs):
class Flip (line 371) | class Flip(nn.Module):
method forward (line 372) | def forward(self, x, *args, reverse=False, **kwargs):
class ElementwiseAffine (line 381) | class ElementwiseAffine(nn.Module):
method __init__ (line 382) | def __init__(self, channels):
method forward (line 388) | def forward(self, x, x_mask, reverse=False, **kwargs):
class ResidualCouplingLayer (line 399) | class ResidualCouplingLayer(nn.Module):
method __init__ (line 400) | def __init__(
method forward (line 434) | def forward(self, x, x_mask, g=None, reverse=False):
class ConvFlow (line 456) | class ConvFlow(nn.Module):
method __init__ (line 457) | def __init__(
method forward (line 481) | def forward(self, x, x_mask, g=None, reverse=False):
class LinearNorm (line 512) | class LinearNorm(nn.Module):
method __init__ (line 513) | def __init__(
method forward (line 526) | def forward(self, input):
class Mish (line 531) | class Mish(nn.Module):
method __init__ (line 532) | def __init__(self):
method forward (line 535) | def forward(self, x):
class Conv1dGLU (line 539) | class Conv1dGLU(nn.Module):
method __init__ (line 545) | def __init__(self, in_channels, out_channels, kernel_size, dropout):
method forward (line 551) | def forward(self, x):
class ConvNorm (line 560) | class ConvNorm(nn.Module):
method __init__ (line 561) | def __init__(
method forward (line 591) | def forward(self, input):
class MultiHeadAttention (line 596) | class MultiHeadAttention(nn.Module):
method __init__ (line 599) | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.0, spectral_no...
method forward (line 621) | def forward(self, x, mask=None):
class ScaledDotProductAttention (line 649) | class ScaledDotProductAttention(nn.Module):
method __init__ (line 652) | def __init__(self, temperature, dropout):
method forward (line 658) | def forward(self, q, k, v, mask=None):
class MelStyleEncoder (line 672) | class MelStyleEncoder(nn.Module):
method __init__ (line 675) | def __init__(
method temporal_avg_pool (line 716) | def temporal_avg_pool(self, x, mask=None):
method forward (line 728) | def forward(self, x, mask=None):
class MelStyleEncoderVAE (line 752) | class MelStyleEncoderVAE(nn.Module):
method __init__ (line 753) | def __init__(self, spec_channels, z_latent_dim, emb_dim):
method reparameterize (line 761) | def reparameterize(self, mu, logvar):
method forward (line 769) | def forward(self, inputs, mask=None):
method infer (line 782) | def infer(self, inputs=None, random_sample=False, manual_latent=None):
class ActNorm (line 801) | class ActNorm(nn.Module):
method __init__ (line 802) | def __init__(self, channels, ddi=False, **kwargs):
method forward (line 810) | def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs):
method store_inverse (line 827) | def store_inverse(self):
method set_ddi (line 830) | def set_ddi(self, ddi):
method initialize (line 833) | def initialize(self, x, x_mask):
class InvConvNear (line 848) | class InvConvNear(nn.Module):
method __init__ (line 849) | def __init__(self, channels, n_split=4, no_jacobian=False, **kwargs):
method forward (line 861) | def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs):
method store_inverse (line 896) | def store_inverse(self):
FILE: GPT_SoVITS/module/mrte_model.py
class MRTE (line 9) | class MRTE(nn.Module):
method __init__ (line 10) | def __init__(
method forward (line 25) | def forward(self, ssl_enc, ssl_mask, text, text_mask, ge, test=None):
class SpeakerEncoder (line 47) | class SpeakerEncoder(torch.nn.Module):
method __init__ (line 48) | def __init__(
method forward (line 60) | def forward(self, mels):
class MELEncoder (line 67) | class MELEncoder(nn.Module):
method __init__ (line 68) | def __init__(
method forward (line 89) | def forward(self, x):
class WN (line 97) | class WN(torch.nn.Module):
method __init__ (line 98) | def __init__(self, hidden_channels, kernel_size, dilation_rate, n_laye...
method forward (line 132) | def forward(self, x):
method remove_weight_norm (line 150) | def remove_weight_norm(self):
function fused_add_tanh_sigmoid_multiply (line 158) | def fused_add_tanh_sigmoid_multiply(input, n_channels):
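Note that `fused_add_tanh_sigmoid_multiply` here takes a single pre-summed input, unlike the two-input variant in `commons.py`; both compute the WaveNet gated activation. Sketch:

```python
import torch

def fused_add_tanh_sigmoid_multiply(x: torch.Tensor, n_channels: int):
    # Split channels in half: tanh "filter" gate times sigmoid "gate".
    t = torch.tanh(x[:, :n_channels, :])
    s = torch.sigmoid(x[:, n_channels:, :])
    return t * s
```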
FILE: GPT_SoVITS/module/quantize.py
class QuantizedResult (line 19) | class QuantizedResult:
class ResidualVectorQuantizer (line 27) | class ResidualVectorQuantizer(nn.Module):
method __init__ (line 41) | def __init__(
method forward (line 69) | def forward(
method encode (line 93) | def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp...
method decode (line 107) | def decode(self, codes: torch.Tensor, st: int = 0) -> torch.Tensor:
FILE: GPT_SoVITS/module/transforms.py
function piecewise_rational_quadratic_transform (line 12) | def piecewise_rational_quadratic_transform(
function searchsorted (line 45) | def searchsorted(bin_locations, inputs, eps=1e-6):
function unconstrained_rational_quadratic_spline (line 50) | def unconstrained_rational_quadratic_spline(
function rational_quadratic_spline (line 100) | def rational_quadratic_spline(
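The rational-quadratic spline flow must first locate which bin each input falls into; `searchsorted` does this with a comparison-and-sum trick. A sketch matching the common neural-spline-flows implementation this file appears to follow:

```python
import torch

def searchsorted(bin_locations: torch.Tensor, inputs: torch.Tensor, eps=1e-6):
    # bin_locations: cumulative bin edges along the last dim.
    # Count how many edges each input passes; subtract 1 for the bin index.
    bin_locations = bin_locations.clone()
    bin_locations[..., -1] += eps  # keep the top edge inclusive
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
```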
FILE: GPT_SoVITS/onnx_export.py
function spectrogram_torch (line 18) | def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, cente...
class DictToAttrRecursive (line 42) | class DictToAttrRecursive(dict):
method __init__ (line 43) | def __init__(self, input_dict):
method __getattr__ (line 51) | def __getattr__(self, item):
method __setattr__ (line 57) | def __setattr__(self, key, value):
method __delattr__ (line 63) | def __delattr__(self, item):
class T2SEncoder (line 70) | class T2SEncoder(nn.Module):
method __init__ (line 71) | def __init__(self, t2s, vits):
method forward (line 76) | def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
class T2SModel (line 86) | class T2SModel(nn.Module):
method __init__ (line 87) | def __init__(self, t2s_path, vits_model):
method forward (line 106) | def forward(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content):
method export (line 132) | def export(self, ref_seq, text_seq, ref_bert, text_bert, ssl_content, ...
class VitsModel (line 192) | class VitsModel(nn.Module):
method __init__ (line 193) | def __init__(self, vits_path):
method forward (line 213) | def forward(self, text_seq, pred_semantic, ref_audio):
class GptSoVits (line 225) | class GptSoVits(nn.Module):
method __init__ (line 226) | def __init__(self, vits, t2s):
method forward (line 231) | def forward(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, s...
method export (line 249) | def export(self, ref_seq, text_seq, ref_bert, text_bert, ref_audio, ss...
class SSLModel (line 268) | class SSLModel(nn.Module):
method __init__ (line 269) | def __init__(self):
method forward (line 273) | def forward(self, ref_audio_16k):
function export (line 277) | def export(vits_path, gpt_path, project_name, vits_model="v2"):
FILE: GPT_SoVITS/prepare_datasets/1-get-text.py
function my_save (line 37) | def my_save(fea, path): #####fix issue: torch.save doesn't support chin...
function get_bert_feature (line 68) | def get_bert_feature(text, word2ph):
function process (line 86) | def process(data, res):
FILE: GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
function my_save (line 45) | def my_save(fea, path): #####fix issue: torch.save doesn't support chin...
function name2go (line 78) | def name2go(wav_name, wav_path):
FILE: GPT_SoVITS/prepare_datasets/2-get-sv.py
function my_save (line 33) | def my_save(fea, path): #####fix issue: torch.save doesn't support chin...
class SV (line 58) | class SV:
method __init__ (line 59) | def __init__(self, device, is_half):
method compute_embedding3 (line 72) | def compute_embedding3(self, wav):  # wav: shape (1, T), values in [-1, 1]
function name2go (line 87) | def name2go(wav_name, wav_path):
FILE: GPT_SoVITS/prepare_datasets/3-get-semantic.py
function name2go (line 89) | def name2go(wav_name, lines):
FILE: GPT_SoVITS/process_ckpt.py
function my_save (line 12) | def my_save(fea, path): #####fix issue: torch.save doesn't support chin...
function my_save2 (line 30) | def my_save2(fea, path, model_version):
function savee (line 41) | def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=N...
function get_hash_from_file (line 92) | def get_hash_from_file(sovits_path):
function get_sovits_version_from_path_fast (line 100) | def get_sovits_version_from_path_fast(sovits_path):
function load_sovits_new (line 129) | def load_sovits_new(sovits_path):
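The `my_save` helper that recurs across these files works around `torch.save` failing on non-ASCII (e.g. Chinese) paths on some platforms. One plausible sketch of that workaround, with the temporary filename purely illustrative:

```python
import shutil
import torch

def my_save(obj, path):
    # Serialize to an ASCII-safe temporary file in the CWD, then let
    # Python's own file ops move it to the (possibly non-ASCII) target.
    tmp = "tmp_%s.pth" % abs(hash(path))  # hypothetical temp name
    torch.save(obj, tmp)
    shutil.move(tmp, path)
```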
FILE: GPT_SoVITS/s1_train.py
class my_model_ckpt (line 29) | class my_model_ckpt(ModelCheckpoint):
method __init__ (line 30) | def __init__(
method on_train_epoch_end (line 46) | def on_train_epoch_end(self, trainer, pl_module):
function main (line 85) | def main(args):
FILE: GPT_SoVITS/s2_train.py
function main (line 53) | def main():
function run (line 71) | def run(rank, n_gpus, hps):
function train_and_evaluate (line 318) | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scale...
function evaluate (line 582) | def evaluate(hps, generator, eval_loader, writer_eval):
FILE: GPT_SoVITS/s2_train_v3.py
function main (line 53) | def main():
function run (line 71) | def run(rank, n_gpus, hps):
function train_and_evaluate (line 275) | def train_and_evaluate(
FILE: GPT_SoVITS/s2_train_v3_lora.py
function main (line 53) | def main():
function run (line 71) | def run(rank, n_gpus, hps):
function train_and_evaluate (line 248) | def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scale...
FILE: GPT_SoVITS/stream_v2pro.py
class StreamT2SModel (line 15) | class StreamT2SModel(nn.Module):
method __init__ (line 16) | def __init__(self, t2s: T2SModel):
method pre_infer (line 21) | def pre_infer(
method decode_next_token (line 94) | def decode_next_token(
method forward (line 141) | def forward(
class StepVitsModel (line 154) | class StepVitsModel(nn.Module):
method __init__ (line 155) | def __init__(self, vits: VitsModel, sv_model: ExportERes2NetV2):
method ref_handle (line 162) | def ref_handle(self, ref_audio_32k):
method extract_latent (line 177) | def extract_latent(self, ssl_content):
method forward (line 181) | def forward(self, pred_semantic, text_seq, refer, sv_emb=None):
function find_best_audio_offset_fast (line 188) | def find_best_audio_offset_fast(reference_audio: Tensor, search_audio: T...
function test_stream (line 240) | def test_stream(
function export_prov2 (line 438) | def export_prov2(
FILE: GPT_SoVITS/sv.py
class SV (line 11) | class SV:
method __init__ (line 12) | def __init__(self, device, is_half):
method compute_embedding3 (line 24) | def compute_embedding3(self, wav):
FILE: GPT_SoVITS/text/LangSegmenter/langsegmenter.py
function full_en (line 17) | def full_en(text):
function full_cjk (line 22) | def full_cjk(text):
function split_jako (line 48) | def split_jako(tag_lang, item):
function merge_lang (line 69) | def merge_lang(lang_list, item):
class LangSegmenter (line 77) | class LangSegmenter():
method getTexts (line 90) | def getTexts(text, default_lang=""):
FILE: GPT_SoVITS/text/__init__.py
function cleaned_text_to_sequence (line 14) | def cleaned_text_to_sequence(cleaned_text, version=None):
FILE: GPT_SoVITS/text/cantonese.py
function replace_punctuation (line 95) | def replace_punctuation(text):
function text_normalize (line 106) | def text_normalize(text):
function jyuping_to_initials_finals_tones (line 118) | def jyuping_to_initials_finals_tones(jyuping_syllables):
function get_jyutping (line 176) | def get_jyutping(text):
function get_bert_feature (line 197) | def get_bert_feature(text, word2ph):
function g2p (line 203) | def g2p(text):
FILE: GPT_SoVITS/text/chinese.py
function replace_punctuation (line 47) | def replace_punctuation(text):
function replace_punctuation_with_en (line 58) | def replace_punctuation_with_en(text):
function replace_consecutive_punctuation (line 69) | def replace_consecutive_punctuation(text):
function g2p (line 76) | def g2p(text):
function _get_initials_finals (line 83) | def _get_initials_finals(word):
function _g2p (line 94) | def _g2p(segments):
function text_normalize (line 171) | def text_normalize(text):
FILE: GPT_SoVITS/text/chinese2.py
function replace_punctuation (line 62) | def replace_punctuation(text):
function g2p (line 73) | def g2p(text):
function _get_initials_finals (line 80) | def _get_initials_finals(word):
function _merge_erhua (line 142) | def _merge_erhua(initials: list[str], finals: list[str], word: str, pos:...
function _g2p (line 180) | def _g2p(segments):
function replace_punctuation_with_en (line 298) | def replace_punctuation_with_en(text):
function replace_consecutive_punctuation (line 309) | def replace_consecutive_punctuation(text):
function text_normalize (line 316) | def text_normalize(text):
FILE: GPT_SoVITS/text/cleaner.py
function clean_text (line 21) | def clean_text(text, language, version=None):
function clean_special (line 58) | def clean_special(text, language, special_s, target_symbol, version=None):
function text_to_sequence (line 85) | def text_to_sequence(text, language, version=None):
FILE: GPT_SoVITS/text/en_normalization/expend.py
function _convert_ordinal (line 63) | def _convert_ordinal(m):
function _remove_commas (line 75) | def _remove_commas(m):
function _expand_time (line 79) | def _expand_time(m):
function _expand_measurement (line 101) | def _expand_measurement(m):
function _expand_pounds (line 117) | def _expand_pounds(m):
function _expand_dollars (line 141) | def _expand_dollars(m):
function _expand_decimal_number (line 169) | def _expand_decimal_number(m):
function _expend_fraction (line 188) | def _expend_fraction(m):
function _expand_ordinal (line 222) | def _expand_ordinal(m):
function _expand_number (line 226) | def _expand_number(m):
function replace_asmd (line 252) | def replace_asmd(match) -> str:
function replace_negative_num (line 266) | def replace_negative_num(match) -> str:
function normalize (line 281) | def normalize(text):
FILE: GPT_SoVITS/text/english.py
function replace_phs (line 111) | def replace_phs(phs):
function replace_consecutive_punctuation (line 124) | def replace_consecutive_punctuation(text):
function read_dict (line 131) | def read_dict():
function read_dict_new (line 155) | def read_dict_new():
function hot_reload_hot (line 187) | def hot_reload_hot(g2p_dict):
function cache_dict (line 205) | def cache_dict(g2p_dict, file_path):
function get_dict (line 210) | def get_dict():
function get_namedict (line 223) | def get_namedict():
function text_normalize (line 233) | def text_normalize(text):
class en_G2p (line 248) | class en_G2p(G2p):
method __init__ (line 249) | def __init__(self):
method __call__ (line 270) | def __call__(self, text):
method qryword (line 309) | def qryword(self, o_word):
function g2p (line 363) | def g2p(text):
FILE: GPT_SoVITS/text/g2pw/dataset.py
function prepare_onnx_input (line 30) | def prepare_onnx_input(
function _truncate_texts (line 97) | def _truncate_texts(window_size: int, texts: List[str], query_ids: List[...
function _truncate (line 111) | def _truncate(
function get_phoneme_labels (line 143) | def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[...
function get_char_phoneme_labels (line 153) | def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[...
FILE: GPT_SoVITS/text/g2pw/g2pw.py
class G2PWPinyin (line 19) | class G2PWPinyin(Pinyin):
method __init__ (line 20) | def __init__(
method get_seg (line 43) | def get_seg(self, **kwargs):
class Converter (line 47) | class Converter(UltimateConverter):
method __init__ (line 48) | def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five...
method convert (line 55) | def convert(self, words, style, heteronym, errors, strict, **kwargs):
method _to_pinyin (line 72) | def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
function _remove_dup_items (line 90) | def _remove_dup_items(lst, remove_empty=False):
function _remove_dup_and_empty (line 100) | def _remove_dup_and_empty(lst_list):
function cache_dict (line 112) | def cache_dict(polyphonic_dict, file_path):
function get_dict (line 117) | def get_dict():
function read_dict (line 128) | def read_dict():
function correct_pronunciation (line 147) | def correct_pronunciation(word, word_pinyins):
FILE: GPT_SoVITS/text/g2pw/onnx_api.py
function predict (line 33) | def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> T...
function download_and_decompress (line 58) | def download_and_decompress(model_dir: str = "G2PWModel/"):
class G2PWOnnxConverter (line 82) | class G2PWOnnxConverter:
method __init__ (line 83) | def __init__(
method _convert_bopomofo_to_pinyin (line 178) | def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
method __call__ (line 188) | def __call__(self, sentences: List[str]) -> List[List[str]]:
method _prepare_data (line 226) | def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List...
FILE: GPT_SoVITS/text/g2pw/utils.py
function wordize_and_map (line 23) | def wordize_and_map(text: str):
function tokenize_and_map (line 59) | def tokenize_and_map(tokenizer, text: str):
function _load_config (line 86) | def _load_config(config_path: os.PathLike):
function load_config (line 132) | def load_config(config_path: os.PathLike, use_default: bool = False):
FILE: GPT_SoVITS/text/japanese.py
function get_hash (line 50) | def get_hash(fp: str) -> str:
function post_replace_ph (line 119) | def post_replace_ph(ph):
function replace_consecutive_punctuation (line 138) | def replace_consecutive_punctuation(text):
function symbols_to_japanese (line 145) | def symbols_to_japanese(text):
function preprocess_jap (line 151) | def preprocess_jap(text, with_prosody=False):
function text_normalize (line 174) | def text_normalize(text):
function pyopenjtalk_g2p_prosody (line 183) | def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
function _numeric_feature_by_regex (line 260) | def _numeric_feature_by_regex(regex, s):
function g2p (line 267) | def g2p(norm_text, with_prosody=True):
FILE: GPT_SoVITS/text/korean.py
class win_G2p (line 14) | class win_G2p(G2p):
method check_mecab (line 15) | def check_mecab(self):
function fix_g2pk2_error (line 155) | def fix_g2pk2_error(text):
function latin_to_hangul (line 170) | def latin_to_hangul(text):
function divide_hangul (line 176) | def divide_hangul(text):
function hangul_number (line 183) | def hangul_number(num, sino=True):
function number_to_hangul (line 262) | def number_to_hangul(text):
function korean_to_lazy_ipa (line 280) | def korean_to_lazy_ipa(text):
function korean_to_ipa (line 292) | def korean_to_ipa(text):
function post_replace_ph (line 301) | def post_replace_ph(ph):
function g2p (line 324) | def g2p(text):
FILE: GPT_SoVITS/text/tone_sandhi.py
class ToneSandhi (line 22) | class ToneSandhi:
method __init__ (line 23) | def __init__(self):
method _neural_sandhi (line 495) | def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> Li...
method _bu_sandhi (line 539) | def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
method _yi_sandhi (line 550) | def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
method _split_word (line 573) | def _split_word(self, word: str) -> List[str]:
method _three_sandhi (line 586) | def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
method _all_tone_three (line 626) | def _all_tone_three(self, finals: List[str]) -> bool:
method _merge_bu (line 631) | def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
method _merge_yi (line 651) | def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
method _merge_continuous_three_tones (line 679) | def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) ->...
method _is_reduplication (line 704) | def _is_reduplication(self, word: str) -> bool:
method _merge_continuous_three_tones_2 (line 708) | def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) ...
method _merge_er (line 732) | def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
method _merge_reduplication (line 741) | def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tup...
method pre_merge_for_modify (line 750) | def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tup...
method modified_tone (line 769) | def modified_tone(self, word: str, pos: str, finals: List[str]) -> Lis...
FILE: GPT_SoVITS/text/zh_normalization/char_convert.py
function tranditional_to_simplified (line 30) | def tranditional_to_simplified(text: str) -> str:
function simplified_to_traditional (line 34) | def simplified_to_traditional(text: str) -> str:
FILE: GPT_SoVITS/text/zh_normalization/chronology.py
function _time_num2str (line 22) | def _time_num2str(num_string: str) -> str:
function replace_time (line 49) | def replace_time(match) -> str:
function replace_date (line 98) | def replace_date(match) -> str:
function replace_date2 (line 122) | def replace_date2(match) -> str:
FILE: GPT_SoVITS/text/zh_normalization/num.py
function replace_frac (line 40) | def replace_frac(match) -> str:
function replace_percentage (line 61) | def replace_percentage(match) -> str:
function replace_negative_num (line 81) | def replace_negative_num(match) -> str:
function replace_default_num (line 101) | def replace_default_num(match):
function replace_asmd (line 122) | def replace_asmd(match) -> str:
function replace_power (line 153) | def replace_power(match) -> str:
function replace_positive_quantifier (line 175) | def replace_positive_quantifier(match) -> str:
function replace_number (line 194) | def replace_number(match) -> str:
function replace_range (line 228) | def replace_range(match) -> str:
function replace_to_range (line 248) | def replace_to_range(match) -> str:
function replace_vrsion_num (line 260) | def replace_vrsion_num(match) -> str:
function _get_value (line 277) | def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
function verbalize_cardinal (line 293) | def verbalize_cardinal(value_string: str) -> str:
function verbalize_digit (line 309) | def verbalize_digit(value_string: str, alt_one=False) -> str:
function num2str (line 317) | def num2str(value_string: str) -> str:
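`verbalize_digit` reads a digit string character by character, with `alt_one` swapping 一 for 幺 the way phone numbers are spoken aloud; this matches the PaddleSpeech `zh_normalization` module these files appear to be vendored from. Sketch:

```python
DIGITS = {str(i): c for i, c in enumerate("零一二三四五六七八九")}

def verbalize_digit(value_string: str, alt_one: bool = False) -> str:
    # Digit-by-digit reading, e.g. "110" -> 一一零.
    result = "".join(DIGITS[d] for d in value_string)
    if alt_one:
        result = result.replace("一", "幺")
    return result

print(verbalize_digit("110", alt_one=True))  # 幺幺零
```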
FILE: GPT_SoVITS/text/zh_normalization/phonecode.py
function phone2str (line 31) | def phone2str(phone_string: str, mobile=True) -> str:
function replace_phone (line 42) | def replace_phone(match) -> str:
function replace_mobile (line 52) | def replace_mobile(match) -> str:
FILE: GPT_SoVITS/text/zh_normalization/quantifier.py
function replace_temperature (line 42) | def replace_temperature(match) -> str:
function replace_measure (line 59) | def replace_measure(sentence) -> str:
FILE: GPT_SoVITS/text/zh_normalization/text_normlization.py
class TextNormalizer (line 61) | class TextNormalizer:
method __init__ (line 62) | def __init__(self):
method _split (line 65) | def _split(self, text: str, lang="zh") -> List[str]:
method _post_replace (line 82) | def _post_replace(self, sentence: str) -> str:
method normalize_sentence (line 130) | def normalize_sentence(self, sentence: str) -> str:
method normalize (line 172) | def normalize(self, text: str) -> List[str]:
FILE: GPT_SoVITS/utils.py
function load_checkpoint (line 23) | def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimiz...
function my_save (line 67) | def my_save(fea, path): #####fix issue: torch.save doesn't support chin...
function save_checkpoint (line 75) | def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoi...
function summarize (line 93) | def summarize(
function latest_checkpoint_path (line 112) | def latest_checkpoint_path(dir_path, regex="G_*.pth"):
function plot_spectrogram_to_numpy (line 120) | def plot_spectrogram_to_numpy(spectrogram):
function plot_alignment_to_numpy (line 145) | def plot_alignment_to_numpy(alignment, info=None):
function load_wav_to_torch (line 178) | def load_wav_to_torch(full_path):
function load_filepaths_and_text (line 183) | def load_filepaths_and_text(filename, split="|"):
function get_hparams (line 189) | def get_hparams(init=True, stage=1):
function clean_checkpoints (line 236) | def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sor...
function get_hparams_from_dir (line 263) | def get_hparams_from_dir(model_dir):
function get_hparams_from_file (line 274) | def get_hparams_from_file(config_path):
function check_git_hash (line 283) | def check_git_hash(model_dir):
function get_logger (line 309) | def get_logger(model_dir, filename="train.log"):
class HParams (line 324) | class HParams:
method __init__ (line 325) | def __init__(self, **kwargs):
method keys (line 331) | def keys(self):
method items (line 334) | def items(self):
method values (line 337) | def values(self):
method __len__ (line 340) | def __len__(self):
method __getitem__ (line 343) | def __getitem__(self, key):
method __setitem__ (line 346) | def __setitem__(self, key, value):
method __contains__ (line 349) | def __contains__(self, key):
method __repr__ (line 352) | def __repr__(self):
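`HParams` is the usual VITS-style config container: nested dicts become nested `HParams`, so values can be read as attributes or as items. A minimal sketch:

```python
class HParams:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # Recurse so hps.train.batch_size style access works.
            self.__dict__[k] = HParams(**v) if isinstance(v, dict) else v

    def __getitem__(self, key):
        return self.__dict__[key]

    def __contains__(self, key):
        return key in self.__dict__

hps = HParams(train={"batch_size": 8}, model={"hidden_channels": 192})
assert hps.train.batch_size == 8 and "model" in hps
```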
FILE: api.py
class DefaultRefer (line 177) | class DefaultRefer:
method __init__ (line 178) | def __init__(self, path, text, language):
method is_ready (line 183) | def is_ready(self) -> bool:
function is_empty (line 187) | def is_empty(*items):  # returns False if any item is non-empty
function is_full (line 194) | def is_full(*items):  # returns False if any item is empty
function clean_hifigan_model (line 204) | def clean_hifigan_model():
function clean_bigvgan_model (line 215) | def clean_bigvgan_model():
function clean_sv_cn_model (line 226) | def clean_sv_cn_model():
function init_bigvgan (line 237) | def init_bigvgan():
function init_hifigan (line 255) | def init_hifigan():
function init_sv_cn (line 285) | def init_sv_cn():
function resample (line 293) | def resample(audio_tensor, sr0, sr1, device):
function norm_spec (line 307) | def norm_spec(x):
function denorm_spec (line 311) | def denorm_spec(x):
function audio_sr (line 346) | def audio_sr(audio, sr):
class Speaker (line 359) | class Speaker:
method __init__ (line 360) | def __init__(self, name, gpt, sovits, phones=None, bert=None, prompt=N...
class Sovits (line 372) | class Sovits:
method __init__ (line 373) | def __init__(self, vq_model, hps):
function get_sovits_weights (line 381) | def get_sovits_weights(sovits_path):
class Gpt (line 467) | class Gpt:
method __init__ (line 468) | def __init__(self, max_sec, t2s_model):
function get_gpt_weights (line 477) | def get_gpt_weights(gpt_path):
function change_gpt_sovits_weights (line 494) | def change_gpt_sovits_weights(gpt_path, sovits_path):
function get_bert_feature (line 505) | def get_bert_feature(text, word2ph):
function clean_text_inf (line 522) | def clean_text_inf(text, language, version):
function get_bert_inf (line 529) | def get_bert_inf(phones, word2ph, norm_text, language):
function get_phones_and_bert (line 545) | def get_phones_and_bert(text, language, version, final=False):
class DictToAttrRecursive (line 612) | class DictToAttrRecursive(dict):
method __init__ (line 613) | def __init__(self, input_dict):
method __getattr__ (line 621) | def __getattr__(self, item):
method __setattr__ (line 627) | def __setattr__(self, key, value):
method __delattr__ (line 633) | def __delattr__(self, item):
function get_spepc (line 640) | def get_spepc(hps, filename, dtype, device, is_v2pro=False):
function pack_audio (line 670) | def pack_audio(audio_bytes, data, rate):
function pack_ogg (line 682) | def pack_ogg(audio_bytes, data, rate):
function pack_raw (line 728) | def pack_raw(audio_bytes, data, rate):
function pack_wav (line 734) | def pack_wav(audio_bytes, rate):
function pack_aac (line 746) | def pack_aac(audio_bytes, data, rate):
function read_clean_buffer (line 783) | def read_clean_buffer(audio_bytes):
function cut_text (line 791) | def cut_text(text, punc):
function only_punc (line 809) | def only_punc(text):
function get_tts_wav (line 830) | def get_tts_wav(
function handle_control (line 1071) | def handle_control(command):
function handle_change (line 1079) | def handle_change(path, text, language):
function handle (line 1100) | def handle(
function set_model (line 1302) | async def set_model(request: Request):
function set_model (line 1310) | async def set_model(
function control (line 1318) | async def control(request: Request):
function control (line 1324) | async def control(command: str = None):
function change_refer (line 1329) | async def change_refer(request: Request):
function change_refer (line 1337) | async def change_refer(refer_wav_path: str = None, prompt_text: str = No...
function tts_endpoint (line 1342) | async def tts_endpoint(request: Request):
function tts_endpoint (line 1362) | async def tts_endpoint(
FILE: api_v2.py
class TTS_Request (line 154) | class TTS_Request(BaseModel):
function pack_ogg (line 181) | def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
function pack_raw (line 227) | def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
function pack_wav (line 232) | def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
function pack_aac (line 238) | def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
function pack_audio (line 268) | def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_ty...
function wave_header_chunk (line 282) | def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sampl...
function handle_control (line 297) | def handle_control(command: str):
function check_params (line 305) | def check_params(req: dict):
function tts_handle (line 345) | async def tts_handle(req: dict):
function control (line 449) | async def control(command: str = None):
function tts_get_endpoint (line 456) | async def tts_get_endpoint(
function tts_post_endpoint (line 512) | async def tts_post_endpoint(request: TTS_Request):
function set_refer_aduio (line 518) | async def set_refer_aduio(refer_audio_path: str = None):
function set_gpt_weights (line 546) | async def set_gpt_weights(weights_path: str = None):
function set_sovits_weights (line 558) | async def set_sovits_weights(weights_path: str = None):
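`wave_header_chunk` is what lets the API stream audio: it emits a RIFF/WAV header up front so clients can begin playback while raw PCM chunks follow. A sketch using the standard-library `wave` module (parameter defaults assumed):

```python
import io
import wave

def wave_header_chunk(frame_input=b"", channels=1, sample_width=2,
                      sample_rate=32000):
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wav:
        wav.setnchannels(channels)
        wav.setsampwidth(sample_width)  # bytes per sample (2 = 16-bit PCM)
        wav.setframerate(sample_rate)
        wav.writeframes(frame_input)    # optionally seed with first frames
    buf.seek(0)
    return buf.read()
```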
FILE: config.py
function custom_sort_key (line 78) | def custom_sort_key(s):
function get_weights_names (line 86) | def get_weights_names():
function change_choices (line 116) | def change_choices():
function get_device_dtype_sm (line 149) | def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, fl...
class Config (line 198) | class Config:
method __init__ (line 199) | def __init__(self):
FILE: tools/AP_BWE_main/datasets1/dataset.py
function amp_pha_stft (line 9) | def amp_pha_stft(audio, n_fft, hop_size, win_size, center=True):
function amp_pha_istft (line 30) | def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):
function get_dataset_filelist (line 39) | def get_dataset_filelist(a):
class Dataset (line 49) | class Dataset(torch.utils.data.Dataset):
method __init__ (line 50) | def __init__(
method __getitem__ (line 76) | def __getitem__(self, index):
method __len__ (line 107) | def __len__(self):
FILE: tools/AP_BWE_main/models/model.py
function get_padding (line 8) | def get_padding(kernel_size, dilation=1):
function init_weights (line 12) | def init_weights(m, mean=0.0, std=0.01):
class ConvNeXtBlock (line 24) | class ConvNeXtBlock(nn.Module):
method __init__ (line 36) | def __init__(
method forward (line 56) | def forward(self, x, cond_embedding_id=None):
class APNet_BWE_Model (line 76) | class APNet_BWE_Model(torch.nn.Module):
method __init__ (line 77) | def __init__(self, h):
method _init_weights (line 117) | def _init_weights(self, m):
method forward (line 122) | def forward(self, mag_nb, pha_nb):
class DiscriminatorP (line 147) | class DiscriminatorP(torch.nn.Module):
method __init__ (line 148) | def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=...
method forward (line 163) | def forward(self, x):
class MultiPeriodDiscriminator (line 186) | class MultiPeriodDiscriminator(torch.nn.Module):
method __init__ (line 187) | def __init__(self):
method forward (line 199) | def forward(self, y, y_hat):
class MultiResolutionAmplitudeDiscriminator (line 215) | class MultiResolutionAmplitudeDiscriminator(nn.Module):
method __init__ (line 216) | def __init__(
method forward (line 226) | def forward(
class DiscriminatorAR (line 245) | class DiscriminatorAR(nn.Module):
method __init__ (line 246) | def __init__(
method forward (line 270) | def forward(
method spectrogram (line 294) | def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
class MultiResolutionPhaseDiscriminator (line 309) | class MultiResolutionPhaseDiscriminator(nn.Module):
method __init__ (line 310) | def __init__(
method forward (line 320) | def forward(
class DiscriminatorPR (line 339) | class DiscriminatorPR(nn.Module):
method __init__ (line 340) | def __init__(
method forward (line 364) | def forward(
method spectrogram (line 388) | def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
function feature_loss (line 403) | def feature_loss(fmap_r, fmap_g):
function discriminator_loss (line 412) | def discriminator_loss(disc_real_outputs, disc_generated_outputs):
function generator_loss (line 426) | def generator_loss(disc_outputs):
function phase_losses (line 437) | def phase_losses(phase_r, phase_g):
function anti_wrapping_function (line 445) | def anti_wrapping_function(x):
function stft_mag (line 449) | def stft_mag(audio, n_fft=2048, hop_length=512):
function cal_snr (line 456) | def cal_snr(pred, target):
function cal_lsd (line 461) | def cal_lsd(pred, target):
FILE: tools/asr/config.py
function get_models (line 1) | def get_models():
FILE: tools/asr/fasterwhisper_asr.py
function download_model (line 42) | def download_model(model_size: str):
function execute_asr (line 104) | def execute_asr(input_folder, output_folder, model_path, language, preci...
FILE: tools/asr/funasr_asr.py
function only_asr (line 14) | def only_asr(input_file, language):
function create_model (line 24) | def create_model(language="zh"):
function execute_asr (line 73) | def execute_asr(input_folder, output_folder, model_size, language):
FILE: tools/audio_sr.py
class AP_BWE (line 16) | class AP_BWE:
method __init__ (line 17) | def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
method to (line 36) | def to(self, *arg, **kwargs):
method __call__ (line 41) | def __call__(self, audio, orig_sampling_rate):
FILE: tools/cmd-denoise.py
function execute_denoise (line 14) | def execute_denoise(input_folder, output_folder):
FILE: tools/i18n/i18n.py
function load_language_list (line 8) | def load_language_list(language):
function scan_language_list (line 14) | def scan_language_list():
class I18nAuto (line 22) | class I18nAuto:
method __init__ (line 23) | def __init__(self, language=None):
method __call__ (line 32) | def __call__(self, key):
method __repr__ (line 35) | def __repr__(self):
FILE: tools/i18n/scan_i18n.py
function extract_i18n_strings (line 15) | def extract_i18n_strings(node):
function scan_i18n_strings (line 29) | def scan_i18n_strings():
function update_i18n_json (line 56) | def update_i18n_json(json_file, standard_keys):
FILE: tools/my_utils.py
function load_audio (line 16) | def load_audio(file, sr):
function clean_path (line 40) | def clean_path(path_str: str):
function check_for_existance (line 49) | def check_for_existance(file_list: list = None, is_train=False, is_datas...
function check_details (line 90) | def check_details(path_list=None, is_train=False, is_dataset_processing=...
function load_cudnn (line 140) | def load_cudnn():
function load_nvrtc (line 187) | def load_nvrtc():
FILE: tools/slice_audio.py
function slice (line 13) | def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, ...
FILE: tools/slicer2.py
function get_rms (line 5) | def get_rms(
class Slicer (line 38) | class Slicer:
method __init__ (line 39) | def __init__(
method _apply_slice (line 60) | def _apply_slice(self, waveform, begin, end):
method slice (line 67) | def slice(self, waveform):
function main (line 155) | def main():
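`Slicer` cuts recordings on low-energy stretches, and `get_rms` supplies the energy signal: it frames the waveform and computes per-frame root-mean-square amplitude. A simplified sketch (the source version strides along arbitrary axes):

```python
import numpy as np

def get_rms(y: np.ndarray, frame_length=2048, hop_length=512) -> np.ndarray:
    # Center-pad, window into overlapping frames, take RMS per frame.
    pad = frame_length // 2
    y = np.pad(y, (pad, pad), mode="constant")
    frames = np.lib.stride_tricks.sliding_window_view(y, frame_length)
    frames = frames[::hop_length]
    return np.sqrt(np.mean(frames.astype(np.float64) ** 2, axis=-1))
```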
FILE: tools/subfix_webui.py
function reload_data (line 38) | def reload_data(index, batch):
function b_change_index (line 50) | def b_change_index(index, batch):
function b_next_index (line 80) | def b_next_index(index, batch):
function b_previous_index (line 88) | def b_previous_index(index, batch):
function b_submit_change (line 96) | def b_submit_change(*text_list):
function b_delete_audio (line 110) | def b_delete_audio(*checkbox_list):
function b_invert_selection (line 134) | def b_invert_selection(*checkbox_list):
function get_next_path (line 139) | def get_next_path(filename):
function b_audio_split (line 149) | def b_audio_split(audio_breakpoint, *checkbox_list):
function b_merge_audio (line 178) | def b_merge_audio(interval_r, *checkbox_list):
function b_save_json (line 222) | def b_save_json():
function b_save_list (line 228) | def b_save_list():
function b_load_json (line 238) | def b_load_json():
function b_load_list (line 246) | def b_load_list():
function b_save_file (line 262) | def b_save_file():
function b_load_file (line 269) | def b_load_file():
function set_global (line 276) | def set_global(load_json, load_list, json_key_text, json_key_path, batch):
FILE: tools/uvr5/bs_roformer/attend.py
function exists (line 7) | def exists(val):
function default (line 11) | def default(v, d):
class Attend (line 15) | class Attend(nn.Module):
method __init__ (line 16) | def __init__(self, dropout=0.0, flash=False, scale=None):
method flash_attn (line 27) | def flash_attn(self, q, k, v):
method forward (line 38) | def forward(self, q, k, v):
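The Attend module toggles between a manual softmax attention and PyTorch's fused kernels. A condensed sketch of that dispatch (PyTorch >= 2.0 routes scaled_dot_product_attention to flash or memory-efficient kernels when the hardware allows):

```python
import torch
import torch.nn.functional as F

def sketch_attend(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                  dropout_p: float = 0.0, flash: bool = True) -> torch.Tensor:
    if flash:
        # Fused path: one kernel, no materialized attention matrix.
        return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
    # Reference path: explicit similarity matrix + softmax.
    scale = q.shape[-1] ** -0.5
    attn = (q @ k.transpose(-2, -1) * scale).softmax(dim=-1)
    return attn @ v
```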
FILE: tools/uvr5/bs_roformer/bs_roformer.py
function exists (line 23) | def exists(val):
function default (line 27) | def default(v, d):
function pack_one (line 31) | def pack_one(t, pattern):
function unpack_one (line 35) | def unpack_one(t, ps, pattern):
function l2norm (line 42) | def l2norm(t):
class RMSNorm (line 46) | class RMSNorm(Module):
method __init__ (line 47) | def __init__(self, dim):
method forward (line 52) | def forward(self, x):
class FeedForward (line 59) | class FeedForward(Module):
method __init__ (line 60) | def __init__(self, dim, mult=4, dropout=0.0):
method forward (line 72) | def forward(self, x):
class Attention (line 76) | class Attention(Module):
method __init__ (line 77) | def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embe...
method forward (line 94) | def forward(self, x):
class LinearAttention (line 112) | class LinearAttention(Module):
method __init__ (line 118) | def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False,...
method forward (line 133) | def forward(self, x):
class Transformer (line 146) | class Transformer(Module):
method __init__ (line 147) | def __init__(
method forward (line 182) | def forward(self, x):
class BandSplit (line 193) | class BandSplit(Module):
method __init__ (line 195) | def __init__(self, dim, dim_inputs: Tuple[int, ...]):
method forward (line 205) | def forward(self, x):
function MLP (line 216) | def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
class MaskEstimator (line 235) | class MaskEstimator(Module):
method __init__ (line 237) | def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expans...
method forward (line 250) | def forward(self, x):
class BSRoformer (line 330) | class BSRoformer(Module):
method __init__ (line 332) | def __init__(
method forward (line 443) | def forward(self, raw_audio, target=None, return_loss_breakdown=False):
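BandSplit is the structural heart of the BS-Roformer: the STFT features are cut into frequency bands and each band gets its own normalization and projection into a shared model dimension, so the transformer can attend across bands and time. A minimal sketch, using LayerNorm where the repo defines its own RMSNorm:

```python
import torch
import torch.nn as nn

class SketchBandSplit(nn.Module):
    """Project each frequency band (a slice of STFT features) to a shared model dim."""

    def __init__(self, dim: int, dim_inputs: tuple[int, ...]):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = nn.ModuleList(
            [nn.Sequential(nn.LayerNorm(d), nn.Linear(d, dim)) for d in dim_inputs]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, sum(dim_inputs)) -> (batch, time, n_bands, dim)
        bands = x.split(self.dim_inputs, dim=-1)
        return torch.stack([f(b) for f, b in zip(self.to_features, bands)], dim=-2)
```

MaskEstimator is the mirror image: per-band MLPs map the transformer output back to complex STFT masks of each band's original width.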
FILE: tools/uvr5/bs_roformer/mel_band_roformer.py
function exists (line 26) | def exists(val):
function default (line 30) | def default(v, d):
function pack_one (line 34) | def pack_one(t, pattern):
function unpack_one (line 38) | def unpack_one(t, ps, pattern):
function pad_at_dim (line 42) | def pad_at_dim(t, pad, dim=-1, value=0.0):
function l2norm (line 48) | def l2norm(t):
class RMSNorm (line 55) | class RMSNorm(Module):
method __init__ (line 56) | def __init__(self, dim):
method forward (line 61) | def forward(self, x):
class FeedForward (line 68) | class FeedForward(Module):
method __init__ (line 69) | def __init__(self, dim, mult=4, dropout=0.0):
method forward (line 81) | def forward(self, x):
class Attention (line 85) | class Attention(Module):
method __init__ (line 86) | def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embe...
method forward (line 103) | def forward(self, x):
class LinearAttention (line 121) | class LinearAttention(Module):
method __init__ (line 127) | def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False,...
method forward (line 142) | def forward(self, x):
class Transformer (line 155) | class Transformer(Module):
method __init__ (line 156) | def __init__(
method forward (line 191) | def forward(self, x):
class BandSplit (line 202) | class BandSplit(Module):
method __init__ (line 204) | def __init__(self, dim, dim_inputs: Tuple[int, ...]):
method forward (line 214) | def forward(self, x):
function MLP (line 225) | def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
class MaskEstimator (line 244) | class MaskEstimator(Module):
method __init__ (line 246) | def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expans...
method forward (line 259) | def forward(self, x):
class MelBandRoformer (line 274) | class MelBandRoformer(Module):
method __init__ (line 276) | def __init__(
method forward (line 422) | def forward(self, raw_audio, target=None, return_loss_breakdown=False):
FILE: tools/uvr5/bsroformer.py
class Roformer_Loader (line 16) | class Roformer_Loader:
method get_config (line 17) | def get_config(self, config_path):
method get_default_config (line 23) | def get_default_config(self):
method get_model_from_config (line 97) | def get_model_from_config(self):
method demix_track (line 111) | def demix_track(self, model, mix, device):
method run_folder (line 199) | def run_folder(self, input, vocal_root, others_root, format):
method save_audio (line 248) | def save_audio(self, path, data, sr, format):
method __init__ (line 262) | def __init__(self, model_path, config_path, device, is_half):
method _path_audio_ (line 303) | def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=...
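demix_track runs the separator over long audio in overlapping chunks and blends the results, since full tracks do not fit in memory. A simplified sketch with uniform overlap averaging — the repo's version additionally applies windowing and batching, and `model` here is assumed to be any (batch, channels, time) -> same-shape callable:

```python
import torch
import torch.nn.functional as F

def sketch_demix_chunked(model, mix: torch.Tensor, chunk: int, hop: int,
                         device: str = "cpu") -> torch.Tensor:
    """Separate `mix` (channels, samples) chunk by chunk, averaging overlaps."""
    n = mix.shape[-1]
    out = torch.zeros_like(mix)
    weight = torch.zeros(n)
    for start in range(0, n, hop):
        end = min(start + chunk, n)
        seg = F.pad(mix[..., start:end], (0, chunk - (end - start)))  # pad final chunk
        with torch.no_grad():
            est = model(seg.unsqueeze(0).to(device)).squeeze(0).cpu()
        out[..., start:end] += est[..., : end - start]
        weight[start:end] += 1
    return out / weight.clamp(min=1)
```

With hop < chunk, every sample is covered by several windows, which suppresses boundary artifacts at the chunk seams.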
FILE: tools/uvr5/mdxnet.py
class ConvTDFNetTrim (line 15) | class ConvTDFNetTrim:
method __init__ (line 16) | def __init__(self, device, model_name, target_name, L, dim_f, dim_t, n...
method stft (line 35) | def stft(self, x):
method istft (line 50) | def istft(self, x, freq_pad=None):
function get_models (line 62) | def get_models(device, dim_f, dim_t, n_fft):
class Predictor (line 74) | class Predictor:
method __init__ (line 75) | def __init__(self, args):
method demix (line 91) | def demix(self, mix):
method demix_base (line 125) | def demix_base(self, mixes, margin_size):
method prediction (line 172) | def prediction(self, m, vocal_root, others_root, format):
class MDXNetDereverb (line 208) | class MDXNetDereverb:
method __init__ (line 209) | def __init__(self, chunks):
method _path_audio_ (line 222) | def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=...
FILE: tools/uvr5/vr.py
class AudioPre (line 19) | class AudioPre:
method __init__ (line 20) | def __init__(self, agg, model_path, device, is_half, tta=False):
method _path_audio_ (line 45) | def _path_audio_(self, music_file, ins_root=None, vocal_root=None, for...
class AudioPreDeEcho (line 190) | class AudioPreDeEcho:
method __init__ (line 191) | def __init__(self, agg, model_path, device, is_half, tta=False):
method _path_audio_ (line 217) | def _path_audio_(
FILE: tools/uvr5/webui.py
function html_left (line 33) | def html_left(text, label="p"):
function html_center (line 39) | def html_center(text, label="p"):
function uvr (line 45) | def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg...
FILE: webui.py
function set_default (line 104) | def set_default():
function fix_gpu_number (line 145) | def fix_gpu_number(input): # clamp an out-of-range GPU number back into range
function fix_gpu_numbers (line 154) | def fix_gpu_numbers(inputs):
function check_pretrained_is_exist (line 167) | def check_pretrained_is_exist(version):
function kill_proc_tree (line 211) | def kill_proc_tree(pid, including_parent=True):
function kill_process (line 234) | def kill_process(pid, process_name=""):
function process_info (line 244) | def process_info(process_name="", indicator=""):
function change_label (line 270) | def change_label(path_list):
function change_uvr5 (line 301) | def change_uvr5():
function change_tts_inference (line 331) | def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_...
function open_asr (line 371) | def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_la...
function close_asr (line 417) | def close_asr():
function open_denoise (line 432) | def open_denoise(denoise_inp_dir, denoise_opt_dir):
function close_denoise (line 473) | def close_denoise():
function open1Ba (line 489) | def open1Ba(
function close1Ba (line 574) | def close1Ba():
function open1Bb (line 590) | def open1Bb(
function close1Bb (line 666) | def close1Bb():
function open_slice (line 682) | def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_s...
function close_slice (line 760) | def close_slice():
function open1a (line 780) | def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained...
function close1a (line 849) | def close1a():
function open1b (line 870) | def open1b(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pr...
function close1b (line 940) | def close1b():
function open1c (line 960) | def open1c(version, inp_text, inp_wav_dir, exp_name, gpu_numbers, pretra...
function close1c (line 1026) | def close1c():
function open1abc (line 1046) | def open1abc(
function close1abc (line 1248) | def close1abc():
function switch_version (line 1264) | def switch_version(version_):
function sync (line 1301) | def sync(text):
function change_lang_choices (line 1434) | def change_lang_choices(key): # update the selectable languages based on the chosen model
function change_size_choices (line 1437) | def change_size_choices(key): # update the selectable model sizes based on the chosen model
function change_precision_choices (line 1440) | def change_precision_choices(key): # update the selectable precisions based on the chosen model
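Several webui.py handlers spawn training and preprocessing subprocesses, and kill_proc_tree reaps the whole process tree when the user stops a job. A sketch of the same behavior with psutil:

```python
import psutil

def sketch_kill_proc_tree(pid: int, including_parent: bool = True) -> None:
    """Terminate a process and all of its descendants."""
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    children = parent.children(recursive=True)
    for child in children:
        try:
            child.kill()
        except psutil.NoSuchProcess:
            pass
    psutil.wait_procs(children, timeout=5)  # reap children before the parent
    if including_parent:
        try:
            parent.kill()
        except psutil.NoSuchProcess:
            pass
```

Killing the tree rather than just the parent matters here because the training scripts themselves fork data-loader workers that would otherwise be orphaned.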
Condensed preview — 256 files, each showing path, character count, and a content snippet (full structured content: 11,345K chars).
[
{
"path": ".dockerignore",
"chars": 3689,
"preview": "GPT_SoVITS/pretrained_models/*\ntools/asr/models/*\ntools/uvr5/uvr5_weights/*\n\n.git\n.DS_Store\n.vscode\n*.pyc\nenv\nruntime\n.i"
},
{
"path": ".github/build_windows_packages.ps1",
"chars": 7563,
"preview": "$ErrorActionPreference = \"Stop\"\n\nWrite-Host \"Current location: $(Get-Location)\"\n\n$cuda = $env:TORCH_CUDA\nif (-not $cuda)"
},
{
"path": ".github/workflows/build_windows_packages.yaml",
"chars": 1064,
"preview": "name: Build and Upload Windows Package\n\non:\n workflow_dispatch:\n inputs:\n date:\n description: \"Date suff"
},
{
"path": ".github/workflows/docker-publish.yaml",
"chars": 8799,
"preview": "name: Build and Publish Docker Image\n\non:\n workflow_dispatch:\n\njobs:\n generate-meta:\n runs-on: ubuntu-22.04\n out"
},
{
"path": ".gitignore",
"chars": 3677,
"preview": ".DS_Store\n.vscode\n__pycache__\n*.pyc\nenv\nruntime\n.idea\noutput\nlogs\nSoVITS_weights*/\nGPT_weights*/\nTEMP\nweight.json\nffmpeg"
},
{
"path": ".pre-commit-config.yaml",
"chars": 370,
"preview": "ci:\n autoupdate_schedule: monthly\n\nrepos:\n- repo: https://github.com/astral-sh/ruff-pre-commit\n rev: v0.11.7\n hooks:\n"
},
{
"path": "Colab-Inference.ipynb",
"chars": 5036,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"<a href=\\\"https://c"
},
{
"path": "Colab-WebUI.ipynb",
"chars": 2740,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {\n \"colab_type\": \"text\",\n \"id\": \"vie"
},
{
"path": "Docker/install_wrapper.sh",
"chars": 627,
"preview": "#!/bin/bash\n\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" &>/dev/null && pwd)\"\n\ncd \"$SCRIPT_DIR\" || exit 1\n\ncd .. |"
},
{
"path": "Docker/miniforge_install.sh",
"chars": 2333,
"preview": "#!/bin/bash\n\nset -e\n\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" &>/dev/null && pwd)\"\n\ncd \"$SCRIPT_DIR\" || exit 1\n"
},
{
"path": "Dockerfile",
"chars": 1550,
"preview": "ARG CUDA_VERSION=12.6\nARG TORCH_BASE=full\n\nFROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}\n\nLABEL maintainer=\""
},
{
"path": "GPT_SoVITS/AR/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/AR/data/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/AR/data/bucket_sampler.py",
"chars": 5665,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/bucket_sampler.py\n# referen"
},
{
"path": "GPT_SoVITS/AR/data/data_module.py",
"chars": 2898,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/data_module.py\n# reference:"
},
{
"path": "GPT_SoVITS/AR/data/dataset.py",
"chars": 11768,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/data/dataset.py\n# reference: htt"
},
{
"path": "GPT_SoVITS/AR/models/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/AR/models/t2s_lightning_module.py",
"chars": 4874,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py\n#"
},
{
"path": "GPT_SoVITS/AR/models/t2s_lightning_module_onnx.py",
"chars": 3590,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_lightning_module.py\n#"
},
{
"path": "GPT_SoVITS/AR/models/t2s_model.py",
"chars": 34376,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py\n# reference:"
},
{
"path": "GPT_SoVITS/AR/models/t2s_model_onnx.py",
"chars": 13978,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py\n# reference:"
},
{
"path": "GPT_SoVITS/AR/models/utils.py",
"chars": 10566,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/utils.py\n# reference: htt"
},
{
"path": "GPT_SoVITS/AR/modules/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/AR/modules/activation.py",
"chars": 19742,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py\nfrom typing import Optional, T"
},
{
"path": "GPT_SoVITS/AR/modules/activation_onnx.py",
"chars": 6535,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py\nfrom typing import Optional, T"
},
{
"path": "GPT_SoVITS/AR/modules/embedding.py",
"chars": 2640,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py\nimport math\n\nimport torch\nfrom "
},
{
"path": "GPT_SoVITS/AR/modules/embedding_onnx.py",
"chars": 2030,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/embedding.py\nimport math\n\nimport torch\nfrom "
},
{
"path": "GPT_SoVITS/AR/modules/lr_schedulers.py",
"chars": 2580,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/modules/lr_schedulers.py\n# refer"
},
{
"path": "GPT_SoVITS/AR/modules/optim.py",
"chars": 25611,
"preview": "# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)\n#\n# See ../LICENSE for clarification regarding multip"
},
{
"path": "GPT_SoVITS/AR/modules/patched_mha_with_cache.py",
"chars": 19107,
"preview": "from torch.nn.functional import *\r\nfrom torch.nn.functional import (\r\n _mha_shape_check,\r\n _canonical_mask,\r\n _"
},
{
"path": "GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py",
"chars": 2969,
"preview": "from torch.nn.functional import *\r\nfrom torch.nn.functional import (\r\n _canonical_mask,\r\n)\r\n\r\n\r\ndef multi_head_attent"
},
{
"path": "GPT_SoVITS/AR/modules/scaling.py",
"chars": 12566,
"preview": "# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey)\n#\n# See ../../../../LICENSE for clarification regarding"
},
{
"path": "GPT_SoVITS/AR/modules/transformer.py",
"chars": 12395,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py\nimport copy\nimport numbers\nfr"
},
{
"path": "GPT_SoVITS/AR/modules/transformer_onnx.py",
"chars": 9441,
"preview": "# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/transformer.py\nimport copy\nimport numbers\nfr"
},
{
"path": "GPT_SoVITS/AR/text_processing/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/AR/text_processing/phonemizer.py",
"chars": 2560,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/phonemizer.py\n# "
},
{
"path": "GPT_SoVITS/AR/text_processing/symbols.py",
"chars": 614,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/text_processing/symbols.py\n# ref"
},
{
"path": "GPT_SoVITS/AR/utils/__init__.py",
"chars": 953,
"preview": "import re\n\n\ndef str2bool(str):\n return True if str.lower() == \"true\" else False\n\n\ndef get_newest_ckpt(string_list):\n "
},
{
"path": "GPT_SoVITS/AR/utils/initialize.py",
"chars": 1321,
"preview": "#!/usr/bin/env python3\n\"\"\"Initialize modules for espnet2 neural networks.\"\"\"\n\nimport torch\nfrom typeguard import check_a"
},
{
"path": "GPT_SoVITS/AR/utils/io.py",
"chars": 893,
"preview": "import sys\n\nimport torch\nimport yaml\n\n\ndef load_yaml_config(path):\n with open(path) as f:\n config = yaml.full_"
},
{
"path": "GPT_SoVITS/BigVGAN/LICENSE",
"chars": 1076,
"preview": "MIT License\n\nCopyright (c) 2024 NVIDIA CORPORATION.\n\nPermission is hereby granted, free of charge, to any person obtaini"
},
{
"path": "GPT_SoVITS/BigVGAN/README.md",
"chars": 17384,
"preview": "## BigVGAN: A Universal Neural Vocoder with Large-Scale Training\n\n#### Sang-gil Lee, Wei Ping, Boris Ginsburg, Bryan Cat"
},
{
"path": "GPT_SoVITS/BigVGAN/activations.py",
"chars": 4509,
"preview": "# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.\n# LICENSE is in incl_license"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/activation1d.py",
"chars": 2431,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\nimport torch\nimport torch.nn as nn\nfrom al"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation.cpp",
"chars": 977,
"preview": "/* coding=utf-8\n * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.\n *\n * Licensed under the Apache License"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/anti_alias_activation_cuda.cu",
"chars": 10328,
"preview": "/* coding=utf-8\n * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.\n *\n * Licensed under the Apache License"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/compat.h",
"chars": 893,
"preview": "/* coding=utf-8\n * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.\n *\n * Licensed under the Apache License"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/load.py",
"chars": 2566,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\nimport os\nimport pathlib\nimport subprocess"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/cuda/type_shim.h",
"chars": 5838,
"preview": "/* coding=utf-8\n * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.\n *\n * Licensed under the Apache License"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/torch/__init__.py",
"chars": 200,
"preview": "# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0\n# LICENSE is in incl_licens"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/torch/act.py",
"chars": 825,
"preview": "# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0\n# LICENSE is in incl_licens"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/torch/filter.py",
"chars": 3395,
"preview": "# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0\n# LICENSE is in incl_licens"
},
{
"path": "GPT_SoVITS/BigVGAN/alias_free_activation/torch/resample.py",
"chars": 1715,
"preview": "# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0\n# LICENSE is in incl_licens"
},
{
"path": "GPT_SoVITS/BigVGAN/bigvgan.py",
"chars": 16888,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\n# Adapted from https://github.com/jik876/h"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_22khz_80band.json",
"chars": 1015,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 32,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ad"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_24khz_100band.json",
"chars": 1017,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 32,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ad"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_base_22khz_80band.json",
"chars": 1008,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 32,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ad"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_base_24khz_100band.json",
"chars": 1010,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 32,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ad"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_256x.json",
"chars": 1377,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 4,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ada"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_v2_22khz_80band_fmax8k_256x.json",
"chars": 1373,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 4,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ada"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_v2_24khz_100band_256x.json",
"chars": 1374,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 4,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ada"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_256x.json",
"chars": 1374,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 4,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ada"
},
{
"path": "GPT_SoVITS/BigVGAN/configs/bigvgan_v2_44khz_128band_512x.json",
"chars": 1371,
"preview": "{\n \"resblock\": \"1\",\n \"num_gpus\": 0,\n \"batch_size\": 4,\n \"learning_rate\": 0.0001,\n \"adam_b1\": 0.8,\n \"ada"
},
{
"path": "GPT_SoVITS/BigVGAN/discriminators.py",
"chars": 21095,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\n# Adapted from https://github.com/jik876/h"
},
{
"path": "GPT_SoVITS/BigVGAN/env.py",
"chars": 511,
"preview": "# Adapted from https://github.com/jik876/hifi-gan under the MIT license.\n# LICENSE is in incl_licenses directory.\n\nimp"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_1",
"chars": 1067,
"preview": "MIT License\n\nCopyright (c) 2020 Jungil Kong\n\nPermission is hereby granted, free of charge, to any person obtaining a cop"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_2",
"chars": 1068,
"preview": "MIT License\n\nCopyright (c) 2020 Edward Dixon\n\nPermission is hereby granted, free of charge, to any person obtaining a co"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_3",
"chars": 11355,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_4",
"chars": 1524,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2019, Seungwon Park 박승원\nAll rights reserved.\n\nRedistribution and use in source and b"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_5",
"chars": 1057,
"preview": "Copyright 2020 Alexandre Défossez\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this "
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_6",
"chars": 1073,
"preview": "MIT License\n\nCopyright (c) 2023-present, Descript\n\nPermission is hereby granted, free of charge, to any person obtaining"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_7",
"chars": 1069,
"preview": "MIT License\n\nCopyright (c) 2023 Charactr Inc.\n\nPermission is hereby granted, free of charge, to any person obtaining a c"
},
{
"path": "GPT_SoVITS/BigVGAN/incl_licenses/LICENSE_8",
"chars": 1063,
"preview": "MIT License\n\nCopyright (c) 2023 Amphion\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof"
},
{
"path": "GPT_SoVITS/BigVGAN/inference.py",
"chars": 2597,
"preview": "# Adapted from https://github.com/jik876/hifi-gan under the MIT license.\n# LICENSE is in incl_licenses directory.\n\nfro"
},
{
"path": "GPT_SoVITS/BigVGAN/inference_e2e.py",
"chars": 2851,
"preview": "# Adapted from https://github.com/jik876/hifi-gan under the MIT license.\n# LICENSE is in incl_licenses directory.\n\nfro"
},
{
"path": "GPT_SoVITS/BigVGAN/loss.py",
"chars": 8020,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\n# Adapted from https://github.com/jik876/h"
},
{
"path": "GPT_SoVITS/BigVGAN/meldataset.py",
"chars": 14632,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\n# Adapted from https://github.com/jik876/h"
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/.gitkeep",
"chars": 1,
"preview": "\n"
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/bias.md",
"chars": 664,
"preview": "| Field | Response "
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/explainability.md",
"chars": 4199,
"preview": "| Field | Response "
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/overview.md",
"chars": 7166,
"preview": "# Model Overview\n\n## Description:\n\nBigVGAN is a generative AI model specialized in synthesizing audio waveforms using Me"
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/privacy.md",
"chars": 2632,
"preview": "| Field "
},
{
"path": "GPT_SoVITS/BigVGAN/nv-modelcard++/safety.md",
"chars": 1584,
"preview": "| Field | Response "
},
{
"path": "GPT_SoVITS/BigVGAN/requirements.txt",
"chars": 122,
"preview": "torch\nnumpy\nlibrosa>=0.8.1\nscipy\ntensorboard\nsoundfile\nmatplotlib\npesq\nauraloss\ntqdm\nnnAudio\nninja\nhuggingface_hub>=0.23"
},
{
"path": "GPT_SoVITS/BigVGAN/tests/test_activation.py",
"chars": 1936,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\nimport os\nimport sys\n\n# to import modules "
},
{
"path": "GPT_SoVITS/BigVGAN/tests/test_activation_snake_beta.py",
"chars": 1949,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\nimport os\nimport sys\n\n# to import modules "
},
{
"path": "GPT_SoVITS/BigVGAN/tests/test_cuda_vs_torch_model.py",
"chars": 8040,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\nimport os\nimport sys\n\n# to import modules "
},
{
"path": "GPT_SoVITS/BigVGAN/train.py",
"chars": 27375,
"preview": "# Copyright (c) 2024 NVIDIA CORPORATION.\n# Licensed under the MIT license.\n\n# Adapted from https://github.com/jik876/h"
},
{
"path": "GPT_SoVITS/BigVGAN/utils0.py",
"chars": 2564,
"preview": "# Adapted from https://github.com/jik876/hifi-gan under the MIT license.\n# LICENSE is in incl_licenses directory.\n\nimp"
},
{
"path": "GPT_SoVITS/TTS_infer_pack/TTS.py",
"chars": 80233,
"preview": "import gc\nimport math\nimport os\nimport random\nimport sys\nimport time\nimport traceback\nfrom copy import deepcopy\n\nimport "
},
{
"path": "GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py",
"chars": 9023,
"preview": "import os\nimport sys\nimport threading\n\nfrom tqdm import tqdm\n\nnow_dir = os.getcwd()\nsys.path.append(now_dir)\n\nimport re\n"
},
{
"path": "GPT_SoVITS/TTS_infer_pack/__init__.py",
"chars": 44,
"preview": "from . import TTS, text_segmentation_method\n"
},
{
"path": "GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py",
"chars": 4485,
"preview": "import re\nfrom typing import Callable\n\npunctuation = set([\"!\", \"?\", \"…\", \",\", \".\", \"-\", \" \"])\nMETHODS = dict()\n\n\ndef get"
},
{
"path": "GPT_SoVITS/configs/.gitignore",
"chars": 6,
"preview": "*.yaml"
},
{
"path": "GPT_SoVITS/configs/s2.json",
"chars": 1581,
"preview": "{\n \"train\": {\n \"log_interval\": 100,\n \"eval_interval\": 500,\n \"seed\": 1234,\n \"epochs\": 100,\n \"learning_rat"
},
{
"path": "GPT_SoVITS/configs/s2v2Pro.json",
"chars": 1582,
"preview": "{\n \"train\": {\n \"log_interval\": 100,\n \"eval_interval\": 500,\n \"seed\": 1234,\n \"epochs\": 100,\n \"learning_rat"
},
{
"path": "GPT_SoVITS/configs/s2v2ProPlus.json",
"chars": 1582,
"preview": "{\n \"train\": {\n \"log_interval\": 100,\n \"eval_interval\": 500,\n \"seed\": 1234,\n \"epochs\": 100,\n \"learning_rat"
},
{
"path": "GPT_SoVITS/download.py",
"chars": 300,
"preview": "import os\nimport sys\n\nnow_dir = os.getcwd()\nsys.path.insert(0, now_dir)\nfrom text.g2pw import G2PWPinyin\n\ng2pw = G2PWPin"
},
{
"path": "GPT_SoVITS/eres2net/ERes2Net.py",
"chars": 9870,
"preview": "# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.\n# Licensed under the A"
},
{
"path": "GPT_SoVITS/eres2net/ERes2NetV2.py",
"chars": 9734,
"preview": "# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.\n# Licensed under the A"
},
{
"path": "GPT_SoVITS/eres2net/ERes2Net_huge.py",
"chars": 11082,
"preview": "# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.\n# Licensed under the A"
},
{
"path": "GPT_SoVITS/eres2net/fusion.py",
"chars": 948,
"preview": "# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.\n# Licensed under the A"
},
{
"path": "GPT_SoVITS/eres2net/kaldi.py",
"chars": 37310,
"preview": "import math\nfrom typing import Tuple\n\nimport torch\nimport torchaudio\nfrom torch import Tensor\n\n__all__ = [\n \"get_mel_"
},
{
"path": "GPT_SoVITS/eres2net/pooling_layers.py",
"chars": 3646,
"preview": "# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.\n# Licensed under the A"
},
{
"path": "GPT_SoVITS/export_torch_script.py",
"chars": 38686,
"preview": "# modified from https://github.com/yangdongchao/SoundStorm/blob/master/soundstorm/s1/AR/models/t2s_model.py\n# reference:"
},
{
"path": "GPT_SoVITS/export_torch_script_v3v4.py",
"chars": 42095,
"preview": "import os\nfrom export_torch_script import (\n T2SModel,\n get_raw_t2s_model,\n resamplex,\n spectrogram_torch,\n)"
},
{
"path": "GPT_SoVITS/f5_tts/model/__init__.py",
"chars": 453,
"preview": "# from f5_tts.model.cfm import CFM\n#\n# from f5_tts.model.backbones.unett import UNetT\nfrom GPT_SoVITS.f5_tts.model.backb"
},
{
"path": "GPT_SoVITS/f5_tts/model/backbones/README.md",
"chars": 701,
"preview": "## Backbones quick introduction\n\n\n### unett.py\n- flat unet transformer\n- structure same as in e2-tts & voicebox paper ex"
},
{
"path": "GPT_SoVITS/f5_tts/model/backbones/dit.py",
"chars": 6210,
"preview": "\"\"\"\nein notation:\nb - batch\nn - sequence\nnt - text sequence\nnw - raw wave length\nd - dimension\n\"\"\"\n\nfrom __future__ impo"
},
{
"path": "GPT_SoVITS/f5_tts/model/backbones/mmdit.py",
"chars": 4208,
"preview": "\"\"\"\nein notation:\nb - batch\nn - sequence\nnt - text sequence\nnw - raw wave length\nd - dimension\n\"\"\"\n\nfrom __future__ impo"
},
{
"path": "GPT_SoVITS/f5_tts/model/backbones/unett.py",
"chars": 6956,
"preview": "\"\"\"\nein notation:\nb - batch\nn - sequence\nnt - text sequence\nnw - raw wave length\nd - dimension\n\"\"\"\n\nfrom __future__ impo"
},
{
"path": "GPT_SoVITS/f5_tts/model/modules.py",
"chars": 22377,
"preview": "\"\"\"\nein notation:\nb - batch\nn - sequence\nnt - text sequence\nnw - raw wave length\nd - dimension\n\"\"\"\n\nfrom __future__ impo"
},
{
"path": "GPT_SoVITS/feature_extractor/__init__.py",
"chars": 105,
"preview": "from . import cnhubert, whisper_enc\n\ncontent_module_map = {\"cnhubert\": cnhubert, \"whisper\": whisper_enc}\n"
},
{
"path": "GPT_SoVITS/feature_extractor/cnhubert.py",
"chars": 3483,
"preview": "import torch\nimport os\nfrom transformers import logging as tf_logging\n\ntf_logging.set_verbosity_error()\n\nimport logging\n"
},
{
"path": "GPT_SoVITS/feature_extractor/whisper_enc.py",
"chars": 656,
"preview": "import torch\n\n\ndef get_model():\n import whisper\n\n model = whisper.load_model(\"small\", device=\"cpu\")\n\n return mo"
},
{
"path": "GPT_SoVITS/inference_cli.py",
"chars": 2637,
"preview": "import argparse\nimport os\nimport soundfile as sf\n\nfrom tools.i18n.i18n import I18nAuto\nfrom GPT_SoVITS.inference_webui i"
},
{
"path": "GPT_SoVITS/inference_gui.py",
"chars": 11616,
"preview": "import os\nimport sys\nfrom PyQt5.QtCore import QEvent\nfrom PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLin"
},
{
"path": "GPT_SoVITS/inference_webui.py",
"chars": 47169,
"preview": "\"\"\"\n按中英混合识别\n按日英混合识别\n多语种启动切分识别语种\n全部按中文识别\n全部按英文识别\n全部按日文识别\n\"\"\"\nimport psutil\nimport os\n\ndef set_high_priority():\n \"\"\"把当前"
},
{
"path": "GPT_SoVITS/inference_webui_fast.py",
"chars": 19137,
"preview": "\"\"\"\n按中英混合识别\n按日英混合识别\n多语种启动切分识别语种\n全部按中文识别\n全部按英文识别\n全部按日文识别\n\"\"\"\nimport psutil\nimport os\n\ndef set_high_priority():\n \"\"\"把当前"
},
{
"path": "GPT_SoVITS/module/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "GPT_SoVITS/module/attentions.py",
"chars": 23085,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom module import commons\nfrom modu"
},
{
"path": "GPT_SoVITS/module/attentions_onnx.py",
"chars": 14149,
"preview": "import math\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom module import commons\n\nfrom typ"
},
{
"path": "GPT_SoVITS/module/commons.py",
"chars": 5824,
"preview": "import math\nimport torch\nfrom torch.nn import functional as F\n\n\ndef init_weights(m, mean=0.0, std=0.01):\n classname ="
},
{
"path": "GPT_SoVITS/module/core_vq.py",
"chars": 15116,
"preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the l"
},
{
"path": "GPT_SoVITS/module/data_utils.py",
"chars": 40858,
"preview": "import os\nimport random\nimport traceback\nimport torch\nimport torch.utils.data\nfrom tqdm import tqdm\n\nfrom module.mel_pro"
},
{
"path": "GPT_SoVITS/module/ddp_utils.py",
"chars": 9033,
"preview": "import torch\nfrom torch.nn.parallel import DistributedDataParallel\nfrom torch.nn.parallel.distributed import _find_tenso"
},
{
"path": "GPT_SoVITS/module/distrib.py",
"chars": 4044,
"preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the l"
},
{
"path": "GPT_SoVITS/module/losses.py",
"chars": 1827,
"preview": "import math\n\nimport torch\n\n\ndef feature_loss(fmap_r, fmap_g):\n loss = 0\n for dr, dg in zip(fmap_r, fmap_g):\n "
},
{
"path": "GPT_SoVITS/module/mel_processing.py",
"chars": 4642,
"preview": "import torch\nimport torch.utils.data\nfrom librosa.filters import mel as librosa_mel_fn\n\nMAX_WAV_VALUE = 32768.0\n\n\ndef dy"
},
{
"path": "GPT_SoVITS/module/models.py",
"chars": 56558,
"preview": "import warnings\n\nwarnings.filterwarnings(\"ignore\")\nimport math\n\nimport torch\nfrom torch import nn\nfrom torch.nn import f"
},
{
"path": "GPT_SoVITS/module/models_onnx.py",
"chars": 37794,
"preview": "import math\nfrom typing import Optional\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom mod"
},
{
"path": "GPT_SoVITS/module/modules.py",
"chars": 28816,
"preview": "import math\n\nimport numpy as np\nimport torch\nfrom torch import nn\nfrom torch.nn import functional as F\n\nfrom torch.nn im"
},
{
"path": "GPT_SoVITS/module/mrte_model.py",
"chars": 5825,
"preview": "# This is Multi-reference timbre encoder\n\nimport torch\nfrom torch import nn\nfrom torch.nn.utils import remove_weight_nor"
},
{
"path": "GPT_SoVITS/module/quantize.py",
"chars": 4456,
"preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\n#\n# This source code is licensed under the l"
},
{
"path": "GPT_SoVITS/module/transforms.py",
"chars": 7208,
"preview": "import torch\nfrom torch.nn import functional as F\n\nimport numpy as np\n\n\nDEFAULT_MIN_BIN_WIDTH = 1e-3\nDEFAULT_MIN_BIN_HEI"
},
{
"path": "GPT_SoVITS/onnx_export.py",
"chars": 14407,
"preview": "import torch\r\nimport torchaudio\r\nfrom AR.models.t2s_lightning_module_onnx import Text2SemanticLightningModule\r\nfrom feat"
},
{
"path": "GPT_SoVITS/prepare_datasets/1-get-text.py",
"chars": 5249,
"preview": "# -*- coding: utf-8 -*-\r\n\r\nimport os\r\n\r\ninp_text = os.environ.get(\"inp_text\")\r\ninp_wav_dir = os.environ.get(\"inp_wav_dir"
},
{
"path": "GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py",
"chars": 4256,
"preview": "# -*- coding: utf-8 -*-\r\n\r\nimport sys\r\nimport os\r\n\r\ninp_text = os.environ.get(\"inp_text\")\r\ninp_wav_dir = os.environ.get("
},
{
"path": "GPT_SoVITS/prepare_datasets/2-get-sv.py",
"chars": 3653,
"preview": "# -*- coding: utf-8 -*-\r\n\r\nimport sys\r\nimport os\r\n\r\ninp_text = os.environ.get(\"inp_text\")\r\ninp_wav_dir = os.environ.get("
},
{
"path": "GPT_SoVITS/prepare_datasets/3-get-semantic.py",
"chars": 3893,
"preview": "import os\r\n\r\ninp_text = os.environ.get(\"inp_text\")\r\nexp_name = os.environ.get(\"exp_name\")\r\ni_part = os.environ.get(\"i_pa"
},
{
"path": "GPT_SoVITS/pretrained_models/.gitignore",
"chars": 13,
"preview": "*\n!.gitignore"
},
{
"path": "GPT_SoVITS/process_ckpt.py",
"chars": 4193,
"preview": "import traceback\r\nfrom collections import OrderedDict\r\nfrom time import time as ttime\r\nimport shutil\r\nimport os\r\nimport "
},
{
"path": "GPT_SoVITS/s1_train.py",
"chars": 7115,
"preview": "# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/train_t2s.py\nimport os\n\nif \"_CUDA_VISIBLE_"
},
{
"path": "GPT_SoVITS/s2_train.py",
"chars": 24357,
"preview": "import warnings\n\nwarnings.filterwarnings(\"ignore\")\nimport os\n\nimport utils\n\nhps = utils.get_hparams(stage=2)\nos.environ["
},
{
"path": "GPT_SoVITS/s2_train_v3.py",
"chars": 16075,
"preview": "import warnings\n\nwarnings.filterwarnings(\"ignore\")\nimport os\n\nimport utils\n\nhps = utils.get_hparams(stage=2)\nos.environ["
},
{
"path": "GPT_SoVITS/s2_train_v3_lora.py",
"chars": 12245,
"preview": "import warnings\n\nwarnings.filterwarnings(\"ignore\")\nimport os\n\nimport utils\n\nhps = utils.get_hparams(stage=2)\nos.environ["
},
{
"path": "GPT_SoVITS/stream_v2pro.py",
"chars": 20579,
"preview": "# 这是一个实验性质的实现,旨在探索 stream infer 的可能性。(xiao hai xie zhe wan de)\nfrom typing import List\nfrom export_torch_script import E"
},
{
"path": "GPT_SoVITS/sv.py",
"chars": 1201,
"preview": "import sys\r\nimport os\r\nimport torch\r\n\r\nsys.path.append(f\"{os.getcwd()}/GPT_SoVITS/eres2net\")\r\nsv_path = \"GPT_SoVITS/pret"
},
{
"path": "GPT_SoVITS/text/.gitignore",
"chars": 27,
"preview": "G2PWModel\n__pycache__\n*.zip"
},
{
"path": "GPT_SoVITS/text/LangSegmenter/__init__.py",
"chars": 41,
"preview": "from .langsegmenter import LangSegmenter\n"
},
{
"path": "GPT_SoVITS/text/LangSegmenter/langsegmenter.py",
"chars": 8253,
"preview": "import logging\nimport re\n\n# jieba静音\nimport jieba\njieba.setLogLevel(logging.CRITICAL)\n\n# 更改fast_langdetect大模型位置\nfrom path"
},
{
"path": "GPT_SoVITS/text/__init__.py",
"chars": 917,
"preview": "import os\n# if os.environ.get(\"version\",\"v1\")==\"v1\":\n# from text.symbols import symbols\n# else:\n# from text.symbols2"
},
{
"path": "GPT_SoVITS/text/cantonese.py",
"chars": 5671,
"preview": "# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py\r\n\r\nimport re\r\n"
},
{
"path": "GPT_SoVITS/text/chinese.py",
"chars": 5757,
"preview": "import os\nimport re\n\nimport cn2an\nfrom pypinyin import lazy_pinyin, Style\n\nfrom text.symbols import punctuation\nfrom tex"
},
{
"path": "GPT_SoVITS/text/chinese2.py",
"chars": 9610,
"preview": "import os\nimport re\n\nimport cn2an\nfrom pypinyin import lazy_pinyin, Style\nfrom pypinyin.contrib.tone_convert import to_f"
},
{
"path": "GPT_SoVITS/text/cleaner.py",
"chars": 3268,
"preview": "from text import cleaned_text_to_sequence\nimport os\n# if os.environ.get(\"version\",\"v1\")==\"v1\":\n# from text import ch"
},
{
"path": "GPT_SoVITS/text/cmudict-fast.rep",
"chars": 3613898,
"preview": "'BOUT B AW1 T\n'CAUSE K AH0 Z\n'COURSE K AO1 R S\n'CUSE K Y UW1 Z\n'EM AH0 M\n'FRISCO F R IH1 S K OW0\n'GAIN G EH1 N\n'KAY K EY"
},
{
"path": "GPT_SoVITS/text/cmudict.rep",
"chars": 3731281,
"preview": ";;; # CMUdict -- Major Version: 0.07\n;;; \n;;; # $HeadURL: https://svn.code.sf.net/p/cmusphinx/code/branches/cmudict/cm"
},
{
"path": "GPT_SoVITS/text/en_normalization/expend.py",
"chars": 9603,
"preview": "# by https://github.com/Cosmo-klara\n\nfrom __future__ import print_function\n\nimport re\nimport inflect\nimport unicodedata\n"
},
{
"path": "GPT_SoVITS/text/engdict-hot.rep",
"chars": 75,
"preview": "CHATGPT CH AE1 T JH IY1 P IY1 T IY1\nJSON JH EY1 S AH0 N\nCONDA K AA1 N D AH0"
},
{
"path": "GPT_SoVITS/text/english.py",
"chars": 9283,
"preview": "import pickle\nimport os\nimport re\nimport wordsegment\nfrom g2p_en import G2p\n\nfrom text.symbols import punctuation\n\nfrom "
},
{
"path": "GPT_SoVITS/text/g2pw/__init__.py",
"chars": 29,
"preview": "from text.g2pw.g2pw import *\n"
},
{
"path": "GPT_SoVITS/text/g2pw/dataset.py",
"chars": 5793,
"preview": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/g2pw/g2pw.py",
"chars": 4930,
"preview": "# This code is modified from https://github.com/mozillazg/pypinyin-g2pW\n\nimport pickle\nimport os\n\nfrom pypinyin.constant"
},
{
"path": "GPT_SoVITS/text/g2pw/onnx_api.py",
"chars": 9599,
"preview": "# This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw\n# "
},
{
"path": "GPT_SoVITS/text/g2pw/polyphonic-fix.rep",
"chars": 1385749,
"preview": "一丁不识: ['yi1', 'ding1', 'bu4', 'shi2']\n一不小心: ['yi2', 'bu4', 'xiao3', 'xin1']\n一不扭众: ['yi1', 'bu4', 'niu3', 'zhong4']\n一专多能:"
},
{
"path": "GPT_SoVITS/text/g2pw/polyphonic.rep",
"chars": 1159,
"preview": "湖泊: ['hu2','po1']\n地壳: ['di4','qiao4']\n柏树: ['bai3','shu4']\n曝光: ['bao4','guang1']\n弹力: ['tan2','li4']\n字帖: ['zi4','tie4']\n口吃"
},
{
"path": "GPT_SoVITS/text/g2pw/utils.py",
"chars": 4828,
"preview": "# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/japanese.py",
"chars": 9541,
"preview": "# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py\nimport re\nimport os\nimport hashlib\n\ntry:\n"
},
{
"path": "GPT_SoVITS/text/korean.py",
"chars": 10273,
"preview": "# reference: https://github.com/ORI-Muchim/MB-iSTFT-VITS-Korean/blob/main/text/korean.py\r\n\r\nimport re\r\nfrom jamo import "
},
{
"path": "GPT_SoVITS/text/opencpop-strict.txt",
"chars": 4084,
"preview": "a\tAA a\nai\tAA ai\nan\tAA an\nang\tAA ang\nao\tAA ao\nba\tb a\nbai\tb ai\nban\tb an\nbang\tb ang\nbao\tb ao\nbei\tb ei\nben\tb en\nbeng\tb eng\nb"
},
{
"path": "GPT_SoVITS/text/symbols.py",
"chars": 4464,
"preview": "# punctuation = ['!', '?', '…', \",\", \".\",\"@\"]#@是SP停顿\npunctuation = [\"!\", \"?\", \"…\", \",\", \".\"] # @是SP停顿\npunctuation.appen"
},
{
"path": "GPT_SoVITS/text/symbols2.py",
"chars": 9512,
"preview": "# punctuation = ['!', '?', '…', \",\", \".\",\"@\"]#@是SP停顿\npunctuation = [\"!\", \"?\", \"…\", \",\", \".\"] # @是SP停顿\npunctuation.appen"
},
{
"path": "GPT_SoVITS/text/tone_sandhi.py",
"chars": 21595,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/README.md",
"chars": 762,
"preview": "## Supported NSW (Non-Standard-Word) Normalization\n\n|NSW type|raw|normalized|\n|:--|:-|:-|\n|serial number|电影中梁朝伟扮演的陈永仁的编号"
},
{
"path": "GPT_SoVITS/text/zh_normalization/__init__.py",
"chars": 664,
"preview": "# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/char_convert.py",
"chars": 23057,
"preview": "# coding=utf-8\n# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Ve"
},
{
"path": "GPT_SoVITS/text/zh_normalization/chronology.py",
"chars": 3350,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/constants.py",
"chars": 2099,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/num.py",
"chars": 8662,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/phonecode.py",
"chars": 1831,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/quantifier.py",
"chars": 1643,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/text/zh_normalization/text_normlization.py",
"chars": 7272,
"preview": "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the "
},
{
"path": "GPT_SoVITS/utils.py",
"chars": 10951,
"preview": "import argparse\nimport glob\nimport json\nimport logging\nimport os\nimport subprocess\nimport sys\nimport traceback\n\nimport l"
},
{
"path": "LICENSE",
"chars": 1065,
"preview": "MIT License\n\nCopyright (c) 2024 RVC-Boss\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\no"
},
{
"path": "README.md",
"chars": 20791,
"preview": "<div align=\"center\">\n\n<h1>GPT-SoVITS-WebUI</h1>\nA Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>\n\n"
},
{
"path": "api.py",
"chars": 43658,
"preview": "\"\"\"\r\n# api.py usage\r\n\r\n` python api.py -dr \"123.wav\" -dt \"一二三。\" -dl \"zh\" `\r\n\r\n## 执行参数:\r\n\r\n`-s` - `SoVITS模型路径, 可在 config."
},
{
"path": "api_v2.py",
"chars": 21623,
"preview": "\"\"\"\n# WebAPI文档\n\n` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml `\n\n## 执行参数:\n `-a` - `绑定地"
},
{
"path": "config.py",
"chars": 7479,
"preview": "import os\r\nimport re\r\nimport sys\r\n\r\nimport torch\r\n\r\nfrom tools.i18n.i18n import I18nAuto\r\n\r\ni18n = I18nAuto(language=os."
},
{
"path": "docker-compose.yaml",
"chars": 1885,
"preview": "version: \"3.8\"\n\nservices:\n GPT-SoVITS-CU126:\n image: xxxxrt666/gpt-sovits:latest-cu126\n container_name: GPT-SoVIT"
},
{
"path": "docker_build.sh",
"chars": 1826,
"preview": "#!/bin/bash\n\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" &>/dev/null && pwd)\"\n\ncd \"$SCRIPT_DIR\" || exit 1\n\nset -e\n"
},
{
"path": "docs/cn/Changelog_CN.md",
"chars": 28056,
"preview": "# 更新日志\n\n## 202401\n\n- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)\n - 内容: WebUI 增加英文系统英文翻译适配.\n "
},
{
"path": "docs/cn/README.md",
"chars": 15030,
"preview": "<div align=\"center\">\n\n<h1>GPT-SoVITS-WebUI</h1>\n强大的少样本语音转换与语音合成Web用户界面.<br><br>\n\n[\n - Content: Added Englis"
},
{
"path": "docs/ja/Changelog_JA.md",
"chars": 27620,
"preview": "# 更新履歴\n\n## 202401\n\n- 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)\n - 内容: WebUIに英語システム翻訳サポートを追加。"
},
{
"path": "docs/ja/README.md",
"chars": 15910,
"preview": "<div align=\"center\">\n\n<h1>GPT-SoVITS-WebUI</h1>\nパワフルなFew-Shot音声変換・音声合成 WebUI.<br><br>\n\n[\n - 내용: WebUI에 영어 시스템 번역 지원 추"
},
{
"path": "docs/ko/README.md",
"chars": 16097,
"preview": "<div align=\"center\">\n\n<h1>GPT-SoVITS-WebUI</h1>\n소량의 데이터로 음성 변환 및 음성 합성을 지원하는 강력한 WebUI.<br><br>\n\n[\n - İ"
}
]
// ... and 56 more files
About this extraction
This document contains the full source code of the RVC-Boss/GPT-SoVITS GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction covers 256 files (26.8 MB, approximately 2.7M tokens) and includes a symbol index of 1694 extracted functions, classes, methods, constants, and types.
Extracted by GitExtract, a GitHub-repository-to-text converter built by Nikandr Surkov.