Repository: voicepaw/so-vits-svc-fork Branch: main Commit: 5dfcf10a242f Files: 100 Total size: 465.0 KB Directory structure: gitextract_fwmtssbt/ ├── .all-contributorsrc ├── .codespellrc ├── .copier-answers.yml ├── .dockerignore ├── .editorconfig ├── .flake8 ├── .github/ │ ├── CODE_OF_CONDUCT.md │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── 1-bug-report.yml │ │ ├── 1-bug_report.yml │ │ ├── 2-feature-request.yml │ │ └── config.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── labels.toml │ └── workflows/ │ ├── ci.yml │ ├── hacktoberfest.yml │ ├── issue-manager.yml │ ├── labels.yml │ ├── poetry-upgrade.yml │ └── upgrader.yml ├── .gitignore ├── .gitpod.yml ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── README_zh_CN.md ├── commitlint.config.js ├── commitlint.config.mjs ├── docs/ │ ├── Makefile │ ├── _static/ │ │ └── .gitkeep │ ├── changelog.md │ ├── conf.py │ ├── contributing.md │ ├── index.md │ ├── installation.md │ ├── make.bat │ └── usage.md ├── easy-installation/ │ ├── install-cn.bat │ └── install.bat ├── flake.nix ├── notebooks/ │ └── so-vits-svc-fork-4.0.ipynb ├── pyproject.toml ├── renovate.json ├── setup.py ├── src/ │ └── so_vits_svc_fork/ │ ├── __init__.py │ ├── __main__.py │ ├── cluster/ │ │ ├── __init__.py │ │ └── train_cluster.py │ ├── dataset.py │ ├── default_gui_presets.json │ ├── f0.py │ ├── gui.py │ ├── hparams.py │ ├── inference/ │ │ ├── __init__.py │ │ ├── core.py │ │ └── main.py │ ├── logger.py │ ├── modules/ │ │ ├── __init__.py │ │ ├── attentions.py │ │ ├── commons.py │ │ ├── decoders/ │ │ │ ├── __init__.py │ │ │ ├── f0.py │ │ │ ├── hifigan/ │ │ │ │ ├── __init__.py │ │ │ │ ├── _models.py │ │ │ │ └── _utils.py │ │ │ └── mb_istft/ │ │ │ ├── __init__.py │ │ │ ├── _generators.py │ │ │ ├── _loss.py │ │ │ ├── _pqmf.py │ │ │ ├── _stft.py │ │ │ └── _stft_loss.py │ │ ├── descriminators.py │ │ ├── encoders.py │ │ ├── flows.py │ │ ├── losses.py │ │ ├── mel_processing.py │ │ ├── 
modules.py │ │ └── synthesizers.py │ ├── preprocessing/ │ │ ├── __init__.py │ │ ├── config_templates/ │ │ │ ├── __init__.py │ │ │ ├── quickvc.json │ │ │ ├── so-vits-svc-4.0v1-legacy.json │ │ │ └── so-vits-svc-4.0v1.json │ │ ├── preprocess_classify.py │ │ ├── preprocess_flist_config.py │ │ ├── preprocess_hubert_f0.py │ │ ├── preprocess_resample.py │ │ ├── preprocess_speaker_diarization.py │ │ ├── preprocess_split.py │ │ └── preprocess_utils.py │ ├── py.typed │ ├── train.py │ └── utils.py ├── templates/ │ └── CHANGELOG.md.j2 └── tests/ ├── __init__.py └── test_main.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .all-contributorsrc ================================================ { "projectName": "so-vits-svc-fork", "projectOwner": "voicepaw", "repoType": "github", "repoHost": "https://github.com", "files": ["README.md"], "imageSize": 80, "commit": true, "commitConvention": "angular", "contributors": [ { "login": "34j", "name": "34j", "avatar_url": "https://avatars.githubusercontent.com/u/55338215?v=4", "profile": "https://github.com/34j", "contributions": [ "code", "ideas", "doc", "example", "infra", "maintenance", "review", "test", "tutorial", "promotion", "bug" ] }, { "login": "GarrettConway", "name": "GarrettConway", "avatar_url": "https://avatars.githubusercontent.com/u/22782004?v=4", "profile": "https://github.com/GarrettConway", "contributions": ["code", "bug", "doc", "review"] }, { "login": "BlueAmulet", "name": "BlueAmulet", "avatar_url": "https://avatars.githubusercontent.com/u/43395286?v=4", "profile": "https://github.com/BlueAmulet", "contributions": ["ideas", "question", "code", "maintenance"] }, { "login": "ThrowawayAccount01", "name": "ThrowawayAccount01", "avatar_url": "https://avatars.githubusercontent.com/u/125531852?v=4", "profile": "https://github.com/ThrowawayAccount01", "contributions": ["bug"] }, { "login": "MashiroSA", 
"name": "緋", "avatar_url": "https://avatars.githubusercontent.com/u/40637516?v=4", "profile": "https://github.com/MashiroSA", "contributions": ["doc", "bug"] }, { "login": "Lordmau5", "name": "Lordmau5", "avatar_url": "https://avatars.githubusercontent.com/u/1345036?v=4", "profile": "https://github.com/Lordmau5", "contributions": [ "bug", "code", "ideas", "maintenance", "question", "userTesting" ] }, { "login": "DL909", "name": "DL909", "avatar_url": "https://avatars.githubusercontent.com/u/71912115?v=4", "profile": "https://github.com/DL909", "contributions": ["bug"] }, { "login": "Satisfy256", "name": "Satisfy256", "avatar_url": "https://avatars.githubusercontent.com/u/101394399?v=4", "profile": "https://github.com/Satisfy256", "contributions": ["bug"] }, { "login": "pierluigizagaria", "name": "Pierluigi Zagaria", "avatar_url": "https://avatars.githubusercontent.com/u/57801386?v=4", "profile": "https://github.com/pierluigizagaria", "contributions": ["userTesting"] }, { "login": "ruckusmattster", "name": "ruckusmattster", "avatar_url": "https://avatars.githubusercontent.com/u/77196088?v=4", "profile": "https://github.com/ruckusmattster", "contributions": ["bug"] }, { "login": "Desuka-art", "name": "Desuka-art", "avatar_url": "https://avatars.githubusercontent.com/u/111822082?v=4", "profile": "https://github.com/Desuka-art", "contributions": ["bug"] }, { "login": "heyfixit", "name": "heyfixit", "avatar_url": "https://avatars.githubusercontent.com/u/41658450?v=4", "profile": "https://github.com/heyfixit", "contributions": ["doc"] }, { "login": "nerdyrodent", "name": "Nerdy Rodent", "avatar_url": "https://avatars.githubusercontent.com/u/74688049?v=4", "profile": "https://www.youtube.com/c/NerdyRodent", "contributions": ["video"] }, { "login": "xieyumc", "name": "谢宇", "avatar_url": "https://avatars.githubusercontent.com/u/47858007?v=4", "profile": "https://github.com/xieyumc", "contributions": ["doc"] }, { "login": "ColdCawfee", "name": "ColdCawfee", "avatar_url": 
"https://avatars.githubusercontent.com/u/79474598?v=4", "profile": "https://github.com/ColdCawfee", "contributions": ["bug"] }, { "login": "sbersier", "name": "sbersier", "avatar_url": "https://avatars.githubusercontent.com/u/34165937?v=4", "profile": "https://github.com/sbersier", "contributions": ["ideas", "userTesting", "bug"] }, { "login": "Meldoner", "name": "Meldoner", "avatar_url": "https://avatars.githubusercontent.com/u/43951115?v=4", "profile": "https://github.com/Meldoner", "contributions": ["bug", "ideas", "code"] }, { "login": "mmodeusher", "name": "mmodeusher", "avatar_url": "https://avatars.githubusercontent.com/u/46575920?v=4", "profile": "https://github.com/mmodeusher", "contributions": ["bug"] }, { "login": "AlonDan", "name": "AlonDan", "avatar_url": "https://avatars.githubusercontent.com/u/21152334?v=4", "profile": "https://github.com/AlonDan", "contributions": ["bug"] }, { "login": "Likkkez", "name": "Likkkez", "avatar_url": "https://avatars.githubusercontent.com/u/44336181?v=4", "profile": "https://github.com/Likkkez", "contributions": ["bug"] }, { "login": "DuctTapeGames", "name": "Duct Tape Games", "avatar_url": "https://avatars.githubusercontent.com/u/84365142?v=4", "profile": "https://github.com/DuctTapeGames", "contributions": ["bug"] }, { "login": "hxl9654", "name": "Xianglong He", "avatar_url": "https://avatars.githubusercontent.com/u/6624983?v=4", "profile": "https://tec.hxlxz.com/", "contributions": ["bug"] }, { "login": "75aosu", "name": "75aosu", "avatar_url": "https://avatars.githubusercontent.com/u/79185331?v=4", "profile": "https://github.com/75aosu", "contributions": ["bug"] }, { "login": "tonyco82", "name": "tonyco82", "avatar_url": "https://avatars.githubusercontent.com/u/56610534?v=4", "profile": "https://github.com/tonyco82", "contributions": ["bug"] }, { "login": "yxlllc", "name": "yxlllc", "avatar_url": "https://avatars.githubusercontent.com/u/33565655?v=4", "profile": "https://github.com/yxlllc", "contributions": ["ideas", 
"code"] }, { "login": "outhipped", "name": "outhipped", "avatar_url": "https://avatars.githubusercontent.com/u/116147475?v=4", "profile": "https://github.com/outhipped", "contributions": ["bug"] }, { "login": "escoolioinglesias", "name": "escoolioinglesias", "avatar_url": "https://avatars.githubusercontent.com/u/73505402?v=4", "profile": "https://github.com/escoolioinglesias", "contributions": ["bug", "userTesting", "video"] }, { "login": "Blacksingh", "name": "Blacksingh", "avatar_url": "https://avatars.githubusercontent.com/u/130872856?v=4", "profile": "https://github.com/Blacksingh", "contributions": ["bug"] }, { "login": "tybantarnusa", "name": "Mgs. M. Thoyib Antarnusa", "avatar_url": "https://avatars.githubusercontent.com/u/9532857?v=4", "profile": "http://tybantarnusa.com", "contributions": ["bug"] }, { "login": "ZeroHackz", "name": "Exosfeer", "avatar_url": "https://avatars.githubusercontent.com/u/15729496?v=4", "profile": "https://github.com/ZeroHackz", "contributions": ["bug", "code"] }, { "login": "guranon", "name": "guranon", "avatar_url": "https://avatars.githubusercontent.com/u/130421189?v=4", "profile": "https://github.com/guranon", "contributions": ["bug", "ideas", "code"] }, { "login": "alexanderkoumis", "name": "Alexander Koumis", "avatar_url": "https://avatars.githubusercontent.com/u/5108856?v=4", "profile": "https://github.com/alexanderkoumis", "contributions": ["code"] }, { "login": "acekagami", "name": "acekagami", "avatar_url": "https://avatars.githubusercontent.com/u/127201056?v=4", "profile": "https://github.com/acekagami", "contributions": ["translation"] }, { "login": "Highupech", "name": "Highupech", "avatar_url": "https://avatars.githubusercontent.com/u/114140670?v=4", "profile": "https://github.com/Highupech", "contributions": ["bug"] }, { "login": "Scorpi", "name": "Scorpi", "avatar_url": "https://avatars.githubusercontent.com/u/969654?v=4", "profile": "https://github.com/Scorpi", "contributions": ["code"] }, { "login": "maximxlss", 
"name": "Maximxls", "avatar_url": "https://avatars.githubusercontent.com/u/29152154?v=4", "profile": "http://maximxlss.github.io", "contributions": ["code"] }, { "login": "Star3Lord", "name": "Star3Lord", "avatar_url": "https://avatars.githubusercontent.com/u/57606931?v=4", "profile": "https://github.com/Star3Lord", "contributions": ["bug", "code"] }, { "login": "Ph0rk0z", "name": "Forkoz", "avatar_url": "https://avatars.githubusercontent.com/u/59298527?v=4", "profile": "https://github.com/Ph0rk0z", "contributions": ["bug", "code"] }, { "login": "Zerui18", "name": "Zerui Chen", "avatar_url": "https://avatars.githubusercontent.com/u/34794550?v=4", "profile": "https://github.com/Zerui18", "contributions": ["code", "ideas"] }, { "login": "shenberg", "name": "Roee Shenberg", "avatar_url": "https://avatars.githubusercontent.com/u/653972?v=4", "profile": "https://www.meimadix.com", "contributions": ["userTesting", "ideas", "code"] }, { "login": "ShinyJustyZ", "name": "Justas", "avatar_url": "https://avatars.githubusercontent.com/u/65282440?v=4", "profile": "https://github.com/ShinyJustyZ", "contributions": ["bug", "code"] }, { "login": "Onako2", "name": "Onako2", "avatar_url": "https://avatars.githubusercontent.com/u/79749977?v=4", "profile": "https://onako2.github.io/", "contributions": ["doc"] }, { "login": "4ll0w3v1l", "name": "4ll0w3v1l", "avatar_url": "https://avatars.githubusercontent.com/u/53517147?v=4", "profile": "https://github.com/4ll0w3v1l", "contributions": ["code"] }, { "login": "SamuelSwartzberg", "name": "j5y0V6b", "avatar_url": "https://avatars.githubusercontent.com/u/16353439?v=4", "profile": "https://github.com/SamuelSwartzberg", "contributions": ["security"] }, { "login": "marcellocirelli", "name": "marcellocirelli", "avatar_url": "https://avatars.githubusercontent.com/u/51972090?v=4", "profile": "https://github.com/marcellocirelli", "contributions": ["bug"] }, { "login": "Priyanshu-hawk", "name": "Priyanshu Patel", "avatar_url": 
"https://avatars.githubusercontent.com/u/76026651?v=4", "profile": "https://github.com/Priyanshu-hawk", "contributions": ["code"] }, { "login": "annagorshunova", "name": "Anna Gorshunova", "avatar_url": "https://avatars.githubusercontent.com/u/5199204?v=4", "profile": "https://github.com/annagorshunova", "contributions": ["bug", "code"] } ], "contributorsPerLine": 7, "skipCi": true, "commitType": "docs" } ================================================ FILE: .codespellrc ================================================ [codespell] ignore-words-list = socio-economic ================================================ FILE: .copier-answers.yml ================================================ # Changes here will be overwritten by Copier _commit: 2e4f7d0 _src_path: gh:34j/pypackage-template copyright_year: '2023' documentation: true email: 34j.95a2p@simplelogin.com full_name: 34j github_username: voicepaw has_cli: false initial_commit: false is_django_package: false open_source_license: MIT open_with_editor: false package_name: so_vits_svc_fork project_name: SoftVC VITS Singing Voice Conversion Fork project_short_description: A fork of so-vits-svc. 
project_slug: so-vits-svc-fork run_uv_sync: false setup_pre_commit: false ================================================ FILE: .dockerignore ================================================ # Ignore everything * ================================================ FILE: .editorconfig ================================================ # http://editorconfig.org root = true [*] indent_style = space indent_size = 4 trim_trailing_whitespace = true insert_final_newline = true charset = utf-8 end_of_line = lf [*.bat] indent_style = tab end_of_line = crlf [LICENSE] insert_final_newline = false [Makefile] indent_style = tab ================================================ FILE: .flake8 ================================================ [flake8] exclude = docs max-line-length = 88 ignore = E203, E501, E741, E402, E712, W503, E731, E711, E226 ================================================ FILE: .github/CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
## Our Standards Examples of behavior that contributes to a positive environment for our community include: - Demonstrating empathy and kindness toward other people - Being respectful of differing opinions, viewpoints, and experiences - Giving and gracefully accepting constructive feedback - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience - Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: - The use of sexualized language or imagery, and sexual attention or advances of any kind - Trolling, insulting or derogatory comments, and personal or political attacks - Public or private harassment - Publishing others' private information, such as a physical or email address, without their explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting @voicepaw. 
All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. 
**Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.0, available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see the FAQ at https://www.contributor-covenant.org/faq. Translations are available at https://www.contributor-covenant.org/translations. ================================================ FILE: .github/FUNDING.yml ================================================ github: ["voicepaw"] ================================================ FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml ================================================ name: Bug report description: Create a report to help us improve labels: [bug] body: - type: textarea id: description attributes: label: Describe the bug description: A clear and concise description of what the bug is. placeholder: Describe the bug validations: required: true - type: textarea id: reproduce attributes: label: To Reproduce description: Steps to reproduce the behavior. placeholder: To Reproduce validations: required: true - type: textarea id: context attributes: label: Additional context description: Add any other context about the problem here. placeholder: Additional context - type: input id: version attributes: label: Version description: Version of the project. placeholder: Version validations: required: true - type: input id: platform attributes: label: Platform description: Platform where the bug was found. 
placeholder: "Example: Windows 11 / macOS 12.0.1 / Ubuntu 20.04" validations: required: true - type: checkboxes id: terms attributes: label: Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/voicepaw/so-vits-svc-fork/blob/main/.github/CODE_OF_CONDUCT.md). options: - label: I agree to follow this project's Code of Conduct. required: true - type: checkboxes id: no-duplicate attributes: label: No Duplicate description: Please check [existing issues](https://github.com/voicepaw/so-vits-svc-fork/issues) to avoid duplicates. options: - label: I have checked existing issues to avoid duplicates. required: true - type: markdown attributes: value: 👋 Have a great day and thank you for the bug report! ================================================ FILE: .github/ISSUE_TEMPLATE/1-bug_report.yml ================================================ name: Bug report description: Create a report to help us improve labels: [bug] body: - type: textarea id: description attributes: label: Describe the bug description: A clear and concise description of what the bug is. placeholder: Describe the bug validations: required: true - type: textarea id: reproduce attributes: label: To Reproduce description: Steps to reproduce the behavior. placeholder: To Reproduce validations: required: true - type: textarea id: context attributes: label: Additional context description: Add any other context about the problem here. placeholder: Additional context - type: input id: version attributes: label: Version description: Version of the project. placeholder: Version validations: required: true - type: input id: platform attributes: label: Platform description: Platform where the bug was found. 
placeholder: "Example: Windows 11 / macOS 12.0.1 / Ubuntu 20.04" validations: required: true - type: checkboxes id: terms attributes: label: Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/voicepaw/so-vits-svc-fork/blob/main/.github/CODE_OF_CONDUCT.md). options: - label: I agree to follow this project's Code of Conduct. required: true - type: checkboxes id: no-duplicate attributes: label: No Duplicate description: Please check [existing issues](https://github.com/voicepaw/so-vits-svc-fork/issues) to avoid duplicates. options: - label: I have checked existing issues to avoid duplicates. required: true ================================================ FILE: .github/ISSUE_TEMPLATE/2-feature-request.yml ================================================ name: Feature request description: Suggest an idea for this project labels: [enhancement] body: - type: textarea id: description attributes: label: Is your feature request related to a problem? Please describe. description: A clear and concise description of what the problem is. value: I'm always frustrated when validations: required: true - type: textarea id: solution attributes: label: Describe alternatives you've considered description: A clear and concise description of any alternative solutions or features you've considered. placeholder: Describe alternatives you've considered validations: required: true - type: textarea id: context attributes: label: Additional context description: Add any other context or screenshots about the feature request here. placeholder: Additional context - type: checkboxes id: terms attributes: label: Code of Conduct description: By submitting this issue, you agree to follow our [Code of Conduct](https://github.com/voicepaw/so-vits-svc-fork/blob/main/.github/CODE_OF_CONDUCT.md). 
options: - label: I agree to follow this project's Code of Conduct required: true - type: checkboxes id: willing attributes: label: Are you willing to resolve this issue by submitting a Pull Request? description: Remember that first-time contributors are welcome! 🙌 options: - label: Yes, I have the time, and I know how to start. - label: Yes, I have the time, but I don't know how to start. I would need guidance. - label: No, I don't have the time, although I believe I could do it if I had the time... - label: No, I don't have the time and I wouldn't even know how to start. - type: markdown attributes: value: 👋 Have a great day and thank you for the feature request! ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ # Disabling blank issues to ensure all necessary information is provided # Users should use the provided templates for specific issues # For general questions, please refer to the contact links section blank_issues_enabled: false contact_links: - name: Questions url: https://github.com/voicepaw/so-vits-svc-fork/discussions/categories/q-a about: Please ask and answer questions here. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ### Description of change ### Pull-Request Checklist - [ ] Code is up-to-date with the `main` branch - [ ] This pull request follows the [contributing guidelines](https://github.com/voicepaw/so-vits-svc-fork/blob/main/CONTRIBUTING.md). - [ ] This pull request links relevant issues as `Fixes #0000` - [ ] There are new or updated unit tests validating the change - [ ] Documentation has been updated to reflect this change - [ ] The new commits follow conventions outlined in the [conventional commit spec](https://www.conventionalcommits.org/en/v1.0.0/), such as "fix(api): prevent racing of requests". 
> - If pre-commit.ci is failing, try `pre-commit run -a` for further information. > - If CI / test is failing, try `uv run pytest` for further information. ================================================ FILE: .github/labels.toml ================================================ [breaking] color = "ffcc00" name = "breaking" description = "Breaking change." [bug] color = "d73a4a" name = "bug" description = "Something isn't working" [dependencies] color = "0366d6" name = "dependencies" description = "Pull requests that update a dependency file" [github_actions] color = "000000" name = "github_actions" description = "Update of github actions" [documentation] color = "1bc4a5" name = "documentation" description = "Improvements or additions to documentation" [duplicate] color = "cfd3d7" name = "duplicate" description = "This issue or pull request already exists" [enhancement] color = "a2eeef" name = "enhancement" description = "New feature or request" ["good first issue"] color = "7057ff" name = "good first issue" description = "Good for newcomers" ["help wanted"] color = "008672" name = "help wanted" description = "Extra attention is needed" [invalid] color = "e4e669" name = "invalid" description = "This doesn't seem right" [nochangelog] color = "555555" name = "nochangelog" description = "Exclude pull requests from changelog" [question] color = "d876e3" name = "question" description = "Further information is requested" [removed] color = "e99695" name = "removed" description = "Removed piece of functionalities." 
[tests] color = "bfd4f2" name = "tests" description = "CI, CD and testing related changes" [wontfix] color = "ffffff" name = "wontfix" description = "This will not be worked on" [discussion] color = "c2e0c6" name = "discussion" description = "Some discussion around the project" [hacktoberfest] color = "ffa663" name = "hacktoberfest" description = "Good issues for Hacktoberfest" [answered] color = "0ee2b6" name = "answered" description = "Automatically closes as answered after a delay" [waiting] color = "5f7972" name = "waiting" description = "Automatically closes if no answer after a delay" [fund] color = "0E8A16" name = "fund" description = "Add a section linking to polar.sh for funding the issue." ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: - main pull_request: concurrency: group: ${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.x - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 # Make sure commit messages follow the conventional commits convention: # https://www.conventionalcommits.org commitlint: name: Lint Commit Messages runs-on: ubuntu-latest steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: fetch-depth: 0 - uses: wagoid/commitlint-github-action@b948419dd99f3fd78a6548d48f94e3df7f6bf3ed # v6.2.1 test: strategy: fail-fast: false matrix: python-version: # - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" os: - ubuntu-latest # - windows-latest # - macOS-latest runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 id: setup-python with: 
python-version: ${{ matrix.python-version }} - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7 - run: uv sync --no-python-downloads shell: bash - run: uv run pytest shell: bash - uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5 with: token: ${{ secrets.CODECOV_TOKEN }} release: needs: - test - lint - commitlint runs-on: ubuntu-latest environment: release concurrency: release permissions: id-token: write attestations: write contents: write steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: fetch-depth: 0 ref: ${{ github.sha }} - name: Checkout commit for release run: | git checkout -B ${{ github.ref_name }} ${{ github.sha }} # Do a dry run of PSR - name: Test release uses: python-semantic-release/python-semantic-release@350c48fcb3ffcdfd2e0a235206bc2ecea6b69df0 # v10 if: github.ref_name != 'main' with: root_options: --noop github_token: noop # On main branch: actual PSR + upload to PyPI & GitHub - name: Release uses: python-semantic-release/python-semantic-release@350c48fcb3ffcdfd2e0a235206bc2ecea6b69df0 # v10 id: release if: github.ref_name == 'main' with: github_token: ${{ secrets.GITHUB_TOKEN }} - name: Attest build provenance uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4 if: steps.release.outputs.released == 'true' with: subject-path: "dist/*" - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 if: steps.release.outputs.released == 'true' - name: Publish package distributions to GitHub Releases uses: python-semantic-release/publish-action@310a9983a0ae878b29f3aac778d7c77c1db27378 # v10 if: steps.release.outputs.released == 'true' with: github_token: ${{ secrets.GITHUB_TOKEN }} tag: ${{ steps.release.outputs.tag }} ================================================ FILE: .github/workflows/hacktoberfest.yml ================================================ name: Hacktoberfest on: schedule: # Run every day in 
October - cron: "0 0 * 10 *" # Run on the 1st of November to revert - cron: "0 13 1 11 *" jobs: hacktoberfest: runs-on: ubuntu-latest steps: - uses: browniebroke/hacktoberfest-labeler-action@72564cc2b8f1cd239fb6880cca150a1b8b6b027b # v2.6.0 with: github_token: ${{ secrets.GH_PAT }} ================================================ FILE: .github/workflows/issue-manager.yml ================================================ name: Issue Manager on: schedule: - cron: "0 0 * * *" issue_comment: types: - created issues: types: - labeled pull_request_target: types: - labeled workflow_dispatch: jobs: issue-manager: runs-on: ubuntu-latest steps: - uses: tiangolo/issue-manager@2fb3484ec9279485df8659e8ec73de262431737d # 0.6.0 with: token: ${{ secrets.GITHUB_TOKEN }} config: > { "answered": { "message": "Assuming the original issue was solved, it will be automatically closed now." }, "waiting": { "message": "Automatically closing. To re-open, please provide the additional information requested." } } ================================================ FILE: .github/workflows/labels.yml ================================================ name: Sync Github labels on: push: branches: - main paths: - ".github/**" jobs: labels: runs-on: ubuntu-latest steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - name: Set up Python uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.x - name: Install labels run: pip install labels - name: Sync config with Github run: labels -u ${{ github.repository_owner }} -t ${{ secrets.GH_PAT }} sync -f .github/labels.toml ================================================ FILE: .github/workflows/poetry-upgrade.yml ================================================ name: Upgrader on: workflow_dispatch: schedule: - cron: "29 23 16 * *" jobs: upgrade: uses: browniebroke/github-actions/.github/workflows/poetry-upgrade.yml@a4a8428c6f76ab8848c94c5a649fa809aacf8688 # v1 secrets: gh_pat: ${{ 
secrets.GH_PAT }} ================================================ FILE: .github/workflows/upgrader.yml ================================================ name: Upgrader on: workflow_dispatch: schedule: - cron: "15 11 3 1-9,11-12 *" jobs: upgrade: uses: browniebroke/github-actions/.github/workflows/uv-upgrade.yml@a4a8428c6f76ab8848c94c5a649fa809aacf8688 # v1 secrets: gh_pat: ${{ secrets.GH_PAT }} ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # additional files tests/**/*.wav !tests/dataset_raw/test/**/*.wav tests/**/*.npy tests/**/*.pt tests/**/*.txt tests/**/*.json tests/**/*.pth tests/**/*.download tests/**/*.lab tests/**/*.pdf tests/**/*.csv tests/**/*.ckpt tests/**/*.yaml *.tfevents.* *.pt user_gui_presets.json logs dataset dataset_raw configs filelists ================================================ FILE: .gitpod.yml ================================================ tasks: - command: | pip install uv PIP_USER=false uv sync - command: | pip install pre-commit pre-commit install PIP_USER=false pre-commit install-hooks ================================================ FILE: .pre-commit-config.yaml ================================================ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks exclude: "CHANGELOG.md|.copier-answers.yml|.all-contributorsrc|project" default_stages: [pre-commit] ci: autofix_commit_msg: "chore(pre-commit.ci): auto fixes" autoupdate_commit_msg: "chore(pre-commit.ci): pre-commit autoupdate" repos: - repo: https://github.com/commitizen-tools/commitizen rev: v4.13.9 hooks: - id: commitizen stages: [commit-msg] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id:
debug-statements - id: check-builtin-literals - id: check-case-conflict - id: check-docstring-first - id: check-json - id: check-toml - id: check-xml - id: check-yaml - id: detect-private-key - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/tox-dev/pyproject-fmt rev: "v2.20.0" hooks: - id: pyproject-fmt - repo: https://github.com/astral-sh/uv-pre-commit rev: 0.10.12 hooks: - id: uv-lock - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.1.0 hooks: - id: prettier args: ["--tab-width", "2"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.14 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - id: ruff-format - repo: https://github.com/codespell-project/codespell rev: v2.4.2 hooks: - id: codespell # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.15.0 # hooks: # - id: mypy # additional_dependencies: [] ================================================ FILE: .readthedocs.yml ================================================ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: python: "3.12" commands: - asdf plugin add uv - asdf install uv latest - asdf global uv latest - uv sync --only-group docs --frozen - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs $READTHEDOCS_OUTPUT/html # Build documentation in the docs directory with Sphinx sphinx: configuration: docs/conf.py ================================================ FILE: CHANGELOG.md ================================================ # Changelog ## v4.2.30 (2026-02-02) ### Bug fixes - Fix `.json` files not included ([`922beed`](https://github.com/voicepaw/so-vits-svc-fork/commit/922beedff7d1efd7d54c75d92f2e090e18c58369)) ## v4.2.29 (2025-10-27) ### Bug fixes - Fix train not working 
([`f90cc40`](https://github.com/voicepaw/so-vits-svc-fork/commit/f90cc40802a56ebb3a8ba1f1493ff8d6008fa57b)) ### Documentation - Better notebook ([`a80a296`](https://github.com/voicepaw/so-vits-svc-fork/commit/a80a296166ed0a872f93fc30f504b3a504e11f9e)) ## v4.2.28 (2025-10-26) ### Documentation - Better notebook ([`b3e9fe3`](https://github.com/voicepaw/so-vits-svc-fork/commit/b3e9fe3b6069ee0846701111c4dbc9c69924fbc6)) ### Bug fixes - Fix config templates not included ([`319ba6e`](https://github.com/voicepaw/so-vits-svc-fork/commit/319ba6e0ef2ee61c3f096e3e8e2c58665da42c8c)) ## v4.2.27 (2025-09-10) ### Bug fixes - Run copier recopy ([`b806ddb`](https://github.com/voicepaw/so-vits-svc-fork/commit/b806ddb4e14f2e82ad9349596d776bfdbd3ce4b7)) - Remove onnx deps ([`021c959`](https://github.com/voicepaw/so-vits-svc-fork/commit/021c95936ca1b459e79fc14e4d801ffccb48346a)) ### Documentation - Update civitai model url ([`0f015e3`](https://github.com/voicepaw/so-vits-svc-fork/commit/0f015e32aada5cf7481f91bbe6758e574c9c5f39)) ## v4.2.26 (2024-07-29) ### Bug fixes - Update dependency transformers to v4.43.3 ([`bd9262f`](https://github.com/voicepaw/so-vits-svc-fork/commit/bd9262f546eb9aaa8d9f9641f2d1faa361cf8ea8)) ## v4.2.25 (2024-07-29) ### Bug fixes - Update dependency torch to v2.4.0 ([`20549f6`](https://github.com/voicepaw/so-vits-svc-fork/commit/20549f6f4e1f59090d6bbfe45c43f62613effa0e)) ## v4.2.24 (2024-07-18) ### Bug fixes - Update dependency transformers to v4.42.4 ([`f949a07`](https://github.com/voicepaw/so-vits-svc-fork/commit/f949a071b542b4b699aaa39cf4cfb39d0b53950b)) ## v4.2.23 (2024-07-18) ### Bug fixes - Update dependency lightning to v2.3.3 ([`31edf05`](https://github.com/voicepaw/so-vits-svc-fork/commit/31edf05234d72401db02d994f27d611c4015a65b)) ## v4.2.22 (2024-07-18) ### Bug fixes - Update dependency fastapi to v0.111.1 ([`59ed5f3`](https://github.com/voicepaw/so-vits-svc-fork/commit/59ed5f32e67d4bb96fdd7b2bb606d1ce9e4bb9f0)) ## v4.2.21 (2024-07-04) ### Bug fixes - 
Update dependency transformers to v4.42.3 ([`b9c031c`](https://github.com/voicepaw/so-vits-svc-fork/commit/b9c031c6814c12c9d5e04ea19745b67f41f8e9ae)) ## v4.2.20 (2024-07-04) ### Bug fixes - Update dependency tensorboard to v2.17.0 ([`e5f3c13`](https://github.com/voicepaw/so-vits-svc-fork/commit/e5f3c1354dcda41c1fa3e518d0d5bc204800f03c)) ## v4.2.19 (2024-07-04) ### Bug fixes - Update dependency lightning to v2.3.2 ([`a7e299f`](https://github.com/voicepaw/so-vits-svc-fork/commit/a7e299ff882c5854ac4be88d21fe95ed1a159711)) ## v4.2.18 (2024-07-04) ### Bug fixes - Update dependency matplotlib to v3.9.1 ([`df6adf4`](https://github.com/voicepaw/so-vits-svc-fork/commit/df6adf461d2174b92ccc0aa6ee4b02a1c9e4634e)) ## v4.2.17 (2024-07-04) ### Bug fixes - Update dependency lightning to v2.3.1 ([`89da16b`](https://github.com/voicepaw/so-vits-svc-fork/commit/89da16bd89ac08c07334156d28ab7dac29a0f01e)) ## v4.2.16 (2024-07-04) ### Bug fixes - Update dependency scipy to v1.14.0 ([`45a1167`](https://github.com/voicepaw/so-vits-svc-fork/commit/45a1167f9d09a822e9dca2b497bed08edca6e919)) ## v4.2.15 (2024-07-03) ### Bug fixes - Update dependency torchcrepe to v0.0.23 ([`2d76d82`](https://github.com/voicepaw/so-vits-svc-fork/commit/2d76d82df14afc3ec6b89770997f267237f98d53)) ## v4.2.14 (2024-07-03) ### Bug fixes - Update dependency torch to v2.3.1 ([`cc51418`](https://github.com/voicepaw/so-vits-svc-fork/commit/cc514182b48a133ed2da249f3d3dc65b28870e74)) ## v4.2.13 (2024-07-03) ### Bug fixes - Update dependency sounddevice to v0.4.7 ([`4df53c2`](https://github.com/voicepaw/so-vits-svc-fork/commit/4df53c22579c9bfe236953bfe238dde0179cfaca)) ## v4.2.12 (2024-07-03) ### Bug fixes - Update dependency requests to v2.32.3 ([`e60876a`](https://github.com/voicepaw/so-vits-svc-fork/commit/e60876ab2c883ca1accb9488a5ee17232d4e4ce7)) ## v4.2.11 (2024-07-02) ### Bug fixes - Update dependency onnx to v1.16.1 
([`0d7ed17`](https://github.com/voicepaw/so-vits-svc-fork/commit/0d7ed171011bdcdf4ec701d1df53573ced09ddbf)) ### Documentation - Update docs ([`94b2412`](https://github.com/voicepaw/so-vits-svc-fork/commit/94b2412f95ee6cb194c37ae558c9ca03b23402db)) - Update docs ([`94b2412`](https://github.com/voicepaw/so-vits-svc-fork/commit/94b2412f95ee6cb194c37ae558c9ca03b23402db)) - Update docs ([`94b2412`](https://github.com/voicepaw/so-vits-svc-fork/commit/94b2412f95ee6cb194c37ae558c9ca03b23402db)) - Update docs ([`94b2412`](https://github.com/voicepaw/so-vits-svc-fork/commit/94b2412f95ee6cb194c37ae558c9ca03b23402db)) ## v4.2.10 (2024-07-02) ### Bug fixes - Replace pysimplegui with pysimplegui-4-foss ([`34e2e77`](https://github.com/voicepaw/so-vits-svc-fork/commit/34e2e77a7f258e09f4661a96645a5f79d761cbed)) ## v4.2.9 (2024-05-23) ### Bug fixes - Update dependency transformers to v4.41.1 ([`42c69fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/42c69fd48146f6b43f9dbfac53339ad573d61acd)) ## v4.2.8 (2024-05-22) ### Bug fixes - Update dependency lightning to v2.2.5 ([`6a457dc`](https://github.com/voicepaw/so-vits-svc-fork/commit/6a457dc4996220cebe0ce54d7f116873f1cf94f3)) ## v4.2.7 (2024-05-22) ### Bug fixes - Update dependency requests to v2.32.2 ([`28e1be1`](https://github.com/voicepaw/so-vits-svc-fork/commit/28e1be1ef191badbe314cf232e932646fd6811d1)) ## v4.2.6 (2024-05-18) ### Bug fixes - Update dependency transformers to v4.41.0 ([`9d20b50`](https://github.com/voicepaw/so-vits-svc-fork/commit/9d20b509e210d20cb7005a58c6408830522b94cf)) ## v4.2.5 (2024-05-16) ### Bug fixes - Update dependency matplotlib to v3.9.0 ([`ed95519`](https://github.com/voicepaw/so-vits-svc-fork/commit/ed9551956bbae36164f9404bad87ac78d7a326c5)) ## v4.2.4 (2024-05-16) ### Bug fixes - Update dependency tqdm-joblib to ^0.0.4 ([`06ea73c`](https://github.com/voicepaw/so-vits-svc-fork/commit/06ea73cd3a82cc058df5b5973aa6edf97d4d708e)) ## v4.2.3 (2024-05-10) ### Bug fixes - Update dependency fastapi to 
v0.111.0 ([`ee70d52`](https://github.com/voicepaw/so-vits-svc-fork/commit/ee70d522ab1943513517d5068e17c1e5578b09ce)) ## v4.2.2 (2024-05-10) ### Bug fixes - Fix format selection for the input audio in non-windows ([`8168cb4`](https://github.com/voicepaw/so-vits-svc-fork/commit/8168cb404648c23e3ac5f3d2418bf38a606710e4)) - Fix format selection for the input audio in non-windows ([`8168cb4`](https://github.com/voicepaw/so-vits-svc-fork/commit/8168cb404648c23e3ac5f3d2418bf38a606710e4)) ## v4.2.1 (2024-05-10) ### Bug fixes - Support python 3.12, end support for python 3.8, explicitly specify click as a dependency, update deps ([`a7ceffa`](https://github.com/voicepaw/so-vits-svc-fork/commit/a7ceffa57566082f2a4ce9842be236505681d629)) ### Documentation - Replace 3.10 with 3.11 ([`a7ceffa`](https://github.com/voicepaw/so-vits-svc-fork/commit/a7ceffa57566082f2a4ce9842be236505681d629)) ## v4.2.0 (2024-04-11) ### Features - Add leading zeros for 4-digit width of the output file name's numeric part #1154 ([`41b147f`](https://github.com/voicepaw/so-vits-svc-fork/commit/41b147f6c20873fc1cfeaae50d27b7b80d5fdeb6)) ### Documentation - Add annagorshunova as a contributor for bug, and code ([`50f6d79`](https://github.com/voicepaw/so-vits-svc-fork/commit/50f6d79f81d443c3dea9a4de3c65dca6988080ac)) - Update readme.md [skip ci] ([`50f6d79`](https://github.com/voicepaw/so-vits-svc-fork/commit/50f6d79f81d443c3dea9a4de3c65dca6988080ac)) - Update .all-contributorsrc [skip ci] ([`50f6d79`](https://github.com/voicepaw/so-vits-svc-fork/commit/50f6d79f81d443c3dea9a4de3c65dca6988080ac)) ### Bug fixes - Set speaker-diarization version to 3.1 for pyannote.audio 3.1.1 compatibility ([`9bd3089`](https://github.com/voicepaw/so-vits-svc-fork/commit/9bd3089d87be0c4e7bd0fbed51c06c203ad55474)) ## v4.1.61 (2024-04-06) ### Bug fixes - Update dependency fastapi to v0.110.1 ([`eab647c`](https://github.com/voicepaw/so-vits-svc-fork/commit/eab647c8e21b954aa082b8319f084ae080105180)) ### Documentation - Add 
priyanshu-hawk as a contributor for code ([`d6888db`](https://github.com/voicepaw/so-vits-svc-fork/commit/d6888db4204f87b7075d41371edf08c050179912)) - Update readme.md [skip ci] ([`d6888db`](https://github.com/voicepaw/so-vits-svc-fork/commit/d6888db4204f87b7075d41371edf08c050179912)) - Update .all-contributorsrc [skip ci] ([`d6888db`](https://github.com/voicepaw/so-vits-svc-fork/commit/d6888db4204f87b7075d41371edf08c050179912)) - Add marcellocirelli as a contributor for bug ([`8795709`](https://github.com/voicepaw/so-vits-svc-fork/commit/879570933831cdee3f325c94fc5b4e3fd172990f)) - Update readme.md [skip ci] ([`8795709`](https://github.com/voicepaw/so-vits-svc-fork/commit/879570933831cdee3f325c94fc5b4e3fd172990f)) - Update .all-contributorsrc [skip ci] ([`8795709`](https://github.com/voicepaw/so-vits-svc-fork/commit/879570933831cdee3f325c94fc5b4e3fd172990f)) ## v4.1.60 (2024-04-06) ### Documentation - Add description of repository maintenance status ([`3f537b0`](https://github.com/voicepaw/so-vits-svc-fork/commit/3f537b0919c0e651297c190ede9eb3c03782f319)) - Add samuelswartzberg as a contributor for security ([`cddb722`](https://github.com/voicepaw/so-vits-svc-fork/commit/cddb72236f00d00a566a27a0243b71abbd615c64)) - Update readme.md [skip ci] ([`cddb722`](https://github.com/voicepaw/so-vits-svc-fork/commit/cddb72236f00d00a566a27a0243b71abbd615c64)) - Update .all-contributorsrc [skip ci] ([`cddb722`](https://github.com/voicepaw/so-vits-svc-fork/commit/cddb72236f00d00a566a27a0243b71abbd615c64)) - Update pytorch urls ([`c0c5376`](https://github.com/voicepaw/so-vits-svc-fork/commit/c0c537639c72455328f98d147c06bd8f86030399)) - Add 4ll0w3v1l as a contributor for code ([`df699c7`](https://github.com/voicepaw/so-vits-svc-fork/commit/df699c7284149f79238b783f530d058b2a272447)) - Update readme.md [skip ci] ([`df699c7`](https://github.com/voicepaw/so-vits-svc-fork/commit/df699c7284149f79238b783f530d058b2a272447)) - Update .all-contributorsrc [skip ci] 
([`df699c7`](https://github.com/voicepaw/so-vits-svc-fork/commit/df699c7284149f79238b783f530d058b2a272447)) ### Bug fixes - Disallow pysimplegui>=5, update deps, update pytorch urls in readme.md ([`c0c5376`](https://github.com/voicepaw/so-vits-svc-fork/commit/c0c537639c72455328f98d147c06bd8f86030399)) - Disallow pysimplegui>=5 ([`c0c5376`](https://github.com/voicepaw/so-vits-svc-fork/commit/c0c537639c72455328f98d147c06bd8f86030399)) ## v4.1.59 (2024-04-06) ### Bug fixes - Fix broken scipy imports in _pqmf.py ([`b7639ca`](https://github.com/voicepaw/so-vits-svc-fork/commit/b7639ca3a2b283f371a14ce176fe5d0e1d74581e)) ## v4.1.58 (2024-03-25) ### Bug fixes - Update dependency transformers to v4.39.1 ([`a274333`](https://github.com/voicepaw/so-vits-svc-fork/commit/a274333e764ea56aa099033de24279619b4f2210)) ## v4.1.57 (2024-03-25) ### Bug fixes - Update dependency pebble to v5.0.7 ([`e14b62f`](https://github.com/voicepaw/so-vits-svc-fork/commit/e14b62f11f8ed245a05c663381b086e92f76f2c6)) ## v4.1.56 (2024-03-05) ### Bug fixes - Update dependency lightning to v2.2.1 ([`a84d26b`](https://github.com/voicepaw/so-vits-svc-fork/commit/a84d26ba6614c3cf1ca3415ee5131e77867f5d10)) ## v4.1.55 (2024-03-04) ### Bug fixes - Update dependency onnxsim to v0.4.36 ([`12761e8`](https://github.com/voicepaw/so-vits-svc-fork/commit/12761e8989f43864b9f35f1dc144f5bc4dea1ac0)) ## v4.1.54 (2024-03-03) ### Bug fixes - Update dependency transformers to v4.38.2 ([`cfc4edb`](https://github.com/voicepaw/so-vits-svc-fork/commit/cfc4edb570d5381f044cc9db51f291744c118f87)) ## v4.1.53 (2024-02-28) ### Bug fixes - Update dependency rich to v13.7.1 ([`21f33d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/21f33d6494f09b62e2b97ceb356be7d6fa6560bc)) ## v4.1.52 (2024-02-25) ### Bug fixes - Update dependency fastapi to v0.110.0 ([`29fc759`](https://github.com/voicepaw/so-vits-svc-fork/commit/29fc7592dae3a16c310a159ebe94df5f64ac2271)) ## v4.1.51 (2024-02-23) ### Bug fixes - Update dependency torch to v2.2.1 
([`bbc73c1`](https://github.com/voicepaw/so-vits-svc-fork/commit/bbc73c1b15608a8d4b1cf564ac2183044a94bdc6)) ## v4.1.50 (2024-02-22) ### Bug fixes - Update dependency transformers to v4.38.1 ([`c90cfee`](https://github.com/voicepaw/so-vits-svc-fork/commit/c90cfee4dbcd29f6fd54193d506232c4a1ab0fe7)) ## v4.1.49 (2024-02-21) ### Bug fixes - Update dependency transformers to v4.38.0 ([`4dec304`](https://github.com/voicepaw/so-vits-svc-fork/commit/4dec3048ed3fd208ed9b24dfe2e17338adcc8253)) ## v4.1.48 (2024-02-16) ### Bug fixes - Update dependency matplotlib to v3.8.3 ([`e8eab7f`](https://github.com/voicepaw/so-vits-svc-fork/commit/e8eab7f9fc47c1ddc7c2753705abfdbafbc53f69)) ## v4.1.47 (2024-02-10) ### Bug fixes - Update dependency tqdm to v4.66.2 ([`4516483`](https://github.com/voicepaw/so-vits-svc-fork/commit/451648353d5d473dfa058d75ce4953db67422506)) ## v4.1.46 (2024-02-08) ### Bug fixes - Update dependency lightning to v2.2.0 ([`f7b2a42`](https://github.com/voicepaw/so-vits-svc-fork/commit/f7b2a427f11cab439b03ec6ec87a5794b184aa57)) ## v4.1.45 (2024-02-05) ### Bug fixes - Update dependency fastapi to v0.109.2 ([`c570f8e`](https://github.com/voicepaw/so-vits-svc-fork/commit/c570f8e37b7c1b9ab0faada3c4f7f37a7e8fe896)) ## v4.1.44 (2024-02-03) ### Bug fixes - Update dependency fastapi to v0.109.1 ([`6ee83d5`](https://github.com/voicepaw/so-vits-svc-fork/commit/6ee83d5931c2e2f5f3658ce96a83bec53e6e1d73)) ## v4.1.43 (2024-02-02) ### Bug fixes - Update dependency lightning to v2.1.4 ([`33334fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/33334fd9a0e112a811b5ad90cedc0e1929f10e89)) ## v4.1.42 (2024-01-30) ### Bug fixes - Update dependency torch to v2.2.0 ([`8750059`](https://github.com/voicepaw/so-vits-svc-fork/commit/875005917101170e755b4dca7fe223436fb3e41e)) ## v4.1.41 (2024-01-29) ### Bug fixes - Update dependency transformers to v4.37.2 ([`69c59b8`](https://github.com/voicepaw/so-vits-svc-fork/commit/69c59b8180cd489f30b5f13bc037c9928e1e65ba)) ### Documentation - Add 
onako2 as a contributor for doc ([`b204663`](https://github.com/voicepaw/so-vits-svc-fork/commit/b204663d36a1bf1ed5a2af23866824ed9c5dce43)) - Update readme.md [skip ci] ([`b204663`](https://github.com/voicepaw/so-vits-svc-fork/commit/b204663d36a1bf1ed5a2af23866824ed9c5dce43)) - Update .all-contributorsrc [skip ci] ([`b204663`](https://github.com/voicepaw/so-vits-svc-fork/commit/b204663d36a1bf1ed5a2af23866824ed9c5dce43)) ## v4.1.40 (2024-01-24) ### Bug fixes - Update dependency transformers to v4.37.1 ([`d8be0d0`](https://github.com/voicepaw/so-vits-svc-fork/commit/d8be0d01361a00fb71477daab666a75a33d0fd49)) ## v4.1.39 (2024-01-22) ### Bug fixes - Update dependency transformers to v4.37.0 ([`7b405c6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7b405c6daff500c4f60f37cc430cbf364e95bd26)) ## v4.1.38 (2024-01-11) ### Bug fixes - Update dependency fastapi to v0.109.0 ([`565be56`](https://github.com/voicepaw/so-vits-svc-fork/commit/565be56fcc4c62e4f2099db8108bb2c982326411)) ## v4.1.37 (2024-01-03) ### Bug fixes - Update dependency transformers to v4.36.2 ([`7e18425`](https://github.com/voicepaw/so-vits-svc-fork/commit/7e18425b8d1c29820fff30df0bb7c6ee6d24e22d)) ## v4.1.36 (2024-01-03) ### Bug fixes - Update dependency fastapi to v0.108.0 ([`091805c`](https://github.com/voicepaw/so-vits-svc-fork/commit/091805c1d070922318ef10389ab225788db89dd7)) ## v4.1.35 (2024-01-03) ### Bug fixes - Update dependency torch to v2.1.2 ([`77586fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/77586fd8d1eded848cc334aac46be35202da2e0a)) ## v4.1.34 (2024-01-03) ### Bug fixes - Update dependency pebble to v5.0.6 ([`546db40`](https://github.com/voicepaw/so-vits-svc-fork/commit/546db40768114fcfab4a15a8c9b28398a8075446)) ## v4.1.33 (2024-01-02) ### Bug fixes - Update dependency lightning to v2.1.3 ([`47b15e6`](https://github.com/voicepaw/so-vits-svc-fork/commit/47b15e6ba439239ea5459f01321e7a8d2c681ae4)) ## v4.1.32 (2023-11-21) ### Bug fixes - Update dependency pebble to v5.0.4 
([`a8dc5d7`](https://github.com/voicepaw/so-vits-svc-fork/commit/a8dc5d7f88f0117291ba90fce23e3b1eebc52902)) ## v4.1.31 (2023-11-18) ### Bug fixes - Update dependency matplotlib to v3.8.2 ([`68eb536`](https://github.com/voicepaw/so-vits-svc-fork/commit/68eb536b4a45a61803ffbab57a1a5c932b2dedcb)) ## v4.1.30 (2023-11-16) ### Bug fixes - Update dependency torch to v2.1.1 ([`1911035`](https://github.com/voicepaw/so-vits-svc-fork/commit/19110358c12306b087af11837b43baf7d626e500)) ## v4.1.29 (2023-11-16) ### Bug fixes - Update dependency lightning to v2.1.2 ([`58c8d5a`](https://github.com/voicepaw/so-vits-svc-fork/commit/58c8d5aa65dc55b53ed9dce25b7f08280fff5fba)) ## v4.1.28 (2023-11-16) ### Bug fixes - Update dependency rich to v13.7.0 ([`1be5442`](https://github.com/voicepaw/so-vits-svc-fork/commit/1be54422e5383900fac818f7b9d33b31eac4ee92)) ## v4.1.27 (2023-11-15) ### Bug fixes - Update dependency transformers to v4.35.2 ([`77ee0c0`](https://github.com/voicepaw/so-vits-svc-fork/commit/77ee0c0384c02c34c85ec77a8b8e1cfad2f94caf)) ## v4.1.26 (2023-11-14) ### Bug fixes - Update dependency transformers to v4.35.1 ([`fa503ce`](https://github.com/voicepaw/so-vits-svc-fork/commit/fa503ce412d6afcd859375255fb128b33a648465)) ### Documentation - Add shinyjustyz as a contributor for bug, and code ([`acd6a8e`](https://github.com/voicepaw/so-vits-svc-fork/commit/acd6a8e4733ad1dbe94f118599aafa87f23ce89f)) - Update readme.md [skip ci] ([`acd6a8e`](https://github.com/voicepaw/so-vits-svc-fork/commit/acd6a8e4733ad1dbe94f118599aafa87f23ce89f)) - Update .all-contributorsrc [skip ci] ([`acd6a8e`](https://github.com/voicepaw/so-vits-svc-fork/commit/acd6a8e4733ad1dbe94f118599aafa87f23ce89f)) ## v4.1.25 (2023-11-09) ### Bug fixes - Make pyanote.audio use gpu ([`c9d49ca`](https://github.com/voicepaw/so-vits-svc-fork/commit/c9d49ca8a903e1bf6e8a6ac9c6a8365077bedad4)) ## v4.1.24 (2023-11-08) ### Bug fixes - Update dependency lightning to v2.1.1 
([`ce8efce`](https://github.com/voicepaw/so-vits-svc-fork/commit/ce8efcefb8df2601941cae0d63e843e49ffbdfb6)) ## v4.1.23 (2023-11-02) ### Bug fixes - Update dependency transformers to v4.35.0 ([`bb05569`](https://github.com/voicepaw/so-vits-svc-fork/commit/bb055692363677cf48f22baef2b72b255fc74182)) ## v4.1.22 (2023-10-30) ### Bug fixes - Update dependency fastapi to v0.104.1 ([`dbd4490`](https://github.com/voicepaw/so-vits-svc-fork/commit/dbd44909e3aabb2787e136036c1e2ca9ab6b9316)) ## v4.1.21 (2023-10-26) ### Bug fixes - Update dependency onnx to v1.15.0 ([`5736bf7`](https://github.com/voicepaw/so-vits-svc-fork/commit/5736bf7e257dbd39c64ac73f3593ffebaa559def)) ## v4.1.20 (2023-10-26) ### Bug fixes - Update python to >=3.8,<3.13 ([`031712a`](https://github.com/voicepaw/so-vits-svc-fork/commit/031712a70177f20610f8fefd20f49036dfe15721)) ## v4.1.19 (2023-10-21) ### Bug fixes - Update dependency onnxsim to v0.4.35 ([`dd89347`](https://github.com/voicepaw/so-vits-svc-fork/commit/dd89347e863fd7a40683447463dfb665522a1d10)) ## v4.1.18 (2023-10-21) ### Bug fixes - Update dependency onnxsim to v0.4.34 ([`3d2d4af`](https://github.com/voicepaw/so-vits-svc-fork/commit/3d2d4af65221ded497e3e805dfb48792ab20640f)) ## v4.1.17 (2023-10-19) ### Bug fixes - Update dependency transformers to v4.34.1 ([`78c2d4c`](https://github.com/voicepaw/so-vits-svc-fork/commit/78c2d4c850c7cee2e58dc7e0ad10243e55247f64)) ## v4.1.16 (2023-10-18) ### Bug fixes - Update dependency fastapi to v0.104.0 ([`6440667`](https://github.com/voicepaw/so-vits-svc-fork/commit/6440667b03cc79519b9e83aa08757c21d17bcf99)) ## v4.1.15 (2023-10-13) ### Bug fixes - Update dependency rich to v13.6.0 ([`9ae0737`](https://github.com/voicepaw/so-vits-svc-fork/commit/9ae073700058ff17ab5a8a0a781fb3fe942e1994)) ## v4.1.14 (2023-10-13) ### Bug fixes - Update dependency lightning to v2.1.0 ([`4637f69`](https://github.com/voicepaw/so-vits-svc-fork/commit/4637f693ea994c5180ec7a517bea6e5ddd8445aa)) - Update dependency transformers to 
v4.34.0 ([`6bb2555`](https://github.com/voicepaw/so-vits-svc-fork/commit/6bb2555ace79487a4252a23ba7915a5b3676629e)) ## v4.1.13 (2023-10-13) ### Bug fixes - Update dependency librosa to v0.10.1 ([`3ae20b7`](https://github.com/voicepaw/so-vits-svc-fork/commit/3ae20b7cbcc2fbfc72a2c8cb73a653bb7ee863a1)) - Update dependency torchcrepe to v0.0.22 ([`ad7b2bf`](https://github.com/voicepaw/so-vits-svc-fork/commit/ad7b2bfa23e9e669b46976b796fb58d6b4829ce3)) ## v4.1.12 (2023-10-13) ### Bug fixes - Update dependency fastapi to v0.103.2 ([`02cea64`](https://github.com/voicepaw/so-vits-svc-fork/commit/02cea643631e2c39265c7f4f58e40cea18e707e6)) ## v4.1.11 (2023-09-23) ### Documentation - Replace "34j" with "voicepaw" ([`c1e6c0c`](https://github.com/voicepaw/so-vits-svc-fork/commit/c1e6c0c0c61d4a99eb1a19e8ca0f619d9a07146a)) ### Bug fixes - Update python to >=3.11,<3.12 ([`a5455b9`](https://github.com/voicepaw/so-vits-svc-fork/commit/a5455b92f7228fc01d51cdbfb7da6e9241c7fcca)) ## v4.1.10 (2023-09-17) ### Bug fixes - Update dependency rich to v13.5.3 ([`e692e8c`](https://github.com/voicepaw/so-vits-svc-fork/commit/e692e8cd81dc648edcd60503a52274a8b9738dab)) ## v4.1.9 (2023-09-16) ### Bug fixes - Update dependency transformers to v4.33.2 ([`7a8e54f`](https://github.com/voicepaw/so-vits-svc-fork/commit/7a8e54f10d0679df8419cc1cf934434f9f08e9b9)) ## v4.1.8 (2023-09-15) ### Bug fixes - Update dependency lightning to v2.0.9 ([`dcde3d1`](https://github.com/voicepaw/so-vits-svc-fork/commit/dcde3d1a0b67e4825a709d19f5708b086b6c35e7)) ## v4.1.7 (2023-09-12) ### Bug fixes - Update dependency matplotlib to v3.7.3 ([`302d5a7`](https://github.com/voicepaw/so-vits-svc-fork/commit/302d5a7dd0f0578d9f126c898b1c871f22987742)) ## v4.1.6 (2023-09-06) ### Bug fixes - Update dependency transformers to v4.33.1 ([`f3e3b68`](https://github.com/voicepaw/so-vits-svc-fork/commit/f3e3b689d416f7191b8c5a25976afb0b11b4a3c7)) ## v4.1.5 (2023-09-05) ### Bug fixes - Update dependency transformers to v4.33.0 
([`146d3ae`](https://github.com/voicepaw/so-vits-svc-fork/commit/146d3ae33aeb7b7440b47a89f286ec2dfe4c689f)) ## v4.1.4 (2023-09-02) ### Bug fixes - Update dependency fastapi to v0.103.1 ([`f7473aa`](https://github.com/voicepaw/so-vits-svc-fork/commit/f7473aa1226c8aed89b44f6d08bea05dba68e882)) ## v4.1.3 (2023-08-30) ### Bug fixes - Update dependency lightning to v2.0.8 ([`825fa44`](https://github.com/voicepaw/so-vits-svc-fork/commit/825fa44279bd7c3c2812efafe4f9757803f04519)) ## v4.1.2 (2023-08-28) ### Bug fixes - Update dependency transformers to v4.32.1 ([`da7a72f`](https://github.com/voicepaw/so-vits-svc-fork/commit/da7a72ff0b11231793e48ac5fcb38a1b022fa26b)) ### Documentation - Add instructions for pipx installation, update torch urls ([`0b02c49`](https://github.com/voicepaw/so-vits-svc-fork/commit/0b02c49edb5701becfe141645f0e3fc00c241944)) - Add shenberg as a contributor for usertesting, ideas, and code ([`319ddf3`](https://github.com/voicepaw/so-vits-svc-fork/commit/319ddf35e2f7e915bbf786fa785ec2734f4b0c00)) ## v4.1.1 (2023-07-02) ### Bug fixes - Remove weight norm on inference so metal backend will work without cpu fallback ([`39ea0bc`](https://github.com/voicepaw/so-vits-svc-fork/commit/39ea0bc57f39fdbbcf07c92fab310474d95d1d39)) ## v4.1.0 (2023-06-25) ### Documentation - Add zerui18 as a contributor for code, and ideas ([`4e74fc4`](https://github.com/voicepaw/so-vits-svc-fork/commit/4e74fc4f2f9165a48d75565ae5d0910b6b77dbaf)) - Add ph0rk0z as a contributor for bug, and code ([`8dc25c7`](https://github.com/voicepaw/so-vits-svc-fork/commit/8dc25c793a8a92985ac589b31cc863768a9ba6a7)) ### Features - Add batched loading to clustering & max length per clip to split ([`4179ec9`](https://github.com/voicepaw/so-vits-svc-fork/commit/4179ec9e1d1ac20cffc9e66f522b5f865828f7fe)) ## v4.0.3 (2023-06-25) ### Documentation - Add star3lord as a contributor for bug, and code ([`b3e2cfe`](https://github.com/voicepaw/so-vits-svc-fork/commit/b3e2cfe1294e7b64f76cd34c5b527a080ede2e87)) 
### Bug fixes - Pass str instead of path in sf.load() and sf.write() ([`561cbfe`](https://github.com/voicepaw/so-vits-svc-fork/commit/561cbfe64927371ea68c0be70b4bc5007f6514b4)) ## v4.0.2 (2023-06-14) ### Bug fixes - Fix typo in core.py ([`6a87d32`](https://github.com/voicepaw/so-vits-svc-fork/commit/6a87d323ec7716f09062e4846c31e58758a27e33)) ## v4.0.1 (2023-05-29) ### Bug fixes - Fix window scaling ([`9cd720c`](https://github.com/voicepaw/so-vits-svc-fork/commit/9cd720c60d7baa6a945610f674820e14c4833917)) ## v4.0.0 (2023-05-29) ### Features - Update pretrained model url, raise error if there are no files to preprocess, shuffle files consistently ([`c4c719c`](https://github.com/voicepaw/so-vits-svc-fork/commit/c4c719cdddd0e8f7703a02474208451729ab6d18)) - Update urls for pretrained models ([`c4c719c`](https://github.com/voicepaw/so-vits-svc-fork/commit/c4c719cdddd0e8f7703a02474208451729ab6d18)) ## v3.15.0 (2023-05-22) ### Features - Add gui command for module root entrypoint ([`3940a4c`](https://github.com/voicepaw/so-vits-svc-fork/commit/3940a4c0f51943dc3caec0832850f110b0f27961)) - Add gui command to __main__ ([`3940a4c`](https://github.com/voicepaw/so-vits-svc-fork/commit/3940a4c0f51943dc3caec0832850f110b0f27961)) - Add gui command to __main__ ([`3940a4c`](https://github.com/voicepaw/so-vits-svc-fork/commit/3940a4c0f51943dc3caec0832850f110b0f27961)) - Add gui cli command ([`3940a4c`](https://github.com/voicepaw/so-vits-svc-fork/commit/3940a4c0f51943dc3caec0832850f110b0f27961)) ## v3.14.1 (2023-05-07) ### Bug fixes - Replace pyinputplus with normal input ([`2b507da`](https://github.com/voicepaw/so-vits-svc-fork/commit/2b507da7da68f6baf00e5b0437d2d08e2d4f1246)) ## v3.14.0 (2023-05-06) ### Features - Add batch inference, enhance gui, add custom theme ([`3ce110b`](https://github.com/voicepaw/so-vits-svc-fork/commit/3ce110be72aa2c614f24249ee26f00cba03f16a8)) ## v3.13.3 (2023-05-06) ### Documentation - Add meldoner as a contributor for ideas, and code 
([`880fea8`](https://github.com/voicepaw/so-vits-svc-fork/commit/880fea84696938b6636332d8c5d88664adae4004)) ### Bug fixes - Complete removal of ckpts in colab ([`e8964c6`](https://github.com/voicepaw/so-vits-svc-fork/commit/e8964c604bba31a9a8fa0a27bb5ea72a49a5fa5b)) ## v3.13.2 (2023-05-06) ### Bug fixes - Always refresh output path if input path changed ([`f79de0c`](https://github.com/voicepaw/so-vits-svc-fork/commit/f79de0c81b6e748f8aa87ab94895c738f1808fcf)) ### Documentation - Fix minor issues in readme.md ([`139ed18`](https://github.com/voicepaw/so-vits-svc-fork/commit/139ed182a39a779d8cbdcefc8022a0ed7ff604cd)) - Add notes about minimum requirements ([`ae9aece`](https://github.com/voicepaw/so-vits-svc-fork/commit/ae9aece9529145ed76aec24febdc77c07522a110)) ## v3.13.1 (2023-05-04) ### Bug fixes - Remove filehandler to avoid permissionerror ([`38e0c4e`](https://github.com/voicepaw/so-vits-svc-fork/commit/38e0c4ed471c4520571a1585d868e325ea1a57e3)) ## v3.13.0 (2023-05-04) ### Documentation - Add maximxlss as a contributor for code ([`435ca3c`](https://github.com/voicepaw/so-vits-svc-fork/commit/435ca3c58ab48934622c3d192cc11fd130a4a6f7)) ### Features - Add max_chunk_seconds option ([`101b948`](https://github.com/voicepaw/so-vits-svc-fork/commit/101b9484a86cce634a71054e5b8110998566197b)) ## v3.12.1 (2023-04-30) ### Documentation - Add scorpi as a contributor for code ([`542d3a8`](https://github.com/voicepaw/so-vits-svc-fork/commit/542d3a8382d97064f13c1dcc4ba11107614dec3f)) ### Bug fixes - Fix epoch variable name to log in checkpoint save/load functions ([`0530ea3`](https://github.com/voicepaw/so-vits-svc-fork/commit/0530ea34fa42d9af51c73872b02d6453427c5a00)) ## v3.12.0 (2023-04-30) ### Features - Add pre-classify command to manually classify files ([`7a0319c`](https://github.com/voicepaw/so-vits-svc-fork/commit/7a0319c65f42b0cc54d1d86ae5945d4a356b507a)) ## v3.11.2 (2023-04-30) ### Bug fixes - Decouple lf0 predictor from speaker embeddings 
([`7ab47f4`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ab47f44e2ec77aa8c9e36b2e322d2dca0f94fb0)) ## v3.11.1 (2023-04-30) ### Documentation - Add highupech as a contributor for bug ([`8eedc24`](https://github.com/voicepaw/so-vits-svc-fork/commit/8eedc2439b6987f70c94033c3f375ea330498a64)) - Fix typo in readme.md ([`1773940`](https://github.com/voicepaw/so-vits-svc-fork/commit/1773940ae4a17a522ebc9fe6c1c70c3e02728341)) - Add acekagami as a contributor for translation ([`958b9fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/958b9fdf5fd1d527b63ac488ad21db2ce90539aa)) - Update readme.md [skip ci] ([`958b9fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/958b9fdf5fd1d527b63ac488ad21db2ce90539aa)) - Update .all-contributorsrc [skip ci] ([`958b9fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/958b9fdf5fd1d527b63ac488ad21db2ce90539aa)) - Update readme_zh_cn.md ([`1ccd594`](https://github.com/voicepaw/so-vits-svc-fork/commit/1ccd5941e5f17a273dad681301a287aafb7973d9)) ### Bug fixes - Specify encoding to utf-8 in read_text() and write_text() ([`e947336`](https://github.com/voicepaw/so-vits-svc-fork/commit/e94733678955430f4e0c8ee5a26627077c0ffad9)) ## v3.11.0 (2023-04-23) ### Documentation - Add alexanderkoumis as a contributor for code ([`5e032a3`](https://github.com/voicepaw/so-vits-svc-fork/commit/5e032a3e1eb36b0a7f99fd440b3e2a82f2345747)) - Update readme.md [skip ci] ([`5e032a3`](https://github.com/voicepaw/so-vits-svc-fork/commit/5e032a3e1eb36b0a7f99fd440b3e2a82f2345747)) - Update .all-contributorsrc [skip ci] ([`5e032a3`](https://github.com/voicepaw/so-vits-svc-fork/commit/5e032a3e1eb36b0a7f99fd440b3e2a82f2345747)) ### Features - Configurable output file (#452) ([`d2e3596`](https://github.com/voicepaw/so-vits-svc-fork/commit/d2e3596d5c0874918712488765e068f4010d62b9)) ## v3.10.5 (2023-04-22) ### Bug fixes - Fix so-vits-svc style contentvec usage 
([`6d35139`](https://github.com/voicepaw/so-vits-svc-fork/commit/6d351390354b17a2cd004bc9572d7dc1202f236c)) ## v3.10.4 (2023-04-21) ### Bug fixes - Only save checkpoints on main device ([`1aaaac6`](https://github.com/voicepaw/so-vits-svc-fork/commit/1aaaac6328476249371799b92ced3edcbaac8d18)) ### Documentation - Add sbersier as a contributor for bug ([`58b936d`](https://github.com/voicepaw/so-vits-svc-fork/commit/58b936d669fbf5156f1ae1381393762994dd7414)) - Add escoolioinglesias as a contributor for video ([`69f097f`](https://github.com/voicepaw/so-vits-svc-fork/commit/69f097f388447d64b7807cf554a5c310c34b7ef0)) - Add garrettconway as a contributor for review ([`c1e4ada`](https://github.com/voicepaw/so-vits-svc-fork/commit/c1e4ada97739bf0b360295335475fef7029fbe49)) - Add blueamulet as a contributor for maintenance ([`514ed84`](https://github.com/voicepaw/so-vits-svc-fork/commit/514ed84ffda901243c1bd6f39677eb020257f11f)) - Add guranon as a contributor for bug, ideas, and code ([`b9eb3fd`](https://github.com/voicepaw/so-vits-svc-fork/commit/b9eb3fdc350588b9528a74d5b7be8e80b2bfbd51)) - Add zerohackz as a contributor for bug, and code ([`66d5adc`](https://github.com/voicepaw/so-vits-svc-fork/commit/66d5adcf6dbb60fd6b6800162e3e16570a8dac1c)) - Add tybantarnusa as a contributor for bug ([`e6e57b3`](https://github.com/voicepaw/so-vits-svc-fork/commit/e6e57b3e0d97ac91cadde45d5f080ced873df959)) - Add blacksingh as a contributor for bug ([`7bc76ba`](https://github.com/voicepaw/so-vits-svc-fork/commit/7bc76ba9355089ab94fce9231f5dbbdd54e849ee)) - Add escoolioinglesias as a contributor for bug, and usertesting ([`f00fe6e`](https://github.com/voicepaw/so-vits-svc-fork/commit/f00fe6e15cd12085cd01ae3c2676c195e7924429)) - Add outhipped as a contributor for bug ([`7497175`](https://github.com/voicepaw/so-vits-svc-fork/commit/74971752821a852154bbfc35c318bb05e7b1169c)) - Add yxlllc as a contributor for ideas, and code 
([`42e35d2`](https://github.com/voicepaw/so-vits-svc-fork/commit/42e35d2a1f83be25e3fb0318e694163b0e936c59)) - Add lordmau5 as a contributor for ideas, maintenance, and 2 more ([`352451c`](https://github.com/voicepaw/so-vits-svc-fork/commit/352451ccc9c1e1f800dc7697d5c705c0b9707c96)) - Add tonyco82 as a contributor for bug ([`036ce90`](https://github.com/voicepaw/so-vits-svc-fork/commit/036ce9052f145cf047434d472f775b563e503946)) - Add 75aosu as a contributor for bug ([`5afc28b`](https://github.com/voicepaw/so-vits-svc-fork/commit/5afc28bf918e1a62343f445a72487c1d932dc7b4)) - Add hxl9654 as a contributor for bug ([`0953f1f`](https://github.com/voicepaw/so-vits-svc-fork/commit/0953f1fd0dfbfa557f639eb8d917805f8891d7b0)) - Add ducttapegames as a contributor for bug ([`b0f4d39`](https://github.com/voicepaw/so-vits-svc-fork/commit/b0f4d39371ed2913ad792a46754469eb68c8c72d)) - Add likkkez as a contributor for bug ([`4a12109`](https://github.com/voicepaw/so-vits-svc-fork/commit/4a12109b6a0b3cd2741f10d6e9027204603b0f27)) - Add alondan as a contributor for bug ([`662ec4b`](https://github.com/voicepaw/so-vits-svc-fork/commit/662ec4b39816b1a1311d56e3edaca31fb442bb8d)) - Add mmodeusher as a contributor for bug ([`6a78df9`](https://github.com/voicepaw/so-vits-svc-fork/commit/6a78df97d8191b62a04c9ec48b74cf1f00e47c30)) - Add meldoner as a contributor for bug ([`5586bec`](https://github.com/voicepaw/so-vits-svc-fork/commit/5586becd35b456523cec1e1aa8c601cd1039dd1c)) ## v3.10.3 (2023-04-19) ### Bug fixes - Don't save model when tuning for auto batch size ([`2311a35`](https://github.com/voicepaw/so-vits-svc-fork/commit/2311a35c36315123c87b7f20dde3c4dda723bea3)) ## v3.10.2 (2023-04-19) ### Bug fixes - Properly stop training after `epochs` has been reached ([`f9bb3d8`](https://github.com/voicepaw/so-vits-svc-fork/commit/f9bb3d86605321288f11387bc853143378c3284e)) ## v3.10.1 (2023-04-19) ### Bug fixes - Support ddp in windows (gloo backend) 
([`bcb0507`](https://github.com/voicepaw/so-vits-svc-fork/commit/bcb05078d8ca7a6ac681de919552b3a190b2cd9b)) ## v3.10.0 (2023-04-18) ### Features - Replace `fairseq` with `transformers` ([`a2fe0f3`](https://github.com/voicepaw/so-vits-svc-fork/commit/a2fe0f376d33f02987c91a57bd90a794de90a0e1)) ## v3.9.5 (2023-04-18) ### Bug fixes - Set persistent_workers = true in dataloader for performance, do not save checkpoints, fix logging issue and multiple warning issues, do not do validation when global_step == 0 ([`6cab9af`](https://github.com/voicepaw/so-vits-svc-fork/commit/6cab9af86e3a96e79243fa890eb1c6c51fae4476)) ## v3.9.4 (2023-04-18) ### Bug fixes - Always use "spawn" context in processpool ([`5d7fb77`](https://github.com/voicepaw/so-vits-svc-fork/commit/5d7fb774e8d5e97a9a31dbc891892e9f934f3884)) ## v3.9.3 (2023-04-16) ### Bug fixes - Fix subprocess errors in linux and fix wrong error logging ([`fd67db6`](https://github.com/voicepaw/so-vits-svc-fork/commit/fd67db6312944557c09afd7b1ccbb97987a03489)) ## v3.9.2 (2023-04-16) ### Bug fixes - Fix y_mel length ([`2d71992`](https://github.com/voicepaw/so-vits-svc-fork/commit/2d71992d80ba4142d2d5a5df17c69c2f2ac553fd)) ## v3.9.1 (2023-04-16) ### Bug fixes - Allow higher segment size ([`09d5a52`](https://github.com/voicepaw/so-vits-svc-fork/commit/09d5a52b9bfc8eba8857f2b6c804ecdb39b4b38b)) - Do not use weights_only in get_cluster_model() ([`24c05d1`](https://github.com/voicepaw/so-vits-svc-fork/commit/24c05d16c3b55f664699400496a7e0fd2fd84353)) ## v3.9.0 (2023-04-16) ### Features - Add option to name ckpts by epochs ([`bba24c4`](https://github.com/voicepaw/so-vits-svc-fork/commit/bba24c4a62b935ed29572aa2c2c437d1b54aa2e2)) ## v3.8.1 (2023-04-16) ### Bug fixes - Patch stft and add mps to get_optimal_device() ([`da928aa`](https://github.com/voicepaw/so-vits-svc-fork/commit/da928aa0bb1399bf5780526f8a7e9b674476a000)) ## v3.8.0 (2023-04-15) ### Features - Automatically decide batch_size 
([`8ffa128`](https://github.com/voicepaw/so-vits-svc-fork/commit/8ffa128aa209787fde8fb1f0e4ae5c96dfe31217)) ## v3.7.3 (2023-04-15) ### Bug fixes - Show errors raised in inference ([`99833c5`](https://github.com/voicepaw/so-vits-svc-fork/commit/99833c55045647b9a766042765b454cb3d7d18ce)) ## v3.7.2 (2023-04-15) ### Bug fixes - Suppress pytorch logs for deprecated typedstorage ([`e67ac62`](https://github.com/voicepaw/so-vits-svc-fork/commit/e67ac621296cf6667d05b51f23ce8cb9ef8a0855)) ## v3.7.1 (2023-04-15) ### Bug fixes - Fix check for notebook / colab ([`7f69814`](https://github.com/voicepaw/so-vits-svc-fork/commit/7f698141e1b65e901579a5dbbabf28bfae5cc91f)) ## v3.7.0 (2023-04-14) ### Features - Add option to specify tensorboardlogger version parameter support ([`a685123`](https://github.com/voicepaw/so-vits-svc-fork/commit/a685123a4063e08e0b021a1ad51098d3154b75de)) ## v3.6.2 (2023-04-14) ### Bug fixes - Fix torch.load and save to use file objects and weights_only and remove unidecode ([`4aad701`](https://github.com/voicepaw/so-vits-svc-fork/commit/4aad701badc1eae5195e874dec40f9ed8dd40ee6)) ## v3.6.1 (2023-04-14) ### Bug fixes - Fix gradient logging ([`73ef3dc`](https://github.com/voicepaw/so-vits-svc-fork/commit/73ef3dc94ccd4c0514ab33b0c5a65edf8b356484)) ## v3.6.0 (2023-04-13) ### Features - Support sola algorithm ([`0fcbf99`](https://github.com/voicepaw/so-vits-svc-fork/commit/0fcbf9979862e945ca2427612a92549db2d627d0)) ## v3.5.1 (2023-04-13) ### Bug fixes - Do not use rich in notebook ([`03c8240`](https://github.com/voicepaw/so-vits-svc-fork/commit/03c824015872e3d7e4e5795b9d65fad4116d54e4)) ## v3.5.0 (2023-04-13) ### Features - Run inference in thread and disable button ([`c55caa8`](https://github.com/voicepaw/so-vits-svc-fork/commit/c55caa8019cc06fc6bd8851b0fd895b73cf926a4)) ## v3.4.0 (2023-04-13) ### Features - Make num_workers configurable ([`e8df714`](https://github.com/voicepaw/so-vits-svc-fork/commit/e8df7146b0d1d3ee32af576c251f47d8fdd80bb3)) ## v3.3.1 
(2023-04-13) ### Performance improvements - Specify num_workers in dataloader ([`6042164`](https://github.com/voicepaw/so-vits-svc-fork/commit/6042164a60f9990eb0636e37dd650bb0cdff032b)) ## v3.3.0 (2023-04-13) ### Features - Use richprogressbar ([`17e937a`](https://github.com/voicepaw/so-vits-svc-fork/commit/17e937aae9c90b513e4b7674f442a60161c84e83)) ## v3.2.0 (2023-04-13) ### Features - Add optional `accumulate_grad_batches` config param ([`1172b23`](https://github.com/voicepaw/so-vits-svc-fork/commit/1172b2385cfe5239da3222cf93916436395e0f1a)) - Add accumulate_grad_batches hparam ([`1172b23`](https://github.com/voicepaw/so-vits-svc-fork/commit/1172b2385cfe5239da3222cf93916436395e0f1a)) ### Bug fixes - Normalize loss when using gradient accumulation ([`1172b23`](https://github.com/voicepaw/so-vits-svc-fork/commit/1172b2385cfe5239da3222cf93916436395e0f1a)) ## v3.1.13 (2023-04-12) ### Bug fixes - Fix too noisy logger ([`bd0eb33`](https://github.com/voicepaw/so-vits-svc-fork/commit/bd0eb33a66d77afff8328d08008f2643651c712a)) - Fix cli() not called in __main__ ([`11f2d24`](https://github.com/voicepaw/so-vits-svc-fork/commit/11f2d245137da240f5e8214e4b6ce4330d726143)) ## v3.1.12 (2023-04-12) ### Bug fixes - Fix ddp not working ([`bec43fc`](https://github.com/voicepaw/so-vits-svc-fork/commit/bec43fcbedf6b16260411655b19cf780ddbafe8e)) ## v3.1.11 (2023-04-12) ### Bug fixes - Fix init_logger not showing debug messages in certain conditions as intended ([`d3ab7d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3ab7d667c391ba1d8d1b34e2b66992256b3989d)) ## v3.1.10 (2023-04-11) ### Bug fixes - Improves inference ([`d3228df`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3228df704b2e6a0746b3f842ca5f2240890d829)) - Improves and nb_clean ([`d3228df`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3228df704b2e6a0746b3f842ca5f2240890d829)) - Improves inference ([`d3228df`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3228df704b2e6a0746b3f842ca5f2240890d829)) 
- Improves inference ([`d3228df`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3228df704b2e6a0746b3f842ca5f2240890d829)) - Unix formatting ([`d3228df`](https://github.com/voicepaw/so-vits-svc-fork/commit/d3228df704b2e6a0746b3f842ca5f2240890d829)) - Step lr schedulers at end of epoch ([`3af223e`](https://github.com/voicepaw/so-vits-svc-fork/commit/3af223eeb5146abcbb8198d4c11e2c1895ece130)) ## v3.1.9 (2023-04-10) ### Bug fixes - Fix fp16_run not being mix precision and fix bf16 errors ([`b0dd0ed`](https://github.com/voicepaw/so-vits-svc-fork/commit/b0dd0ed4014d32e9f19e335ec603bdab92c52039)) ## v3.1.8 (2023-04-10) ### Bug fixes - Fix wrong commands in "before training" ([`e056ad9`](https://github.com/voicepaw/so-vits-svc-fork/commit/e056ad9ec22cbaa119f7c93cb60b5b8851e80a7e)) ## v3.1.7 (2023-04-09) ### Bug fixes - Improve quality of training ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) - Initialize `_temp_epoch` variable ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) - Fix order of optimizer as per lightning.ai documentation ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) - Remove `with torch.no_grad():` call for generator loss ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) - Ensure `log_audio_dict` uses correct `total_batch_idx` ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) - Only save checkpoints for first `batch_idx` ([`7ed71d6`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ed71d6fd89ca8bf2c4aefbb280e705b1d7ae6b8)) ## v3.1.6 (2023-04-09) ### Bug fixes - Fix checkpoint not properly loaded ([`0979147`](https://github.com/voicepaw/so-vits-svc-fork/commit/0979147a234e08999a19dba4988a53886f61dade)) ## v3.1.5 (2023-04-09) ### Bug fixes - Fix optim_d 
functions called in wrong order ([`13d6346`](https://github.com/voicepaw/so-vits-svc-fork/commit/13d63469b0a84ace0dc8848df47dc20538b98770)) ## v3.1.4 (2023-04-09) ### Bug fixes - Add bf16 and fp16 support ([`4229fd8`](https://github.com/voicepaw/so-vits-svc-fork/commit/4229fd8ead64cf03caad9acd3d8f7f0fec3a7fee)) ## v3.1.3 (2023-04-09) ### Bug fixes - Update dependency starlette to v0.26.1 ([`5eb574b`](https://github.com/voicepaw/so-vits-svc-fork/commit/5eb574bec01430399df48e90e6112cef85e21945)) ## v3.1.2 (2023-04-09) ### Bug fixes - Remove wrong test and trigger release ([`9ea77e4`](https://github.com/voicepaw/so-vits-svc-fork/commit/9ea77e4c5c6575844685998e237994d54be84bb9)) - Remove pydantic constraints ([`f446e3b`](https://github.com/voicepaw/so-vits-svc-fork/commit/f446e3bbd62205b9c847e9ecdc46f519417b572a)) - Fix fastapi version to 0.88 ([`a26f387`](https://github.com/voicepaw/so-vits-svc-fork/commit/a26f387abea585c300cd1ed0c36c6b9afc731764)) - Fix get_optimal_device ([`79e4b5a`](https://github.com/voicepaw/so-vits-svc-fork/commit/79e4b5a0abe20789335eaaf4a359880c099aaa35)) ## v3.1.1 (2023-04-08) ### Bug fixes - Update dependency fastapi to <0.96 ([`29c8cc0`](https://github.com/voicepaw/so-vits-svc-fork/commit/29c8cc05b7e5180058e03f2dc1f681e58cc67f09)) ## v3.1.0 (2023-04-08) ### Features - Migrate to lightning ([`824ecbd`](https://github.com/voicepaw/so-vits-svc-fork/commit/824ecbd7222b9b9ada77c4fbbd7ae7f491049f21)) ## v3.0.5 (2023-04-08) ### Bug fixes - Fix train_cluster ([`b0c93e4`](https://github.com/voicepaw/so-vits-svc-fork/commit/b0c93e49f9cdfdcd714575fc27011bf56ce4493d)) ## v3.0.4 (2023-04-06) ### Bug fixes - Fix default config type to revert breaking changes ([`e05c0b5`](https://github.com/voicepaw/so-vits-svc-fork/commit/e05c0b52b6affac5e4483c0938e04584e1bd8d98)) ## v3.0.3 (2023-04-05) ### Bug fixes - Fix issues when loading legacy checkpoint and fix pre-hubert n_jobs 
([`15f1e7f`](https://github.com/voicepaw/so-vits-svc-fork/commit/15f1e7ffca80cb551316affae546ea72e8cccb34)) ## v3.0.2 (2023-04-04) ### Performance improvements - Move methods from dataloader to pre-hubert ([`d5a4456`](https://github.com/voicepaw/so-vits-svc-fork/commit/d5a4456ebd5b6659ca037ee2f43480a00d7915f6)) ## v3.0.1 (2023-04-03) ### Bug fixes - Remove possible leak in unused code ([`e921c3d`](https://github.com/voicepaw/so-vits-svc-fork/commit/e921c3dc018ea783b4c26375a04f499a45ad9df0)) ### Performance improvements - Better implementation of repeat_expand_2d ([`ef30a9d`](https://github.com/voicepaw/so-vits-svc-fork/commit/ef30a9d5ae60fdde5f6b44d6cea8cee0a40dd3e9)) ## v3.0.0 (2023-04-03) ### Features - Add quickvc, fix usage of contentvec, remove onnx support ([`1a6c021`](https://github.com/voicepaw/so-vits-svc-fork/commit/1a6c021cd102b48b44e006decebc165062df8a95)) ### Documentation - Update allcontributors link for @mashirosa ([`650524b`](https://github.com/voicepaw/so-vits-svc-fork/commit/650524bb37997326e924814632c6202b76660f77)) - Add paperspace referral ([`7280012`](https://github.com/voicepaw/so-vits-svc-fork/commit/7280012df66b5ea71291e5a80bb22451f0ca236e)) - Add paperspace link and add more description, add a link for zh-cn docs ([`bc4b122`](https://github.com/voicepaw/so-vits-svc-fork/commit/bc4b1229e4ad9c046fda38334c4c6d22548356c2)) ## v2.1.5 (2023-04-01) ### Bug fixes - Update dependency tensorboard to v2.12.1 ([`0ccda1c`](https://github.com/voicepaw/so-vits-svc-fork/commit/0ccda1ccb34b8125abe369f738b06de7b77c8efc)) ## v2.1.4 (2023-03-31) ### Bug fixes - Update dependency gradio to v3.24.1 ([`4fa141b`](https://github.com/voicepaw/so-vits-svc-fork/commit/4fa141b210cb9b80bc7f75176fb01b18352c91cd)) ## v2.1.3 (2023-03-31) ### Bug fixes - Update dependency gradio to v3.24.0 ([`4e441cb`](https://github.com/voicepaw/so-vits-svc-fork/commit/4e441cb30429e4a47afd261d69e32ec5f86564c9)) ### Documentation - Add sbersier as a contributor for ideas, and usertesting 
([`a655bf4`](https://github.com/voicepaw/so-vits-svc-fork/commit/a655bf47dde4ad2506283997987bce3a09229c57)) - Add coldcawfee as a contributor for bug ([`87a09e6`](https://github.com/voicepaw/so-vits-svc-fork/commit/87a09e654a0e8f064293750779b743abf2897ebb)) ## v2.1.2 (2023-03-28) ### Bug fixes - Fix wrong devices set as default ([`6265f8f`](https://github.com/voicepaw/so-vits-svc-fork/commit/6265f8f93e8facd4f58aab906bfcb23e05d4032b)) - Fix -h option overridden ([`52f1cfe`](https://github.com/voicepaw/so-vits-svc-fork/commit/52f1cfe1f08bd63966b0d1d7c025abed17cb36a6)) ### Documentation - Add xieyumc as a contributor for doc ([`29474d9`](https://github.com/voicepaw/so-vits-svc-fork/commit/29474d9dc77555fe5a55427278d44dfea7ece5ef)) - Update readme_zh_cn.md ([`f94a14c`](https://github.com/voicepaw/so-vits-svc-fork/commit/f94a14cb63e2afd40cba3e94f84077643d9a7560)) ## v2.1.1 (2023-03-27) ### Bug fixes - Update dependency rich to v13.3.3 ([`8bdefa9`](https://github.com/voicepaw/so-vits-svc-fork/commit/8bdefa9636e13fb0a24058a589675a20655357f4)) ### Documentation - Add nerdyrodent as a contributor for video ([`78ab661`](https://github.com/voicepaw/so-vits-svc-fork/commit/78ab661af198d87ce2ca5525fa262c639ed03cdc)) - Add heyfixit as a contributor for doc ([`32a2a63`](https://github.com/voicepaw/so-vits-svc-fork/commit/32a2a63b375300be6d67be56035005956003bdfd)) - Add desuka-art as a contributor for bug ([`fe3c6bf`](https://github.com/voicepaw/so-vits-svc-fork/commit/fe3c6bf8270fc219cdaeef05b7deacdbfc4df313)) - Add ruckusmattster as a contributor for bug ([`2b971db`](https://github.com/voicepaw/so-vits-svc-fork/commit/2b971db5c7a332c8321e99bd77bb956a0ee3ec88)) - Add pierluigizagaria as a contributor for usertesting ([`6fabe8d`](https://github.com/voicepaw/so-vits-svc-fork/commit/6fabe8d10b684caa236331a157455db1da686f8f)) - Add satisfy256 as a contributor for bug ([`ee72aee`](https://github.com/voicepaw/so-vits-svc-fork/commit/ee72aee12f23fee458599b8b7fa4f0ed27d33b1c)) - Add 
dl909 as a contributor for bug ([`a5e6651`](https://github.com/voicepaw/so-vits-svc-fork/commit/a5e6651a8f537961caf53adbb8bc52c1412c0762)) ## v2.1.0 (2023-03-27) ### Features - Add an option to launch tensorboard in `train` command ([`ef22cce`](https://github.com/voicepaw/so-vits-svc-fork/commit/ef22cceaeb7f06ea53b2151ef9c962d1040de20d)) ## v2.0.0 (2023-03-27) ### Bug fixes - Fix preprocessing and convert bool options to flags, use `unidecode` to decode non-ascii filenames in `pre-resample` ([`98d7ee2`](https://github.com/voicepaw/so-vits-svc-fork/commit/98d7ee22a40104468285324cc6ec21c707c30d54)) ### Documentation - Add yt tutorial vid link ([`1694f44`](https://github.com/voicepaw/so-vits-svc-fork/commit/1694f449e5a9f7b9da71e9a4c2764830c5268de3)) ## v1.4.3 (2023-03-26) ### Performance improvements - Specify samplerate to reduce memory usage ([`6217eda`](https://github.com/voicepaw/so-vits-svc-fork/commit/6217eda0ec3bac27e408fcd0466a6b658cf718c5)) ## v1.4.2 (2023-03-26) ### Bug fixes - Initialize logging in logger file and move version log ([`441d51f`](https://github.com/voicepaw/so-vits-svc-fork/commit/441d51f8efa84144d8a9f8fa02f2adaaf15295c0)) - Fix dtype in sf.read() to save memory and fix preprocess_resample ([`0af1e13`](https://github.com/voicepaw/so-vits-svc-fork/commit/0af1e13a468ad282266a595b8d3c77d62aa938dc)) - Fix audio resampled to 22khz ([`4203f37`](https://github.com/voicepaw/so-vits-svc-fork/commit/4203f374c5625369518063888e1ca70d1af4f694)) ### Documentation - Update notebook and readme.md ([`38d9744`](https://github.com/voicepaw/so-vits-svc-fork/commit/38d97449d5b443167926f409f904f4b40c6e0f03)) ## v1.4.1 (2023-03-26) ### Bug fixes - Fix some parameters not passed ([`6cfe3d3`](https://github.com/voicepaw/so-vits-svc-fork/commit/6cfe3d3f567c03e1c59065ff827f564a13a7aaaf)) ## v1.4.0 (2023-03-26) ### Features - Add 2 more preprocessing commands ([`45eba0f`](https://github.com/voicepaw/so-vits-svc-fork/commit/45eba0f25db1346757fcd9134ccb3a62125a05a9)) ### 
Documentation - Add blueamulet as a contributor for code ([`6a7e8ba`](https://github.com/voicepaw/so-vits-svc-fork/commit/6a7e8ba827ee69f1ceca60b83dfbae437bbe6667)) ## v1.3.5 (2023-03-26) ### Bug fixes - Allow float32 audio to be processed properly ([`13943b6`](https://github.com/voicepaw/so-vits-svc-fork/commit/13943b693d177cf5417127647a3280a9e5ff9ca5)) ## v1.3.4 (2023-03-25) ### Bug fixes - Change default f0 method from crepe to dio ([`baf58d2`](https://github.com/voicepaw/so-vits-svc-fork/commit/baf58d286c286c0064fd015e0e8f0b9e690021f7)) ## v1.3.3 (2023-03-25) ### Documentation - Add lordmau5 as a contributor for bug, and code ([`4df46ee`](https://github.com/voicepaw/so-vits-svc-fork/commit/4df46eed47378f41d76c7637f540779db3bb54a3)) - Update readme.md [skip ci] ([`4df46ee`](https://github.com/voicepaw/so-vits-svc-fork/commit/4df46eed47378f41d76c7637f540779db3bb54a3)) - Update .all-contributorsrc [skip ci] ([`4df46ee`](https://github.com/voicepaw/so-vits-svc-fork/commit/4df46eed47378f41d76c7637f540779db3bb54a3)) ### Bug fixes - Fix old checkpoint deletion by sorting the models properly (#65) ([`287dc94`](https://github.com/voicepaw/so-vits-svc-fork/commit/287dc94be719147023af0ecfe7e92b16a8e98fc5)) ## v1.3.2 (2023-03-24) ### Bug fixes - Fix devices list and fix tqdm error in gui ([`59724cd`](https://github.com/voicepaw/so-vits-svc-fork/commit/59724cd2afc6a8d5ef6ea4b7fa8c012e21fc4af6)) ### Documentation - Add mashirosa as a contributor for doc, and bug ([`495b7cb`](https://github.com/voicepaw/so-vits-svc-fork/commit/495b7cbfc9f9468d49bc3f57efe6c5c076dcb0d3)) - Fix cluster inference command and improve cluster training command ([`7642594`](https://github.com/voicepaw/so-vits-svc-fork/commit/7642594472bd660fe046c45909f0475398af199e)) ## v1.3.1 (2023-03-24) ### Bug fixes - Fix defaut for auto_play ([`07920a4`](https://github.com/voicepaw/so-vits-svc-fork/commit/07920a4954e1a14d47fcb2687f050d49d03da415)) - Fix speaker not automaticlly set to the first one if not found 
in cluster inference ([`a643e4f`](https://github.com/voicepaw/so-vits-svc-fork/commit/a643e4f26b59f12f00b316467edad876467dad49)) ### Documentation - Add cluster training and inference ([`9ffb621`](https://github.com/voicepaw/so-vits-svc-fork/commit/9ffb6216f418d8c5a4a9f1bdd79fc2cebb885db1)) ## v1.3.0 (2023-03-23) ### Features - Better error handling ([`985704b`](https://github.com/voicepaw/so-vits-svc-fork/commit/985704b1afa8af15fe8eab5e3fc838465f5162c8)) ## v1.2.11 (2023-03-23) ### Bug fixes - Fix onnx export and fix gui ([`3e9a47d`](https://github.com/voicepaw/so-vits-svc-fork/commit/3e9a47dd4faa938a6aaebf2d7c1c0b9d68cc97d3)) ## v1.2.10 (2023-03-23) ### Bug fixes - Fix cluster not working ([`29b209c`](https://github.com/voicepaw/so-vits-svc-fork/commit/29b209cf7060deb7f15ae28fe2e520bb20a236f4)) ## v1.2.9 (2023-03-23) ### Bug fixes - Fix speakers and devices not updated and fix default presets ([`a851150`](https://github.com/voicepaw/so-vits-svc-fork/commit/a8511508b0d2b3a62e7b77833280e4264997d9ed)) ## v1.2.8 (2023-03-22) ### Bug fixes - Update dependency torchcrepe to v0.0.18 ([`4fda479`](https://github.com/voicepaw/so-vits-svc-fork/commit/4fda4799f017e7de57de36c95cd8d64ab6f9b446)) ### Documentation - Shorten docs ([`e0c1572`](https://github.com/voicepaw/so-vits-svc-fork/commit/e0c1572d057032735c3118e9137be8e4399c6251)) ## v1.2.7 (2023-03-22) ### Bug fixes - Fix clean_checkpoints ([`e5169bf`](https://github.com/voicepaw/so-vits-svc-fork/commit/e5169bf8121578a6cc3ed1bccd1b47a6281cafe4)) ## v1.2.6 (2023-03-22) ### Documentation - Add blueamulet as a contributor for question ([`8d073e3`](https://github.com/voicepaw/so-vits-svc-fork/commit/8d073e3e0798a0739cea5b979cf6cfd361f3e6d3)) - Add garrettconway as a contributor for doc ([`6c6cbc6`](https://github.com/voicepaw/so-vits-svc-fork/commit/6c6cbc6ac8a97ecb71d789a5782bb8db2c4c52f8)) - Update readme.md regarding installation, update. 
wsl audio support ([`4f1323b`](https://github.com/voicepaw/so-vits-svc-fork/commit/4f1323b3d12a080f38a195bf494db7086dbfa7e4)) ### Bug fixes - Disable checkbox if cuda is not available and show errors for vc ([`3fdd983`](https://github.com/voicepaw/so-vits-svc-fork/commit/3fdd9836c3b60d2e737fc7e40efe42a9cc84888e)) ## v1.2.5 (2023-03-22) ### Bug fixes - Fix rtf calculation ([`fb25500`](https://github.com/voicepaw/so-vits-svc-fork/commit/fb25500f4e3e70e5d71462715b83fb3bedcf8bd5)) ## v1.2.4 (2023-03-22) ### Bug fixes - Fix latest_checkpoint_path ([`00b9f4a`](https://github.com/voicepaw/so-vits-svc-fork/commit/00b9f4acd005cdb801b3f41df6e25b0b8799d631)) ## v1.2.3 (2023-03-21) ### Bug fixes - Update dependency onnxsim to v0.4.19 ([`f8a4cf6`](https://github.com/voicepaw/so-vits-svc-fork/commit/f8a4cf61bad5d0d55a7334af8f022114605e7038)) ## v1.2.2 (2023-03-21) ### Bug fixes - Update dependency onnxoptimizer to v0.3.10 ([`d0137f9`](https://github.com/voicepaw/so-vits-svc-fork/commit/d0137f920083a08173d58e35492b9b9fb925e41f)) ### Documentation - Add links for pretrained models and fix gui pic height ([`34ac39f`](https://github.com/voicepaw/so-vits-svc-fork/commit/34ac39f0c9ce89f2effdd18f3fc4ab91e72b3f82)) - Add more explanation to notebook ([`9b3c483`](https://github.com/voicepaw/so-vits-svc-fork/commit/9b3c4835e063d26d1e66d172cf592e69e30d59b8)) ## v1.2.1 (2023-03-21) ### Bug fixes - Use librosa.load() instead of soundfile.read() ([`b343106`](https://github.com/voicepaw/so-vits-svc-fork/commit/b34310662b2bac53884df396932f72366132ea01)) - Fix window too big to show in a fhd environment ([`259e6e6`](https://github.com/voicepaw/so-vits-svc-fork/commit/259e6e6eb6ebfd9027b1813756d67d1a516e0214)) ## v1.2.0 (2023-03-21) ### Features - Add presets ([`e8adcc6`](https://github.com/voicepaw/so-vits-svc-fork/commit/e8adcc621f6caf5f4b20846575b3559c032ed47f)) ## v1.1.1 (2023-03-21) ### Bug fixes - Update dependency gradio to v3.23.0 
([`a2bdb48`](https://github.com/voicepaw/so-vits-svc-fork/commit/a2bdb48b436d206b30bb72409852c0b30d6811e9)) ## v1.1.0 (2023-03-21) ### Documentation - Update gui screenshot ([`58d06aa`](https://github.com/voicepaw/so-vits-svc-fork/commit/58d06aa7460dd75ef793da295bf7651ae9940814)) ### Features - Enhance realtimevc ([`81551ce`](https://github.com/voicepaw/so-vits-svc-fork/commit/81551ce9c6fb7924d184c3c5a4cf9035168b28d2)) ## v1.0.2 (2023-03-21) ### Bug fixes - Update dependency scipy to v1.10.1 ([`e0253bf`](https://github.com/voicepaw/so-vits-svc-fork/commit/e0253bf1e655f86be605395a18f343763d975101)) ## v1.0.1 (2023-03-20) ### Documentation - Add throwawayaccount01 as a contributor for bug ([`15e31fa`](https://github.com/voicepaw/so-vits-svc-fork/commit/15e31fa806249d45235918fa62a48a86c43538cb)) - Add blueamulet as a contributor for ideas ([`a3bcb2b`](https://github.com/voicepaw/so-vits-svc-fork/commit/a3bcb2be2992c98bcc2485082c19009c74cb3194)) ### Performance improvements - Do dummy inference before running vc ([`4066c43`](https://github.com/voicepaw/so-vits-svc-fork/commit/4066c4334b107062d2daa7c9dc00600a56c6e553)) ## v1.0.0 (2023-03-20) ### Bug fixes - Fix default dataset path ([`ac47fed`](https://github.com/voicepaw/so-vits-svc-fork/commit/ac47fede2581d375c2be9c28102961f19f5a9aa1)) ## v0.8.2 (2023-03-20) ### Bug fixes - Fix compute_f0_crepe returning wrong length ([`afb42b0`](https://github.com/voicepaw/so-vits-svc-fork/commit/afb42b019ccd133876a2c55cf01007950a733d8c)) ## v0.8.1 (2023-03-20) ### Bug fixes - Update dependency librosa to v0.10.0 ([`8e92f71`](https://github.com/voicepaw/so-vits-svc-fork/commit/8e92f71b2820628f0f8583e6bc455d8f753f4302)) ## v0.8.0 (2023-03-20) ### Features - Add more f0 calculation methods ([`6b3b20d`](https://github.com/voicepaw/so-vits-svc-fork/commit/6b3b20dfd609d81cb1184b7c8e8865a58f8d45f9)) ## v0.7.1 (2023-03-20) ### Bug fixes - Update dependency gradio to v3.22.1 
([`f09fc23`](https://github.com/voicepaw/so-vits-svc-fork/commit/f09fc23ca82519cc095509d4d4760561424a17ec)) ### Features - Allow nested dataset ([`0433151`](https://github.com/voicepaw/so-vits-svc-fork/commit/0433151d94c4da8e84a0183bdd47f1e08ea3c462)) ## v0.6.3 (2023-03-20) ### Bug fixes - Update dependency torch to v1.13.1 ([`8826d68`](https://github.com/voicepaw/so-vits-svc-fork/commit/8826d6870e223e7969baa069bf12235e0deec0b7)) - Update dependency torchaudio to v0.13.1 ([`989f5d9`](https://github.com/voicepaw/so-vits-svc-fork/commit/989f5d903b47ba9b0ea1d0fe37cbfe76edf0a811)) ### Documentation - Update notes about vram caps ([`0a245f4`](https://github.com/voicepaw/so-vits-svc-fork/commit/0a245f4ee69bd0d4371836367becf0fe409431e2)) ## v0.6.2 (2023-03-19) ### Documentation - Add garrettconway as a contributor for bug ([`31d9671`](https://github.com/voicepaw/so-vits-svc-fork/commit/31d9671207143fd06b8db148802d1e27874151ce)) - Launch tensorboard ([`52229ba`](https://github.com/voicepaw/so-vits-svc-fork/commit/52229ba0fe9458e37b45287c0a716c7cd36adbd6)) - Add 34j as a contributor for example, infra, and 6 more ([`1b90378`](https://github.com/voicepaw/so-vits-svc-fork/commit/1b903783b4b89f2f5a4fc2e1b47f3eade0c0402f)) - Add garrettconway as a contributor for code ([`716813f`](https://github.com/voicepaw/so-vits-svc-fork/commit/716813fbff85ab4609d8ec3f374b78c6551877e5)) ### Bug fixes - Use hubert preprocess force_rebuild argument ([`87cf807`](https://github.com/voicepaw/so-vits-svc-fork/commit/87cf807496248e2c7b859069f81aa040e86aec59)) ## v0.6.1 (2023-03-19) ### Performance improvements - Better performance ([`668c8e1`](https://github.com/voicepaw/so-vits-svc-fork/commit/668c8e1f18cefb0ebd2fb2f1d6572ce4d37d1102)) ## v0.6.0 (2023-03-18) ### Features - Configurable input and output devices ([`a822a60`](https://github.com/voicepaw/so-vits-svc-fork/commit/a822a6098d322ff37725eee19d17758f72a6db49)) ### Documentation - Fix notebook 
([`427b4c1`](https://github.com/voicepaw/so-vits-svc-fork/commit/427b4c1c6e0482345b17fedb018f7a18db68ccc5)) - Update notebook ([`ae3e471`](https://github.com/voicepaw/so-vits-svc-fork/commit/ae3e4710aac41555f00ddcdfbcf5a5e925afb718)) ## v0.5.0 (2023-03-18) ### Features - Remember last directory (misc) ([`92558da`](https://github.com/voicepaw/so-vits-svc-fork/commit/92558da2f0e4eb24a8de412fb7e22dc3530b648a)) - Show defaults ([`3d298df`](https://github.com/voicepaw/so-vits-svc-fork/commit/3d298df91bdfca230959603da74331b5eef4d487)) ### Bug fixes - Fix option names ([`7ff34fe`](https://github.com/voicepaw/so-vits-svc-fork/commit/7ff34fe623dde6b0a684c45cf33dc54118f9a800)) ### Documentation - Update readme.md ([`b988101`](https://github.com/voicepaw/so-vits-svc-fork/commit/b98810194703b6bb0ede03a00c460eeecdab5131)) ## v0.4.1 (2023-03-18) ### Bug fixes - Call init_logger() ([`e6378f1`](https://github.com/voicepaw/so-vits-svc-fork/commit/e6378f12e747e618ff90ece1552d09c0d0714d41)) ## v0.4.0 (2023-03-18) ### Features - Enhance realtime algorythm ([`d789a12`](https://github.com/voicepaw/so-vits-svc-fork/commit/d789a12308784473ae5d09e0b73fa15bf7554de1)) ## v0.3.0 (2023-03-17) ### Features - Add gui ([`34aec2b`](https://github.com/voicepaw/so-vits-svc-fork/commit/34aec2b98ee4ef82ef488129b61a7952af5226a3)) ### Documentation - Update notebook ([`7b74606`](https://github.com/voicepaw/so-vits-svc-fork/commit/7b74606508cfb7e45224cbd76f3de9c43c8b4309)) ## v0.2.1 (2023-03-17) ### Bug fixes - Fix notebook ([`3ed00cc`](https://github.com/voicepaw/so-vits-svc-fork/commit/3ed00cc66d4f66e045f61fc14937cb9160eee556)) ## v0.2.0 (2023-03-17) ### Features - Realtime inference ([`4dea1ae`](https://github.com/voicepaw/so-vits-svc-fork/commit/4dea1ae51fe2e47a3f41556bdbe3fefd033d729a)) ## v0.1.0 (2023-03-17) ### Features - Main feat ([`faa990c`](https://github.com/voicepaw/so-vits-svc-fork/commit/faa990ce6411d8b4e8b3d2d48c4b532b76ff7800)) ================================================ FILE: 
CONTRIBUTING.md ================================================ # Contributing Contributions are welcome, and they are greatly appreciated! Every little helps, and credit will always be given. You can contribute in many ways: ## Types of Contributions ### Report Bugs Report bugs to [our issue page][gh-issues]. If you are reporting a bug, please include: - Your operating system name and version. - Any details about your local setup that might be helpful in troubleshooting. - Detailed steps to reproduce the bug. ### Fix Bugs Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it. ### Implement Features Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it. ### Write Documentation SoftVC VITS Singing Voice Conversion Fork could always use more documentation, whether as part of the official SoftVC VITS Singing Voice Conversion Fork docs, in docstrings, or even on the web in blog posts, articles, and such. ### Submit Feedback The best way to send feedback is through [our issue page][gh-issues] on GitHub. If you are proposing a feature: - Explain in detail how it would work. - Keep the scope as narrow as possible, to make it easier to implement. - Remember that this is a volunteer-driven project, and that contributions are welcome 😊 ## Get Started! Ready to contribute? Here's how to set yourself up for local development. 1. Fork the repo on GitHub. 2. Clone your fork locally: ```shell $ git clone git@github.com:your_name_here/so-vits-svc-fork.git ``` 3. Install the project dependencies with [uv](https://docs.astral.sh/uv/): ```shell $ uv sync ``` 4. Create a branch for local development: ```shell $ git checkout -b name-of-your-bugfix-or-feature ``` Now you can make your changes locally. 5. When you're done making changes, check that your changes pass our tests: ```shell $ uv run pytest ``` 6.
Linting is done through [pre-commit](https://pre-commit.com). Provided you have the tool installed globally, you can run them all as a one-off: ```shell $ pre-commit run -a ``` Or better, install the hooks once and have them run automatically each time you commit: ```shell $ pre-commit install ``` 7. Commit your changes and push your branch to GitHub: ```shell $ git add . $ git commit -m "feat(something): your detailed description of your changes" $ git push origin name-of-your-bugfix-or-feature ``` Note: the commit message should follow [the conventional commits](https://www.conventionalcommits.org). We run [`commitlint` on CI](https://github.com/marketplace/actions/commit-linter) to validate it, and if you've installed pre-commit hooks at the previous step, the message will be checked at commit time. 8. Submit a pull request through the GitHub website or using the GitHub CLI (if you have it installed): ```shell $ gh pr create --fill ``` ## Pull Request Guidelines We like to have the pull request open as soon as possible; it is a great place to discuss any piece of work, even unfinished. You can use a draft pull request if it's still a work in progress. Here are a few guidelines to follow: 1. Include tests for features or bug fixes. 2. Update the documentation for significant features. 3. Ensure tests are passing on CI. ## Tips To run a subset of tests: ```shell $ pytest tests ``` ## Making a new release The deployment should be automated and can be triggered from the Semantic Release workflow in GitHub. The next version will be based on [the commit logs](https://python-semantic-release.readthedocs.io/en/latest/commit-log-parsing.html#commit-log-parsing). This is done by [python-semantic-release](https://python-semantic-release.readthedocs.io/en/latest/index.html) via a GitHub action.
[gh-issues]: https://github.com/voicepaw/so-vits-svc-fork/issues ================================================ FILE: Dockerfile ================================================ FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime@sha256:82e0d379a5dedd6303c89eda57bcc434c40be11f249ddfadfd5673b84351e806 RUN ["apt", "update"] RUN ["apt", "install", "-y", "build-essential"] RUN ["pip", "install", "-U", "pip", "setuptools", "wheel"] RUN ["pip", "install", "-U", "so-vits-svc-fork"] ENTRYPOINT [ "svcg" ] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 34j and contributors Copyright (c) 2021 Jingyi Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # SoftVC VITS Singing Voice Conversion Fork [简体中文](README_zh_CN.md)

CI Status Documentation Status Test coverage percentage

uv Ruff pre-commit

PyPI Version Supported Python versions License

A fork of [`so-vits-svc`](https://github.com/svc-develop-team/so-vits-svc) with **realtime support** and a **greatly improved interface**. Based on branch `4.0` (v1) (or `4.1`) and the models are compatible. `4.1` models are not supported. Other models are also not supported. ## No Longer Maintained ### Reasons - Within a year, the technology has evolved enormously and there are many better alternatives - Was hoping to create a more modular, easy-to-install repository, but didn't have the skills, time, or money to do so - PySimpleGUI is no longer LGPL - Using Typer is getting more popular than directly using Click ### Alternatives Always beware of the very few influencers who are **quite overly surprised** about any new project/technology. You need to take every social networking post with semi-doubt. The voice changer boom that occurred in 2023 has come to an end, and many developers, not just those in this repository, have not been very active for a while. There are too many alternatives to list here but: - RVC family: [IAHispano/Applio](https://github.com/IAHispano/Applio) (MIT) (actively maintained), [fumiama's RVC](https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI) (AGPL) and [original RVC](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) (MIT) (no longer maintained) - [VCClient](https://github.com/w-okada/voice-changer) (MIT etc.) offers a web-based GUI for real-time conversion but is not very actively maintained. - [fish-diffusion](https://github.com/fishaudio/fish-diffusion/commits/main/) tried to be quite modular but is not actively maintained. - [yxlllc/DDSP\-SVC](https://github.com/yxlllc/DDSP-SVC) - new releases are issued occasionally. [yxlllc/ReFlow\-VAE\-SVC](https://github.com/yxlllc/ReFlow-VAE-SVC) - [coqui\-ai/TTS](https://github.com/coqui-ai/TTS) was for TTS but was partially modular. However, it is not maintained anymore, unfortunately.
Elsewhere, several start-ups have improved and marketed voice changers (probably for profit). > Updates to this repository have been limited to maintenance since Spring 2023. > ~~It is difficult to narrow the list of alternatives here, but please consider trying other projects if you are looking for a voice changer with even better performance (especially in terms of latency other than quality).~~ > ~~However, this project may be ideal for those who want to try out voice conversion for the moment (because it is easy to install).~~ ## Features not available in the original repo - **Realtime voice conversion** (enhanced in v1.1.0) - Partially integrates [`QuickVC`](https://github.com/quickvc/QuickVC-VoiceConversion) - Fixed misuse of [`ContentVec`](https://github.com/auspicious3000/contentvec) in the original repository.[^c] - More accurate pitch estimation using [`CREPE`](https://github.com/marl/crepe/). - GUI and unified CLI available - ~2x faster training - Ready to use just by installing with `pip`. - Automatically download pretrained models. No need to install `fairseq`. - Code completely formatted with black, isort, autoflake etc. [^c]: [#206](https://github.com/voicepaw/so-vits-svc-fork/issues/206) ## Installation ### Option 1. One click easy installation Download .bat This BAT file will automatically perform the steps described below. ### Option 2. Manual installation (using pipx, experimental) #### 1. Installing pipx Windows (development version required due to [pypa/pipx#940](https://github.com/pypa/pipx/issues/940)): ```shell py -3 -m pip install --user git+https://github.com/pypa/pipx.git py -3 -m pipx ensurepath ``` Linux/MacOS: ```shell python -m pip install --user pipx python -m pipx ensurepath ``` #### 2. 
Installing so-vits-svc-fork ```shell pipx install so-vits-svc-fork --python=3.11 pipx inject so-vits-svc-fork torch torchaudio --pip-args="--upgrade" --index-url=https://download.pytorch.org/whl/cu121 # https://download.pytorch.org/whl/nightly/cu121 ``` ### Option 3. Manual installation
Creating a virtual environment Windows: ```shell py -3.11 -m venv venv venv\Scripts\activate ``` Linux/MacOS: ```shell python3.11 -m venv venv source venv/bin/activate ``` Anaconda: ```shell conda create -n so-vits-svc-fork python=3.11 pip conda activate so-vits-svc-fork ``` Installing without creating a virtual environment may cause a `PermissionError` if Python is installed in Program Files, etc.
Install this via pip (or your favourite package manager that uses pip): ```shell python -m pip install -U pip setuptools wheel pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu121 # https://download.pytorch.org/whl/nightly/cu121 pip install -U so-vits-svc-fork ```
Notes - If no GPU is available or using MacOS, simply remove `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu121`. MPS is probably supported. - If you are using an AMD GPU on Linux, replace `--index-url https://download.pytorch.org/whl/cu121` with `--index-url https://download.pytorch.org/whl/nightly/rocm5.7`. AMD GPUs are not supported on Windows ([#120](https://github.com/voicepaw/so-vits-svc-fork/issues/120)).
### Update Please update this package regularly to get the latest features and bug fixes. ```shell pip install -U so-vits-svc-fork # pipx upgrade so-vits-svc-fork ``` ## Usage ### Inference #### GUI ![GUI](https://raw.githubusercontent.com/voicepaw/so-vits-svc-fork/main/docs/_static/gui.png) GUI launches with the following command: ```shell svcg ``` #### CLI - Realtime (from microphone) ```shell svc vc ``` - File ```shell svc infer source.wav ``` Pretrained models are available on [Hugging Face](https://huggingface.co/models?search=so-vits-svc) or [CIVITAI](https://civitai.com/tag/so-vits-svc-fork). #### Notes - If using WSL, please note that WSL requires additional setup to handle audio and the GUI will not work without finding an audio device. - In real-time inference, if there is noise on the inputs, the HuBERT model will react to those as well. Consider using realtime noise reduction applications such as [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/) in this case. - Models other than for 4.0v1 or this repository are not supported. - GPU inference requires at least 4 GB of VRAM. If it does not work, try CPU inference as it is fast enough. [^r-inference] [^r-inference]: [#469](https://github.com/voicepaw/so-vits-svc-fork/issues/469) ### Training #### Before training - If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1] - If your dataset is a long audio file with a single speaker, use `svc pre-split` to split the dataset into multiple files (using `librosa`). - If your dataset is a long audio file with multiple speakers, use `svc pre-sd` to split the dataset into multiple files (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If speakers speak with a variety of speech styles, set --min-speakers larger than the actual number of speakers. 
Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`. - To manually classify audio files, `svc pre-classify` is available. Up and down arrow keys can be used to change the playback speed. [^1]: https://ytpmv.info/how-to-use-uvr/ #### Cloud [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/voicepaw/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb) [![Open In Paperspace](https://img.shields.io/badge/Open%20in-Paperspace-blue?style=flat-square&logo=paperspace)](https://console.paperspace.com/github/voicepaw/so-vits-svc-fork-paperspace/blob/main/so-vits-svc-fork-4.0-paperspace.ipynb) [![Paperspace Referral]()](https://www.paperspace.com/?r=9VJN74I)[^p] If you do not have access to a GPU with more than 10 GB of VRAM, the free plan of Google Colab is recommended for light users and the Pro/Growth plan of Paperspace is recommended for heavy users. Conversely, if you have access to a high-end GPU, the use of cloud services is not recommended. [^p]: If you register a referral code and then add a payment method, you may save about $5 on your first month's monthly billing. Note that both referral rewards are Paperspace credits and not cash. It was a tough decision but inserted because debugging and training the initial model requires a large amount of computing power and the developer is a student. #### Local Place your dataset like `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` (subfolders and non-ASCII filenames are acceptable) and run: ```shell svc pre-resample svc pre-config svc pre-hubert svc train -t ``` #### Notes - Dataset audio duration per file should be <~ 10s. - Need at least 4GB of VRAM. [^r-training] - It is recommended to increase the `batch_size` as much as possible in `config.json` before the `train` command to match the VRAM capacity. 
Setting `batch_size` to `auto-{init_batch_size}-{max_n_trials}` (or simply `auto`) will automatically increase `batch_size` until OOM error occurs, but may not be useful in some cases. - To use `CREPE`, replace `svc pre-hubert` with `svc pre-hubert -fm crepe`. - To use `ContentVec` correctly, replace `svc pre-config` with `svc pre-config -t so-vits-svc-4.0v1`. Training may take slightly longer because some weights are reset due to reusing legacy initial generator weights. - To use `MS-iSTFT Decoder`, replace `svc pre-config` with `svc pre-config -t quickvc`. - Silence removal and volume normalization are automatically performed (as in the upstream repo) and are not required. - If you have trained on a large, copyright-free dataset, consider releasing it as an initial model. - For further details (e.g. parameters, etc.), you can see the [Wiki](https://github.com/voicepaw/so-vits-svc-fork/wiki) or [Discussions](https://github.com/voicepaw/so-vits-svc-fork/discussions). [^r-training]: [#456](https://github.com/voicepaw/so-vits-svc-fork/issues/456) ### Further help For more details, run `svc -h` or `svc <subcommand> -h`. ```shell > svc -h Usage: svc [OPTIONS] COMMAND [ARGS]... so-vits-svc allows any folder structure for training data. However, the following folder structure is recommended. When training: dataset_raw/{speaker_name}/**/{wav_name}.{any_format} When inference: configs/44k/config.json, logs/44k/G_XXXX.pth If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc. (The latest model will be automatically loaded.) To train a model, run pre-resample, pre-config, pre-hubert, train. To infer a model, run infer. Options: -h, --help Show this message and exit.
Commands: clean Clean up files, only useful if you are using the default file structure infer Inference onnx Export model to onnx (currently not working) pre-classify Classify multiple audio files into multiple files pre-config Preprocessing part 2: config pre-hubert Preprocessing part 3: hubert If the HuBERT model is not found, it will be... pre-resample Preprocessing part 1: resample pre-sd Speech diarization using pyannote.audio pre-split Split audio files into multiple files train Train model If D_0.pth or G_0.pth not found, automatically download from hub. train-cluster Train k-means clustering vc Realtime inference from microphone ``` #### External Links [Video Tutorial](https://www.youtube.com/watch?v=tZn0lcGO5OQ) ## Contributors ✨ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
34j
34j

💻 🤔 📖 💡 🚇 🚧 👀 ⚠️ 📣 🐛
GarrettConway
GarrettConway

💻 🐛 📖 👀
BlueAmulet
BlueAmulet

🤔 💬 💻 🚧
ThrowawayAccount01
ThrowawayAccount01

🐛
緋

📖 🐛
Lordmau5
Lordmau5

🐛 💻 🤔 🚧 💬 📓
DL909
DL909

🐛
Satisfy256
Satisfy256

🐛
Pierluigi Zagaria
Pierluigi Zagaria

📓
ruckusmattster
ruckusmattster

🐛
Desuka-art
Desuka-art

🐛
heyfixit
heyfixit

📖
Nerdy Rodent
Nerdy Rodent

📹
谢宇
谢宇

📖
ColdCawfee
ColdCawfee

🐛
sbersier
sbersier

🤔 📓 🐛
Meldoner
Meldoner

🐛 🤔 💻
mmodeusher
mmodeusher

🐛
AlonDan
AlonDan

🐛
Likkkez
Likkkez

🐛
Duct Tape Games
Duct Tape Games

🐛
Xianglong He
Xianglong He

🐛
75aosu
75aosu

🐛
tonyco82
tonyco82

🐛
yxlllc
yxlllc

🤔 💻
outhipped
outhipped

🐛
escoolioinglesias
escoolioinglesias

🐛 📓 📹
Blacksingh
Blacksingh

🐛
Mgs. M. Thoyib Antarnusa
Mgs. M. Thoyib Antarnusa

🐛
Exosfeer
Exosfeer

🐛 💻
guranon
guranon

🐛 🤔 💻
Alexander Koumis
Alexander Koumis

💻
acekagami
acekagami

🌍
Highupech
Highupech

🐛
Scorpi
Scorpi

💻
Maximxls
Maximxls

💻
Star3Lord
Star3Lord

🐛 💻
Forkoz
Forkoz

🐛 💻
Zerui Chen
Zerui Chen

💻 🤔
Roee Shenberg
Roee Shenberg

📓 🤔 💻
Justas
Justas

🐛 💻
Onako2
Onako2

📖
4ll0w3v1l
4ll0w3v1l

💻
j5y0V6b
j5y0V6b

🛡️
marcellocirelli
marcellocirelli

🐛
Priyanshu Patel
Priyanshu Patel

💻
Anna Gorshunova
Anna Gorshunova

🐛 💻
This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! ## Credits [![Copier](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/copier-org/copier/master/img/badge/badge-grayscale-inverted-border-orange.json)](https://github.com/copier-org/copier) This package was created with [Copier](https://copier.readthedocs.io/) and the [browniebroke/pypackage-template](https://github.com/browniebroke/pypackage-template) project template. ================================================ FILE: README_zh_CN.md ================================================ # SoftVC VITS Singing Voice Conversion

CI Status Documentation Status Test coverage percentage

Poetry black pre-commit

PyPI Version Supported Python versions License

基于 [`so-vits-svc4.0(V1)`](https://github.com/svc-develop-team/so-vits-svc)的一个分支,支持实时推理和图形化推理界面,且兼容其模型。 ## 新功能 - **实时语音转换** (增强版本 v1.1.0) - 与[`QuickVC`](https://github.com/quickvc/QuickVC-VoiceConversion)相结合 - 修复了原始版本中对 [`ContentVec`](https://github.com/auspicious3000/contentvec) 的误用[^c] - 使用 CREPE 进行更准确的音高推测 - 图形化界面和统一命令行界面 - 相比之前双倍的训练速度 - 只需使用 `pip` 安装即可使用,不需要安装 `fairseq` - 自动下载预训练模型和 HuBERT 模型 - 使用 black、isort、autoflake 等完全格式化的代码 [^c]: [#206](https://github.com/34j/so-vits-svc-fork/issues/206) ## 安装教程 ### 可以使用 bat 一键安装 Download .bat ### 本 bat 汉化基于英文版,对原版进行了一些本地工作和优化,如安装过程有问题,可以尝试安装原版 Download .bat ### 手动安装
创建一个虚拟环境 Windows: ```shell py -3.10 -m venv venv venv\Scripts\activate ``` Linux/MacOS: ```shell python3.10 -m venv venv source venv/bin/activate ``` Anaconda: ```shell conda create -n so-vits-svc-fork python=3.10 pip conda activate so-vits-svc-fork ``` 如果 Python 安装在 Program Files,在安装时未创造虚拟环境可能会导致`PermissionError`
### 安装 通过 pip 安装 (或者通过包管理器使用 pip 安装): ```shell python -m pip install -U pip setuptools wheel pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118 pip install -U so-vits-svc-fork ``` - 如果没有可用 GPU 或使用 MacOS, 不需要执行 `pip install -U torch torchaudio --index-url https://download.pytorch.org/whl/cu118`. MPS 可能已经安装了. - 如果在 Linux 下使用 AMD GPU, 请使用此命令 `--index-url https://download.pytorch.org/whl/rocm5.4.2` 替换掉 `--index-url https://download.pytorch.org/whl/cu118` . Windows 下不支持 AMD GPUs (#120). ### 更新 请经常更新以获取最新功能和修复错误: ```shell pip install -U so-vits-svc-fork ``` ## 使用教程 ### 推理 #### 图形化界面 ![GUI](https://raw.githubusercontent.com/34j/so-vits-svc-fork/main/docs/_static/gui.png) 请使用以下命令运行图形化界面: ```shell svcg ``` #### 命令行界面 - 实时转换 (输入源为麦克风) ```shell svc vc ``` - 从文件转换 ```shell svc infer source.wav ``` [预训练模型](https://huggingface.co/models?search=so-vits-svc-4.0) 可以在 HuggingFace 获得。 #### 注意 - 如果使用 WSL, 请注意 WSL 需要额外设置来处理音频,如果 GUI 找不到音频设备将不能正常工作。 - 在实时语音转换中, 如果输入源有杂音, HuBERT 模型依然会把杂音进行推理.可以考虑使用实时噪音减弱程序比如 [RTX Voice](https://www.nvidia.com/en-us/geforce/guides/nvidia-rtx-voice-setup-guide/) 来解决. ### 训练 #### 预处理 - 如果数据集有 BGM,请用例如[Ultimate Vocal Remover](https://ultimatevocalremover.com/)等软件去除 BGM. 推荐使用`3_HP-Vocal-UVR.pth` 或者 `UVR-MDX-NET Main` . [^1] - 如果数据集是包含单个歌手的长音频文件, 使用 `svc pre-split` 将数据集拆分为多个文件 (使用 `librosa`). - 如果数据集是包含多个歌手的长音频文件, 使用 `svc pre-sd` 将数据集拆分为多个文件 (使用 `pyannote.audio`) 。为了提高准确率,可能需要手动进行分类。如果歌手的声线多样,请把 --min-speakers 设置为大于实际说话者数量. 
如果出现依赖未安装, 请通过 `pip install pyannote-audio`来安装 `pyannote.audio`。 [^1]: https://ytpmv.info/how-to-use-uvr/ #### 云端 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/34j/so-vits-svc-fork/blob/main/notebooks/so-vits-svc-fork-4.0.ipynb) [![Open In Paperspace](https://img.shields.io/badge/Open%20in-Paperspace-blue?style=flat-square&logo=paperspace)](https://console.paperspace.com/github/34j/so-vits-svc-fork-paperspace/blob/main/so-vits-svc-fork-4.0-paperspace.ipynb) [![Paperspace Referral]()](https://www.paperspace.com/?r=9VJN74I)[^p] 如果你无法获取 10GB 显存以上的显卡,对于轻量用户,推荐使用 Google Colab 的免费方案;而重度用户,则建议使用 Paperspace 的 Pro/Growth Plan。当然,如果你有高端的显卡,就没必要使用云服务了。 [^p]: If you register a referral code and then add a payment method, you may save about $5 on your first month's monthly billing. Note that both referral rewards are Paperspace credits and not cash. It was a tough decision but inserted because debugging and training the initial model requires a large amount of computing power and the developer is a student. #### 本地 将数据集处理成 `dataset_raw/{speaker_id}/**/{wav_file}.{any_format}` 的格式(可以使用子文件夹和非 ASCII 文件名)然后运行: ```shell svc pre-resample svc pre-config svc pre-hubert svc train -t ``` #### 注意 - 数据集的每个文件应该小于 10s,不然显存会爆。 - 建议在执行 `train` 命令之前提高 `config.json` 中的 `batch_size` 以匹配显存容量。 将`batch_size`设为`auto-{init_batch_size}-{max_n_trials}`(或者只需设为`auto`)就会自动提高`batch_size`,直到爆显存为止(不过自动调高 batch_size 有概率失效) - 如果想要 f0 的推理方式为 `CREPE`, 用 `svc pre-hubert -fm crepe` 替换 `svc pre-hubert`. - 若想正确使用`ContentVec`,用 `svc pre-config -t so-vits-svc-4.0v1` 替换 `svc pre-config`。由于复用 generator weights,一些 weights 会被重置而导致训练时间稍微延长. - 若要使用`MS-iSTFT Decoder`,用 `svc pre-config -t quickvc`替换 `svc pre-config`. - 在原始仓库中,会自动移除静音和进行音量平衡,且这个操作并不是必须要处理的。 - 倘若你已经大规模训练了一个免费公开版权的数据集,可以考虑将其作为底模发布。 - 对于更多细节(比如参数等),详见[Wiki](https://github.com/34j/so-vits-svc-fork/wiki) 或 [Discussions](https://github.com/34j/so-vits-svc-fork/discussions).
### 帮助 更多命令, 运行 `svc -h` 或者 `svc <subcommand> -h` ```shell > svc -h 用法: svc [OPTIONS] COMMAND [ARGS]... so-vits-svc 允许任何文件夹结构用于训练数据 但是, 建议使用以下文件夹结构 训练: dataset_raw/{speaker_name}/**/{wav_name}.{any_format} 推理: configs/44k/config.json, logs/44k/G_XXXX.pth 如果遵循文件夹结构,则无需指定模型路径,配置路径等,将自动加载最新模型 若要训练模型, 运行 pre-resample, pre-config, pre-hubert, train. 若要推理模型, 运行 infer. 可选: -h, --help 显示信息并退出 命令: clean 清理文件,仅在使用默认文件结构时有用 infer 推理 onnx 导出模型到onnx pre-config 预处理第 2 部分: config pre-hubert 预处理第 3 部分: 如果没有找到 HuBERT 模型,则会... pre-resample 预处理第 1 部分: resample pre-sd Speech diarization 使用 pyannote.audio pre-split 将音频文件拆分为多个文件 train 训练模型 如果 D_0.pth 或 G_0.pth 没有找到,自动从 hub 下载. train-cluster 训练 k-means 聚类模型 vc 麦克风实时推理 ``` #### 补充链接 [视频教程](https://www.youtube.com/watch?v=tZn0lcGO5OQ) ## Contributors ✨ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
34j
34j

💻 🤔 📖 💡 🚇 🚧 👀 ⚠️ 📣 🐛
GarrettConway
GarrettConway

💻 🐛 📖 👀
BlueAmulet
BlueAmulet

🤔 💬 💻 🚧
ThrowawayAccount01
ThrowawayAccount01

🐛
緋

📖 🐛
Lordmau5
Lordmau5

🐛 💻 🤔 🚧 💬 📓
DL909
DL909

🐛
Satisfy256
Satisfy256

🐛
Pierluigi Zagaria
Pierluigi Zagaria

📓
ruckusmattster
ruckusmattster

🐛
Desuka-art
Desuka-art

🐛
heyfixit
heyfixit

📖
Nerdy Rodent
Nerdy Rodent

📹
谢宇
谢宇

📖
ColdCawfee
ColdCawfee

🐛
sbersier
sbersier

🤔 📓 🐛
Meldoner
Meldoner

🐛
mmodeusher
mmodeusher

🐛
AlonDan
AlonDan

🐛
Likkkez
Likkkez

🐛
Duct Tape Games
Duct Tape Games

🐛
Xianglong He
Xianglong He

🐛
75aosu
75aosu

🐛
tonyco82
tonyco82

🐛
yxlllc
yxlllc

🤔 💻
outhipped
outhipped

🐛
escoolioinglesias
escoolioinglesias

🐛 📓 📹
Blacksingh
Blacksingh

🐛
Mgs. M. Thoyib Antarnusa
Mgs. M. Thoyib Antarnusa

🐛
Exosfeer
Exosfeer

🐛 💻
guranon
guranon

🐛 🤔 💻
Alexander Koumis
Alexander Koumis

💻
This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! ================================================ FILE: commitlint.config.js ================================================ module.exports = { extends: ["@commitlint/config-conventional"], rules: { "header-max-length": [0, "always", Infinity], "body-max-line-length": [0, "always", Infinity], "footer-max-line-length": [0, "always", Infinity], }, }; ================================================ FILE: commitlint.config.mjs ================================================ export default { extends: ["@commitlint/config-conventional"], rules: { "header-max-length": [0, "always", Infinity], "body-max-line-length": [0, "always", Infinity], "footer-max-line-length": [0, "always", Infinity], }, }; ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build .PHONY: help livehtml Makefile # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Build, watch and serve docs with live reload livehtml: sphinx-autobuild -b html -c . $(SOURCEDIR) $(BUILDDIR)/html # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/_static/.gitkeep ================================================ ================================================ FILE: docs/changelog.md ================================================ (changelog)= ```{include} ../CHANGELOG.md ``` ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html from pathlib import Path from typing import Any from sphinx.application import Sphinx from sphinx.ext import apidoc # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "SoftVC VITS Singing Voice Conversion Fork" copyright = "2023, 34j" author = "34j" release = "4.2.30" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "myst_parser", "sphinx.ext.napoleon", "sphinx.ext.autodoc", "sphinx.ext.viewcode", ] napoleon_google_docstring = False # The suffix of source filenames. source_suffix = [ ".rst", ".md", ] # Add any paths that contain templates here, relative to this directory. templates_path = [ "_templates", ] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
exclude_patterns = [ "_build", "Thumbs.db", ".DS_Store", ] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "furo" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] # -- Automatically run sphinx-apidoc ----------------------------------------- def run_apidoc(_: Any) -> None: """Run sphinx-apidoc.""" docs_path = Path(__file__).parent module_path = docs_path.parent / "src" / "so_vits_svc_fork" apidoc.main( [ "--force", "--module-first", "-o", docs_path.as_posix(), module_path.as_posix(), ] ) def setup(app: Sphinx) -> None: """Setup sphinx.""" app.connect("builder-inited", run_apidoc) ================================================ FILE: docs/contributing.md ================================================ (contributing)= ```{include} ../CONTRIBUTING.md ``` ================================================ FILE: docs/index.md ================================================ # Welcome to SoftVC VITS Singing Voice Conversion Fork documentation! ```{toctree} :caption: Installation & Usage :maxdepth: 2 installation usage ``` ```{toctree} :caption: Project Info :maxdepth: 2 changelog contributing ``` ```{toctree} :caption: API Reference :maxdepth: 2 so_vits_svc_fork ``` ```{include} ../README.md ``` ================================================ FILE: docs/installation.md ================================================ (installation)= # Installation The package is published on [PyPI](https://pypi.org/project/so-vits-svc-fork/) and can be installed with `pip` (or any equivalent): ```bash pip install so-vits-svc-fork ``` Next, see the {ref}`section about usage ` to see how to use it. 
================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.https://www.sphinx-doc.org/ exit /b 1 ) if "%1" == "" goto help %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/usage.md ================================================ (usage)= # Usage Assuming that you've followed the {ref}`installations steps `, you're now ready to use this package. Start by importing it: ```python import so_vits_svc_fork ``` TODO: Document usage ================================================ FILE: easy-installation/install-cn.bat ================================================ @echo off echo batӢİ棬ԭһЩعŻ簲װ⣬Գ԰װԭ echo. echo. echo Python 汾 3.10... echo. py -3.10 --version >nul 2>&1 if %errorlevel%==0 ( echo Python 3.10 Ѿװ echo. ) else ( echo Python 3.10 δװʼ... echo. curl https://www.python.org/ftp/python/3.10.10/python-3.10.10-amd64.exe -o python-3.10.10-amd64.exe echo װ Python 3.10... echo. python-3.10.10-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 echo װ... echo. del python-3.10.10-amd64.exe ) echo. echo GPU... echo. nvidia-smi >nul 2>&1 if %errorlevel%==0 ( echo ҵGPU echo. ) else ( echo δҵfound echo. ) nvidia-smi >nul 2>&1 if %errorlevel%==0 ( echo. echo CUDA... echo. if %errorlevel%==0 ( echo CUDA Ѿװ echo. 
) else ( echo δ⵽CUDAֶװCUDAװб echo https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows echo. echo ѾȷװCUDAdz԰ǿƼִУرձ򣬰װCUDA echo. Pause ) echo cuDNN... if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\cudnn64_8.dll" ( echo cuDNN Ѿװ echo. ) else ( echo δ⵽cuDNNֶװCUDAװб echo https://developer.nvidia.com/cudnn (https://developer.nvidia.com/downloads/compute/cudnn/secure/8.8.1/local_installers/11.8/cudnn-windows-x86_64-8.8.1.3_cuda11-archive.zip/) echo. echo ѾȷװcuDNNdz԰ǿƼִУرձ򣬰װCUDA echo. Pause ) ) echo. echo ڴ⻷Ҫһʱ䣬ĵȴ... echo. py -3.10 -m venv venv echo. echo pip wheel... echo. venv\Scripts\python.exe -m pip install --upgrade pip wheel echo. nvidia-smi >nul 2>&1 if %errorlevel%==0 ( echo װ PyTorch GPU汾... echo. venv\Scripts\pip.exe install torch torchvision torchaudio --index-url https://mirror.sjtu.edu.cn/pytorch-wheels echo װ PyTorch CPU汾... echo. venv\Scripts\pip.exe install torch torchaudio -i https://pypi.tuna.tsinghua.edu.cn/simple pyspider ) echo. echo ϰǷ񶼳ɹװȷɹװ󣬰ʼװso-vits-svc-fork echo. Pause echo װ so-vits-svc-fork... echo. venv\Scripts\pip.exe install so-vits-svc-fork echo. echo so-vits-svc-fork ͼλ... echo. venv\Scripts\svcg.exe Pause ================================================ FILE: easy-installation/install.bat ================================================ @echo off echo You can rerun this script to update the installation. echo Moving to AppData\Roaming\so-vits-svc-fork... mkdir "%APPDATA%\so-vits-svc-fork" >nul 2>&1 cd "%APPDATA%\so-vits-svc-fork" echo Checking for Python 3.10... py -3.10 --version >nul 2>&1 if %errorlevel%==0 ( echo Python 3.10 is already installed. ) else ( echo Python 3.10 is not installed. Downloading installer... curl https://www.python.org/ftp/python/3.10.10/python-3.10.10-amd64.exe -o python-3.10.10-amd64.exe echo Installing Python 3.10... python-3.10.10-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 echo Cleaning up installer... 
del python-3.10.10-amd64.exe ) echo Creating virtual environment... py -3.10 -m venv venv echo Updating pip and wheel... venv\Scripts\python.exe -m pip install --upgrade pip wheel nvidia-smi >nul 2>&1 if %errorlevel%==0 ( echo Installing PyTorch with GPU support... venv\Scripts\pip.exe install torch torchaudio --index-url https://download.pytorch.org/whl/cu118 ) else ( echo Installing PyTorch without GPU support... venv\Scripts\pip.exe install torch torchaudio ) echo Installing so-vits-svc-fork... venv\Scripts\pip.exe install so-vits-svc-fork rem echo Creating shortcut... rem powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%USDRPROFILE%\Desktop\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()" echo Creating shortcut to the start menu... powershell "$s=(New-Object -COM WScript.Shell).CreateShortcut('%APPDATA%\Microsoft\Windows\Start Menu\Programs\so-vits-svc-fork.lnk');$s.TargetPath='%APPDATA%\so-vits-svc-fork\venv\Scripts\svcg.exe';$s.Save()" echo Launching so-vits-svc-fork GUI... venv\Scripts\svcg.exe ================================================ FILE: flake.nix ================================================ { description = "A flake providing a dev shell for Numba with CUDA without installing Numba via nix. 
Also supports PyTorch yet being minimal for Numba with CUDA."; inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; }; outputs = { self, nixpkgs }: let system = "x86_64-linux"; # Adjust if needed pkgs = import nixpkgs { system = system; config.allowUnfree = true; }; cudatookit-with-cudart-to-lib64 = pkgs.symlinkJoin { name = "cudatoolkit"; paths = with pkgs.cudaPackages; [ cudatoolkit (pkgs.lib.getStatic cuda_cudart) ]; postBuild = '' ln -s $out/lib $out/lib64 ''; }; in { devShells.${system}.default = pkgs.mkShell { shellHook = '' # Required for both PyTorch and Numba to find CUDA export CUDA_PATH=${cudatookit-with-cudart-to-lib64} # Required for both PyTorch and Numba, adds necessary paths for dynamic linking export LD_LIBRARY_PATH=${ pkgs.lib.makeLibraryPath [ "/run/opengl-driver" # Needed to find libGL.so, required by both PyTorch and Numba ] }:$LD_LIBRARY_PATH export LIBRARY_PATH=${ pkgs.lib.makeLibraryPath [ pkgs.graphviz ] }:$LIBRARY_PATH export C_INCLUDE_PATH=${ pkgs.lib.makeIncludePath [ pkgs.graphviz ] }:$C_INCLUDE_PATH ''; }; }; } ================================================ FILE: notebooks/so-vits-svc-fork-4.0.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Before training\n", "\n", "This program saves the last 3 generations of models to Google Drive. Since 1 generation of models is >1GB, you should have at least 3GB of free space in Google Drive. If you do not have such free space, it is recommended to create another Google Account." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Installation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Check GPU\n", "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Mount Google Drive\n", "from google.colab import drive\n", "\n", "drive.mount(\"/content/drive\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Install dependencies\n", "# @markdown pip may fail to resolve dependencies and raise ERROR, but it can be ignored.\n", "!python -m pip install -U pip wheel\n", "%pip install -U ipython\n", "\n", "# @markdown Branch (for development)\n", "BRANCH = \"none\" # @param {\"type\": \"string\"}\n", "if BRANCH == \"none\":\n", " %pip install -U so-vits-svc-fork\n", "else:\n", " %pip install -U git+https://github.com/34j/so-vits-svc-fork.git@{BRANCH}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Make dataset directory\n", "!mkdir -p \"dataset_raw\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Copy your dataset\n", "# @markdown **We assume that your dataset is in your Google Drive's `so-vits-svc-fork/dataset/(speaker_name)` directory.**\n", "DATASET_NAME = \"kiritan\" # @param {type: \"string\"}\n", "!cp -R /content/drive/MyDrive/so-vits-svc-fork/dataset/{DATASET_NAME}/ -t \"dataset_raw/\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Download dataset (Tsukuyomi-chan JVS)\n", "# @markdown You can download this dataset if you don't have your own dataset.\n", "# @markdown Make sure you agree to the license when using this dataset.\n", "# @markdown https://tyc.rei-yumesaki.net/material/corpus/#toc6\n", "# !wget 
-N https://tyc.rei-yumesaki.net/files/voice/tyc-corpus1.zip\n", "# !unzip -O sjis tyc-corpus1.zip\n", "# !mv \"/content/つくよみちゃんコーパス Vol.1 声優統計コーパス(JVSコーパス準拠)/おまけ:WAV(+12dB増幅&高音域削減)/WAV(+12dB増幅&高音域削減)\" \"dataset_raw/tsukuyomi\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Automatic preprocessing\n", "!svc pre-resample" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!svc pre-config" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "F0_METHOD = \"dio\" # @param [\"crepe\", \"crepe-tiny\", \"parselmouth\", \"dio\", \"harvest\"]\n", "!svc pre-hubert -fm {F0_METHOD}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Train\n", "%load_ext tensorboard\n", "%tensorboard --logdir drive/MyDrive/so-vits-svc-fork/logs/44k" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!svc train --model-path drive/MyDrive/so-vits-svc-fork/logs/44k" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training Cluster model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!svc train-cluster --output-path drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Get the author's voice as a source\n", "import random\n", "\n", "NAME = str(random.randint(1, 49))\n", "TYPE = \"fsd50k\" # @param [\"\", \"digit\", \"dog\", \"fsd50k\"]\n", "CUSTOM_FILEPATH = \"\" # @param {type: \"string\"}\n", "if CUSTOM_FILEPATH != \"\":\n", " NAME = CUSTOM_FILEPATH\n", "else:\n", " # it is extremely difficult to find a voice that can download from the internet directly\n", " if TYPE == \"dog\":\n", " !wget -N 
f\"https://huggingface.co/datasets/437aewuh/dog-dataset/resolve/main/dogs/dogs_{NAME:.0000}.wav\" -O {NAME}.wav\n", " elif TYPE == \"digit\":\n", " # george, jackson, lucas, nicolas, ...\n", " !wget -N f\"https://github.com/Jakobovski/free-spoken-digit-dataset/raw/master/recordings/0_george_{NAME}.wav\" -O {NAME}.wav\n", " elif TYPE == \"fsd50k\":\n", " !wget -N f\"https://huggingface.co/datasets/Fhrozen/FSD50k/blob/main/clips/dev/{10000+int(NAME)}.wav\" -O {NAME}.wav\n", " else:\n", " !wget -N f\"https://zunko.jp/sozai/utau/voice_{\"kiritan\" if NAME < 25 else \"itako\"}{NAME % 5 + 1}.wav\" -O {NAME}.wav\n", "from IPython.display import Audio, display\n", "\n", "display(Audio(f\"{NAME}.wav\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title Use trained model\n", "# @markdown **Put your .wav file in `so-vits-svc-fork/audio` directory**\n", "from IPython.display import Audio, display\n", "\n", "!svc infer drive/MyDrive/so-vits-svc-fork/audio/{NAME}.wav -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json\n", "display(Audio(f\"drive/MyDrive/so-vits-svc-fork/audio/{NAME}.out.wav\", autoplay=True))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##@title Use trained model (with cluster)\n", "!svc infer {NAME}.wav -s speaker -r 0.1 -m drive/MyDrive/so-vits-svc-fork/logs/44k/ -c drive/MyDrive/so-vits-svc-fork/logs/44k/config.json -k drive/MyDrive/so-vits-svc-fork/logs/44k/kmeans.pt\n", "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pretrained models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/tree/main\n", "!wget -N \"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/G_riri_220.pth\"\n", "!wget -N 
\"https://huggingface.co/TachibanaKimika/so-vits-svc-4.0-models/resolve/main/riri/config.json\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!svc infer {NAME}.wav -c config.json -m G_riri_220.pth\n", "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# @title https://huggingface.co/therealvul/so-vits-svc-4.0/tree/main\n", "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/G_166400.pth\"\n", "!wget -N \"https://huggingface.co/therealvul/so-vits-svc-4.0/resolve/main/Pinkie%20(speaking%20sep)/config.json\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!svc infer {NAME}.wav --speaker \"Pinkie {neutral}\" -c config.json -m G_166400.pth\n", "display(Audio(f\"{NAME}.out.wav\", autoplay=True))" ] } ], "metadata": { "accelerator": "GPU", "colab": { "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: pyproject.toml ================================================ [build-system] build-backend = "setuptools.build_meta" requires = [ "setuptools" ] [project] name = "so-vits-svc-fork" version = "4.2.30" description = "A fork of so-vits-svc." 
readme = "README.md" license = { text = "MIT" } authors = [ { name = "34j", email = "34j.95a2p@simplelogin.com" }, ] requires-python = ">=3.9" classifiers = [ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Topic :: Software Development :: Libraries", ] dependencies = [ "click>=8.1.8", "cm-time>=0.1.2", "fastapi>=0.116.1", "librosa>=0.11.0", "lightning>=2.5.5", "matplotlib>=3.9.4", "numpy>=2.0.2", "pebble>=5.1.3", "praat-parselmouth>=0.4.6", "psutil>=7.1.2", "pysimplegui-4-foss>=4.60.4.1", "pyworld>=0.3.5", "requests>=2.32.5", "rich>=14.1.0", "scipy>=1.13.1", "sounddevice>=0.5.2", "soundfile>=0.13.1", "tensorboard>=2.20.0", "tensorboardx>=2.6.4", "torch>=2.8.0", "torchaudio>=2.8.0", "torchcrepe>=0.0.24", "tqdm>=4.67.1", "tqdm-joblib>=0.0.4", "transformers>=4.56.1", ] urls."Bug Tracker" = "https://github.com/voicepaw/so-vits-svc-fork/issues" urls.Changelog = "https://github.com/voicepaw/so-vits-svc-fork/blob/main/CHANGELOG.md" urls.documentation = "https://so-vits-svc-fork.readthedocs.io" urls.repository = "https://github.com/voicepaw/so-vits-svc-fork" scripts.svc = "so_vits_svc_fork.__main__:cli" scripts.svcg = "so_vits_svc_fork.gui:main" [dependency-groups] dev = [ "pytest>=8,<9", "pytest-cov>=7,<8", ] docs = [ "furo>=2023.5.20; python_version>='3.11'", "myst-parser>=0.16; python_version>='3.11'", "sphinx>=4; python_version>='3.11'", "sphinx-autobuild>=2025,<2026; python_version>='3.11'", ] [tool.setuptools.package-data] "so_vits_svc_fork" = ["**/*.json"] [tool.ruff] line-length = 150 lint.select = [ # "B", # flake8-bugbear # "D", # flake8-docstrings # "C4", # flake8-comprehensions # "S", # flake8-bandit "F", # pyflake # "E", # pycodestyle "W", # pycodestyle # 
"UP", # pyupgrade "I", # isort # "RUF", # ruff specific ] lint.ignore = [ "D203", # 1 blank line required before class docstring "D212", # Multi-line docstring summary should start at the first line "D100", # Missing docstring in public module "D104", # Missing docstring in public package "D107", # Missing docstring in `__init__` "D401", # First line of docstring should be in imperative mood ] lint.per-file-ignores."conftest.py" = [ "D100" ] lint.per-file-ignores."docs/conf.py" = [ "D100" ] lint.per-file-ignores."setup.py" = [ "D100" ] lint.per-file-ignores."tests/**/*" = [ "D100", "D101", "D102", "D103", "D104", "S101", ] lint.isort.known-first-party = [ "so_vits_svc_fork", "tests" ] [tool.pytest.ini_options] addopts = """\ -v -Wdefault --cov=so_vits_svc_fork --cov-report=term --cov-report=xml """ pythonpath = [ "src" ] [tool.coverage.run] branch = true [tool.coverage.report] exclude_lines = [ "pragma: no cover", "@overload", "if TYPE_CHECKING", "raise NotImplementedError", 'if __name__ == "__main__":', ] [tool.mypy] check_untyped_defs = true disallow_any_generics = true disallow_incomplete_defs = true disallow_untyped_defs = true mypy_path = "src/" no_implicit_optional = true show_error_codes = true warn_unreachable = true warn_unused_ignores = true exclude = [ 'docs/.*', 'setup.py', ] [[tool.mypy.overrides]] module = "tests.*" allow_untyped_defs = true [[tool.mypy.overrides]] module = "docs.*" ignore_errors = true [tool.semantic_release] version_toml = [ "pyproject.toml:project.version" ] version_variables = [ "src/so_vits_svc_fork/__init__.py:__version__", "docs/conf.py:release", ] build_command = """ pip install uv uv lock git add uv.lock uv build """ [tool.semantic_release.changelog] exclude_commit_patterns = [ '''chore(?:\([^)]*?\))?: .+''', '''ci(?:\([^)]*?\))?: .+''', '''refactor(?:\([^)]*?\))?: .+''', '''style(?:\([^)]*?\))?: .+''', '''test(?:\([^)]*?\))?: .+''', '''build\((?!deps\): .+)''', '''Merged? 
.*''', '''Initial [Cc]ommit.*''', # codespell:ignore ] [tool.semantic_release.changelog.environment] keep_trailing_newline = true [tool.semantic_release.branches.main] match = "main" [tool.semantic_release.branches.noop] match = "(?!main$)" prerelease = true ================================================ FILE: renovate.json ================================================ { "extends": [ "config:best-practices", ":pinOnlyDevDependencies", ":automergeAll", ":enablePreCommit" ], "packageRules": [ { "matchPackageNames": ["python"], "rangeStrategy": "widen", "separateMultipleMinor": true } ] } ================================================ FILE: setup.py ================================================ #!/usr/bin/env python # This is a shim to allow GitHub to detect the package, build is done with uv # Taken from https://github.com/Textualize/rich import setuptools if __name__ == "__main__": setuptools.setup(name="so-vits-svc-fork") ================================================ FILE: src/so_vits_svc_fork/__init__.py ================================================ __version__ = "4.2.30" from .logger import init_logger init_logger() ================================================ FILE: src/so_vits_svc_fork/__main__.py ================================================ from __future__ import annotations import os from logging import getLogger from multiprocessing import freeze_support from pathlib import Path from typing import Literal import click import torch from so_vits_svc_fork import __version__ from so_vits_svc_fork.utils import get_optimal_device LOG = getLogger(__name__) IS_TEST = "test" in Path(__file__).parent.stem if IS_TEST: LOG.debug("Test mode is on.") class RichHelpFormatter(click.HelpFormatter): def __init__( self, indent_increment: int = 2, width: int | None = None, max_width: int | None = None, ) -> None: width = 100 super().__init__(indent_increment, width, max_width) LOG.info(f"Version: {__version__}") def patch_wrap_text(): orig_wrap_text = 
click.formatting.wrap_text def wrap_text( text, width=78, initial_indent="", subsequent_indent="", preserve_paragraphs=False, ): return orig_wrap_text( text.replace("\n", "\n\n"), width=width, initial_indent=initial_indent, subsequent_indent=subsequent_indent, preserve_paragraphs=True, ).replace("\n\n", "\n") click.formatting.wrap_text = wrap_text patch_wrap_text() CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], show_default=True) click.Context.formatter_class = RichHelpFormatter @click.group(context_settings=CONTEXT_SETTINGS) def cli(): """ so-vits-svc allows any folder structure for training data. However, the following folder structure is recommended.\n When training: dataset_raw/{speaker_name}/**/{wav_name}.{any_format}\n When inference: configs/44k/config.json, logs/44k/G_XXXX.pth\n If the folder structure is followed, you DO NOT NEED TO SPECIFY model path, config path, etc. (The latest model will be automatically loaded.)\n To train a model, run pre-resample, pre-config, pre-hubert, train.\n To infer a model, run infer. """ @cli.command() @click.option( "-c", "--config-path", type=click.Path(exists=True), help="path to config", default=Path("./configs/44k/config.json"), ) @click.option( "-m", "--model-path", type=click.Path(), help="path to output dir", default=Path("./logs/44k"), ) @click.option( "-t/-nt", "--tensorboard/--no-tensorboard", default=False, type=bool, help="launch tensorboard", ) @click.option( "-r", "--reset-optimizer", default=False, type=bool, help="reset optimizer", is_flag=True, ) def train( config_path: Path, model_path: Path, tensorboard: bool = False, reset_optimizer: bool = False, ): """ Train model If D_0.pth or G_0.pth not found, automatically download from hub. 
@cli.command()
@click.argument(
    "input-path",
    type=click.Path(exists=True),
)
@click.option(
    "-o",
    "--output-path",
    type=click.Path(),
    help="path to output dir",
)
@click.option("-s", "--speaker", type=str, default=None, help="speaker name")
@click.option(
    "-m",
    "--model-path",
    type=click.Path(exists=True),
    default=Path("./logs/44k/"),
    help="path to model",
)
@click.option(
    "-c",
    "--config-path",
    type=click.Path(exists=True),
    default=Path("./configs/44k/config.json"),
    help="path to config",
)
@click.option(
    "-k",
    "--cluster-model-path",
    type=click.Path(exists=True),
    default=None,
    help="path to cluster model",
)
@click.option(
    "-re",
    "--recursive",
    type=bool,
    default=False,
    help="Search recursively",
    is_flag=True,
)
@click.option("-t", "--transpose", type=int, default=0, help="transpose")
@click.option("-db", "--db-thresh", type=int, default=-20, help="threshold (DB) (RELATIVE)")
@click.option(
    "-fm",
    "--f0-method",
    type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]),
    default="dio",
    help="f0 prediction method",
)
@click.option(
    "-a/-na",
    "--auto-predict-f0/--no-auto-predict-f0",
    type=bool,
    default=True,
    help="auto predict f0",
)
@click.option("-r", "--cluster-infer-ratio", type=float, default=0, help="cluster infer ratio")
@click.option("-n", "--noise-scale", type=float, default=0.4, help="noise scale")
@click.option("-p", "--pad-seconds", type=float, default=0.5, help="pad seconds")
@click.option(
    "-d",
    "--device",
    type=str,
    default=get_optimal_device(),
    help="device",
)
@click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds")
@click.option(
    "-ab/-nab",
    "--absolute-thresh/--no-absolute-thresh",
    type=bool,
    default=False,
    help="absolute thresh",
)
@click.option(
    "-mc",
    "--max-chunk-seconds",
    type=float,
    default=40,
    help="maximum allowed single chunk length, set lower if you get out of memory (0 to disable)",
)
def infer(
    # paths
    input_path: Path,
    output_path: Path,
    model_path: Path,
    config_path: Path,
    recursive: bool,
    # svc config
    speaker: str,
    cluster_model_path: Path | None = None,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
    # slice config
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
    max_chunk_seconds: float = 40,
    device: str | torch.device = get_optimal_device(),
):
    """Inference"""
    # NOTE: the docstring above is rendered by click as the command's help text,
    # so longer explanations live in comments instead.
    #
    # This command normalises the CLI arguments (string paths -> ``Path``,
    # directory -> newest ``G_*.pth`` checkpoint, default output path) and then
    # forwards everything unchanged to ``so_vits_svc_fork.inference.main.infer``.
    #
    # Raises:
    #   ValueError: ``input_path`` is a directory but ``--recursive`` was not given.
    #   FileNotFoundError: ``model_path`` is a directory without any ``G_*.pth`` file.
    from so_vits_svc_fork.inference.main import infer

    if not auto_predict_f0:
        # The two literals are concatenated, so the separating space must be explicit.
        LOG.warning(
            f"auto_predict_f0 = False, transpose = {transpose}. If you want to change the pitch, please set transpose. "
            "Generally transpose = 0 does not work because your voice pitch and target voice pitch are different."
        )

    input_path = Path(input_path)
    if output_path is None:
        # Default: write next to the input as "<stem>.out<suffix>".
        output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}"
    output_path = Path(output_path)
    if input_path.is_dir() and not recursive:
        raise ValueError("input_path is a directory. Use -re or --recursive to infer recursively.")

    model_path = Path(model_path)
    if model_path.is_dir():
        # Use the most recently modified generator checkpoint in the directory.
        checkpoints = sorted(model_path.glob("G_*.pth"), key=lambda x: x.stat().st_mtime)
        if not checkpoints:
            raise FileNotFoundError(f"No G_*.pth checkpoint found in {model_path}")
        model_path = checkpoints[-1]
        LOG.info(f"Since model_path is a directory, use {model_path}")

    config_path = Path(config_path)
    if cluster_model_path is not None:
        cluster_model_path = Path(cluster_model_path)

    infer(
        # paths
        input_path=input_path,
        output_path=output_path,
        model_path=model_path,
        config_path=config_path,
        recursive=recursive,
        # svc config
        speaker=speaker,
        cluster_model_path=cluster_model_path,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        # slice config
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
        max_chunk_seconds=max_chunk_seconds,
        device=device,
    )
"--pad-seconds", type=float, default=0.02, help="pad seconds") @click.option("-ch", "--chunk-seconds", type=float, default=0.5, help="chunk seconds") @click.option( "-cr", "--crossfade-seconds", type=float, default=0.01, help="crossfade seconds", ) @click.option( "-ab", "--additional-infer-before-seconds", type=float, default=0.2, help="additional infer before seconds", ) @click.option( "-aa", "--additional-infer-after-seconds", type=float, default=0.1, help="additional infer after seconds", ) @click.option("-b", "--block-seconds", type=float, default=0.5, help="block seconds") @click.option( "-d", "--device", type=str, default=get_optimal_device(), help="device", ) @click.option("-s", "--speaker", type=str, default=None, help="speaker name") @click.option("-v", "--version", type=int, default=2, help="version") @click.option("-i", "--input-device", type=int, default=None, help="input device") @click.option("-o", "--output-device", type=int, default=None, help="output device") @click.option( "-po", "--passthrough-original", type=bool, default=False, is_flag=True, help="passthrough original (for latency check)", ) def vc( # paths model_path: Path, config_path: Path, # svc config speaker: str, cluster_model_path: Path | None, transpose: int, auto_predict_f0: bool, cluster_infer_ratio: float, noise_scale: float, f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], # slice config db_thresh: int, pad_seconds: float, chunk_seconds: float, # realtime config crossfade_seconds: float, additional_infer_before_seconds: float, additional_infer_after_seconds: float, block_seconds: float, version: int, input_device: int | str | None, output_device: int | str | None, device: torch.device, passthrough_original: bool = False, ) -> None: """Realtime inference from microphone""" from so_vits_svc_fork.inference.main import realtime if auto_predict_f0: LOG.warning("auto_predict_f0 = True in realtime inference will cause unstable voice pitch, use with caution") 
@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    default=Path("./dataset_raw"),
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=Path("./dataset/44k"),
    help="path to output dir",
)
@click.option("-s", "--sampling-rate", type=int, default=44100, help="sampling rate")
@click.option(
    "-n",
    "--n-jobs",
    type=int,
    default=-1,
    help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)",
)
@click.option("-d", "--top-db", type=float, default=30, help="top db")
@click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds")
@click.option("-ho", "-hop", "--hop-seconds", type=float, default=0.3, help="hop seconds")
def pre_resample(
    input_dir: Path,
    output_dir: Path,
    sampling_rate: int,
    n_jobs: int,
    top_db: int,
    frame_seconds: float,
    hop_seconds: float,
) -> None:
    """Preprocessing part 1: resample"""
    # Imported lazily so that merely loading the CLI does not pull in the
    # preprocessing module.
    from so_vits_svc_fork.preprocessing.preprocess_resample import preprocess_resample

    # click hands the directories over as plain strings (or the Path defaults);
    # normalise them to Path before delegating.
    preprocess_resample(
        input_dir=Path(input_dir),
        output_dir=Path(output_dir),
        sampling_rate=sampling_rate,
        n_jobs=n_jobs,
        top_db=top_db,
        frame_seconds=frame_seconds,
        hop_seconds=hop_seconds,
    )
"--n-jobs", type=int, default=None, help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)", ) @click.option( "-f/-nf", "--force-rebuild/--no-force-rebuild", type=bool, default=True, help="force rebuild existing preprocessed files", ) @click.option( "-fm", "--f0-method", type=click.Choice(["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"]), default="dio", ) def pre_hubert( input_dir: Path, config_path: Path, n_jobs: bool, force_rebuild: bool, f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], ) -> None: """ Preprocessing part 3: hubert If the HuBERT model is not found, it will be downloaded automatically. """ from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import preprocess_hubert_f0 input_dir = Path(input_dir) config_path = Path(config_path) preprocess_hubert_f0( input_dir=input_dir, config_path=config_path, n_jobs=n_jobs, force_rebuild=force_rebuild, f0_method=f0_method, ) @cli.command() @click.option( "-i", "--input-dir", type=click.Path(exists=True), default=Path("./dataset_raw_raw/"), help="path to source dir", ) @click.option( "-o", "--output-dir", type=click.Path(), default=Path("./dataset_raw/"), help="path to output dir", ) @click.option( "-n", "--n-jobs", type=int, default=-1, help="number of jobs (optimal value may depend on your VRAM capacity and audio duration per file)", ) @click.option("-min", "--min-speakers", type=int, default=2, help="min speakers") @click.option("-max", "--max-speakers", type=int, default=2, help="max speakers") @click.option("-t", "--huggingface-token", type=str, default=None, help="huggingface token") @click.option("-s", "--sr", type=int, default=44100, help="sampling rate") def pre_sd( input_dir: Path | str, output_dir: Path | str, min_speakers: int, max_speakers: int, huggingface_token: str | None, n_jobs: int, sr: int, ): """Speech diarization using pyannote.audio""" if huggingface_token is None: huggingface_token = 
os.environ.get("HUGGINGFACE_TOKEN", None) if huggingface_token is None: huggingface_token = click.prompt("Please enter your HuggingFace token", hide_input=True) if os.environ.get("HUGGINGFACE_TOKEN", None) is None: LOG.info("You can also set the HUGGINGFACE_TOKEN environment variable.") assert huggingface_token is not None huggingface_token = huggingface_token.rstrip(" \n\r\t\0") if len(huggingface_token) <= 1: raise ValueError("HuggingFace token is empty: " + huggingface_token) if max_speakers == 1: LOG.warning("Consider using pre-split if max_speakers == 1") from so_vits_svc_fork.preprocessing.preprocess_speaker_diarization import ( preprocess_speaker_diarization, ) preprocess_speaker_diarization( input_dir=input_dir, output_dir=output_dir, min_speakers=min_speakers, max_speakers=max_speakers, huggingface_token=huggingface_token, n_jobs=n_jobs, sr=sr, ) @cli.command() @click.option( "-i", "--input-dir", type=click.Path(exists=True), default=Path("./dataset_raw_raw/"), help="path to source dir", ) @click.option( "-o", "--output-dir", type=click.Path(), default=Path("./dataset_raw/"), help="path to output dir", ) @click.option( "-n", "--n-jobs", type=int, default=-1, help="number of jobs (optimal value may depend on your RAM capacity and audio duration per file)", ) @click.option( "-l", "--max-length", type=float, default=10, help="max length of each split in seconds", ) @click.option("-d", "--top-db", type=float, default=30, help="top db") @click.option("-f", "--frame-seconds", type=float, default=1, help="frame seconds") @click.option("-ho", "-hop", "--hop-seconds", type=float, default=0.3, help="hop seconds") @click.option("-s", "--sr", type=int, default=44100, help="sample rate") def pre_split( input_dir: Path | str, output_dir: Path | str, max_length: float, top_db: int, frame_seconds: float, hop_seconds: float, n_jobs: int, sr: int, ): """Split audio files into multiple files""" from so_vits_svc_fork.preprocessing.preprocess_split import preprocess_split 
@cli.command
def clean():
    """Clean up files, only useful if you are using the default file structure"""
    import shutil

    folders = ["dataset", "filelists", "logs"]
    # The prompt has to be an f-string; without the prefix the user was shown
    # the literal text "{folders}" instead of the directories about to be deleted.
    answer = input(f"Are you sure you want to delete files in {folders}?")
    if answer in ["yes", "y"]:
        for folder in folders:
            # Missing folders are skipped silently; existing ones are removed recursively.
            if Path(folder).exists():
                shutil.rmtree(folder)
        LOG.info("Cleaned up files")
    else:
        LOG.info("Aborted")
@cli.command
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    help="dataset directory",
    default=Path("./dataset/44k"),
)
@click.option(
    "-o",
    "--output-path",
    type=click.Path(),
    help="model path to save",
    default=Path("./logs/44k/kmeans.pt"),
)
@click.option("-n", "--n-clusters", type=int, help="number of clusters", default=2000)
@click.option("-m/-nm", "--minibatch/--no-minibatch", default=True, help="use minibatch k-means")
@click.option("-b", "--batch-size", type=int, default=4096, help="batch size for minibatch kmeans")
@click.option("-p/-np", "--partial-fit", default=False, help="use partial fit (only use with -m)")
def train_cluster(
    input_dir: Path,
    output_path: Path,
    n_clusters: int,
    minibatch: bool,
    batch_size: int,
    partial_fit: bool,
) -> None:
    """Train k-means clustering"""
    # Lazy import: the clustering module is only loaded when this command runs.
    from .cluster.train_cluster import main as run_training

    # The CLI always trains verbosely; ``minibatch`` maps onto ``use_minibatch``.
    options = {
        "input_dir": input_dir,
        "output_path": output_path,
        "n_clusters": n_clusters,
        "verbose": True,
        "use_minibatch": minibatch,
        "batch_size": batch_size,
        "partial_fit": partial_fit,
    }
    run_training(**options)
def check_speaker(model: Any, speaker: Any):
    """Ensure ``speaker`` is a key of ``model``.

    Returns ``None`` when the speaker is present; otherwise raises
    ``ValueError`` listing the keys that ``model`` does contain.
    """
    if speaker in model:
        return
    known = list(model.keys())
    raise ValueError(f"Speaker {speaker} not in {known}")
def main(
    input_dir: Path | str,
    output_path: Path | str,
    n_clusters: int = 10000,
    use_minibatch: bool = True,
    batch_size: int = 4096,
    partial_fit: bool = False,
    verbose: bool = False,
) -> None:
    """Train one cluster model per entry of ``input_dir`` and save them together.

    Every entry returned by ``input_dir.iterdir()`` is passed to
    ``train_cluster`` in a joblib worker; the results are collected into a
    dict keyed by the entry's stem and written to ``output_path`` with
    ``torch.save``. The parent directory of ``output_path`` is created if needed.

    Raises:
        ValueError: ``partial_fit`` is requested without ``use_minibatch``.
    """
    input_dir = Path(input_dir)
    output_path = Path(output_path)
    if partial_fit and not use_minibatch:
        raise ValueError("partial_fit requires use_minibatch")

    def train_cluster_(input_path: Path, **kwargs: Any) -> tuple[str, dict]:
        # Pair each result with the name it will be stored under.
        return input_path.stem, train_cluster(input_path, **kwargs)

    # Settings shared by every worker.
    shared_kwargs = dict(
        n_clusters=n_clusters,
        use_minibatch=use_minibatch,
        batch_size=batch_size,
        partial_fit=partial_fit,
        verbose=verbose,
    )
    speaker_dirs = list(input_dir.iterdir())
    with tqdm_joblib(desc="Training clusters", total=len(speaker_dirs)):
        jobs = (delayed(train_cluster_)(speaker_dir, **shared_kwargs) for speaker_dir in speaker_dirs)
        parallel_result = Parallel(n_jobs=-1)(jobs)
    assert parallel_result is not None

    checkpoint = dict(parallel_result)
    output_path.parent.mkdir(exist_ok=True, parents=True)
    with output_path.open("wb") as f:
        torch.save(checkpoint, f)
data[key][:, start * hop_len : end * hop_len] elif key == "spk": continue else: data[key] = data[key][..., start:end] torch.cuda.empty_cache() return data def __len__(self) -> int: return len(self.datapaths) def _pad_stack(array: Sequence[torch.Tensor]) -> torch.Tensor: max_idx = torch.argmax(torch.tensor([x_.shape[-1] for x_ in array])) max_x = array[max_idx] x_padded = [F.pad(x_, (0, max_x.shape[-1] - x_.shape[-1]), mode="constant", value=0) for x_ in array] return torch.stack(x_padded) class TextAudioCollate(nn.Module): def forward(self, batch: Sequence[dict[str, torch.Tensor]]) -> tuple[torch.Tensor, ...]: batch = [b for b in batch if b is not None] batch = sorted(batch, key=lambda x: x["mel_spec"].shape[1], reverse=True) lengths = torch.tensor([b["mel_spec"].shape[1] for b in batch]).long() results = {} for key in batch[0].keys(): if key not in ["spk"]: results[key] = _pad_stack([b[key] for b in batch]).cpu() else: results[key] = torch.tensor([[b[key]] for b in batch]).cpu() return ( results["content"], results["f0"], results["spec"], results["mel_spec"], results["audio"], results["spk"], lengths, results["uv"], ) ================================================ FILE: src/so_vits_svc_fork/default_gui_presets.json ================================================ { "Default VC (GPU, GTX 1060)": { "silence_threshold": -35.0, "transpose": 12.0, "auto_predict_f0": false, "f0_method": "dio", "cluster_infer_ratio": 0.0, "noise_scale": 0.4, "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 0.35, "additional_infer_before_seconds": 0.15, "additional_infer_after_seconds": 0.1, "realtime_algorithm": "1 (Divide constantly)", "passthrough_original": false, "use_gpu": true }, "Default VC (CPU)": { "silence_threshold": -35.0, "transpose": 12.0, "auto_predict_f0": false, "f0_method": "dio", "cluster_infer_ratio": 0.0, "noise_scale": 0.4, "pad_seconds": 0.1, "chunk_seconds": 0.5, 
"absolute_thresh": true, "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 1.5, "additional_infer_before_seconds": 0.01, "additional_infer_after_seconds": 0.01, "realtime_algorithm": "1 (Divide constantly)", "passthrough_original": false, "use_gpu": false }, "Default VC (Mobile CPU)": { "silence_threshold": -35.0, "transpose": 12.0, "auto_predict_f0": false, "f0_method": "dio", "cluster_infer_ratio": 0.0, "noise_scale": 0.4, "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, "max_chunk_seconds": 40, "crossfade_seconds": 0.05, "block_seconds": 2.5, "additional_infer_before_seconds": 0.01, "additional_infer_after_seconds": 0.01, "realtime_algorithm": "1 (Divide constantly)", "passthrough_original": false, "use_gpu": false }, "Default VC (Crooning)": { "silence_threshold": -35.0, "transpose": 12.0, "auto_predict_f0": false, "f0_method": "dio", "cluster_infer_ratio": 0.0, "noise_scale": 0.4, "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, "max_chunk_seconds": 40, "crossfade_seconds": 0.04, "block_seconds": 0.15, "additional_infer_before_seconds": 0.05, "additional_infer_after_seconds": 0.05, "realtime_algorithm": "1 (Divide constantly)", "passthrough_original": false, "use_gpu": true }, "Default File": { "silence_threshold": -35.0, "transpose": 0.0, "auto_predict_f0": true, "f0_method": "crepe", "cluster_infer_ratio": 0.0, "noise_scale": 0.4, "pad_seconds": 0.1, "chunk_seconds": 0.5, "absolute_thresh": true, "max_chunk_seconds": 40, "auto_play": true, "passthrough_original": false } } ================================================ FILE: src/so_vits_svc_fork/f0.py ================================================ from __future__ import annotations from logging import getLogger from typing import Any, Literal import numpy as np import torch import torchcrepe from cm_time import timer from numpy import dtype, float32, ndarray from torch import FloatTensor, Tensor from so_vits_svc_fork.utils import get_optimal_device 
def normalize_f0(f0: FloatTensor, x_mask: FloatTensor, uv: FloatTensor, random_scale=True) -> FloatTensor:
    """Centre ``f0`` on its per-item mean over the frames flagged in ``uv``.

    Args:
        f0: pitch tensor; indexed as ``f0[:, 0, :]``, so at least 3-D with the
            batch on the first axis (presumably ``(batch, 1, frames)`` — confirm with callers).
        x_mask: mask multiplied onto the result.
        uv: per-frame weights, summed over ``dim=1`` and multiplied with
            ``f0[:, 0, :]`` (presumably a 0/1 voiced flag of shape ``(batch, frames)``).
        random_scale: when true, each batch item is scaled by a random factor
            drawn uniformly from [0.8, 1.2]; otherwise the factor is 1.

    Returns:
        ``(f0 - mean) * factor * x_mask``.

    Raises:
        ValueError: the normalised result contains NaN. (This used to call
            ``exit(0)``, which ended the process silently with a success code.)
    """
    # Mean f0 over the frames selected by ``uv``. Items whose ``uv`` sums to
    # zero get a large divisor so the division cannot produce 0/0.
    uv_sum = torch.sum(uv, dim=1, keepdim=True)
    uv_sum[uv_sum == 0] = 9999
    means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / uv_sum

    if random_scale:
        factor = torch.Tensor(f0.shape[0], 1).uniform_(0.8, 1.2).to(f0.device)
    else:
        factor = torch.ones(f0.shape[0], 1).to(f0.device)

    # Normalise f0 with the per-item mean and scale factor.
    f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
    if torch.isnan(f0_norm).any():
        raise ValueError("normalize_f0 produced NaN values; check the f0 and uv inputs")
    return f0_norm * x_mask
(p_len - len(f0) + 1) // 2 if pad_size > 0 or p_len - len(f0) - pad_size > 0: f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") return f0 def _resize_f0(x: ndarray[Any, dtype[float32]], target_len: int) -> ndarray[Any, dtype[float32]]: source = np.array(x) source[source < 0.001] = np.nan target = np.interp( np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), source, ) res = np.nan_to_num(target) return res def compute_f0_pyworld( wav_numpy: ndarray[Any, dtype[float32]], p_len: None | int = None, sampling_rate: int = 44100, hop_length: int = 512, type_: Literal["dio", "harvest"] = "dio", ): import pyworld if p_len is None: p_len = wav_numpy.shape[0] // hop_length if type_ == "dio": f0, t = pyworld.dio( wav_numpy.astype(np.double), fs=sampling_rate, f0_ceil=f0_max, f0_floor=f0_min, frame_period=1000 * hop_length / sampling_rate, ) elif type_ == "harvest": f0, t = pyworld.harvest( wav_numpy.astype(np.double), fs=sampling_rate, f0_ceil=f0_max, f0_floor=f0_min, frame_period=1000 * hop_length / sampling_rate, ) f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate) for index, pitch in enumerate(f0): f0[index] = round(pitch, 1) return _resize_f0(f0, p_len) def compute_f0_crepe( wav_numpy: ndarray[Any, dtype[float32]], p_len: None | int = None, sampling_rate: int = 44100, hop_length: int = 512, device: str | torch.device = get_optimal_device(), model: Literal["full", "tiny"] = "full", ): audio = torch.from_numpy(wav_numpy).to(device, copy=True) audio = torch.unsqueeze(audio, dim=0) if audio.ndim == 2 and audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True).detach() # (T) -> (1, T) audio = audio.detach() pitch: Tensor = torchcrepe.predict( audio, sampling_rate, hop_length, f0_min, f0_max, model, batch_size=hop_length * 2, device=device, pad=True, ) f0 = pitch.squeeze(0).cpu().float().numpy() p_len = p_len or wav_numpy.shape[0] // hop_length f0 = _resize_f0(f0, p_len) return f0 
def compute_f0( wav_numpy: ndarray[Any, dtype[float32]], p_len: None | int = None, sampling_rate: int = 44100, hop_length: int = 512, method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", **kwargs, ): with timer() as t: wav_numpy = wav_numpy.astype(np.float32) wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999) if method in ["dio", "harvest"]: f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method) elif method == "crepe": f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs) elif method == "crepe-tiny": f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs) elif method == "parselmouth": f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length) else: raise ValueError("type must be dio, crepe, crepe-tiny, harvest or parselmouth") rtf = t.elapsed / (len(wav_numpy) / sampling_rate) LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}") return f0 def f0_to_coarse(f0: torch.Tensor | float): is_torch = isinstance(f0, torch.Tensor) f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700) f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1 f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int) assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( f0_coarse.max(), f0_coarse.min(), ) return f0_coarse f0_bin = 256 f0_max = 1100.0 f0_min = 50.0 f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700) ================================================ FILE: src/so_vits_svc_fork/gui.py ================================================ from __future__ import annotations import json import multiprocessing import os from copy import copy from logging import getLogger from pathlib import Path import PySimpleGUI as sg import sounddevice as sd import soundfile 
as sf import torch from pebble import ProcessFuture, ProcessPool from . import __version__ from .utils import get_optimal_device GUI_DEFAULT_PRESETS_PATH = Path(__file__).parent / "default_gui_presets.json" GUI_PRESETS_PATH = Path("./user_gui_presets.json").absolute() LOG = getLogger(__name__) def play_audio(path: Path | str): if isinstance(path, Path): path = path.as_posix() data, sr = sf.read(path) sd.play(data, sr) def load_presets() -> dict: defaults = json.loads(GUI_DEFAULT_PRESETS_PATH.read_text("utf-8")) users = json.loads(GUI_PRESETS_PATH.read_text("utf-8")) if GUI_PRESETS_PATH.exists() else {} # prioriy: defaults > users # order: defaults -> users return {**defaults, **users, **defaults} def add_preset(name: str, preset: dict) -> dict: presets = load_presets() presets[name] = preset with GUI_PRESETS_PATH.open("w") as f: json.dump(presets, f, indent=2) return load_presets() def delete_preset(name: str) -> dict: presets = load_presets() if name in presets: del presets[name] else: LOG.warning(f"Cannot delete preset {name} because it does not exist.") with GUI_PRESETS_PATH.open("w") as f: json.dump(presets, f, indent=2) return load_presets() def get_output_path(input_path: Path) -> Path: # Default output path output_path = input_path.parent / f"{input_path.stem}.out{input_path.suffix}" # Increment file number in path if output file already exists file_num = 1 while output_path.exists(): output_path = input_path.parent / f"{input_path.stem}.out_{file_num}{input_path.suffix}" file_num += 1 return output_path def get_supported_file_types() -> tuple[tuple[str, str], ...]: res = tuple([(extension, f".{extension.lower()}") for extension in sf.available_formats().keys()]) # Sort by popularity common_file_types = ["WAV", "MP3", "FLAC", "OGG", "M4A", "WMA"] res = sorted( res, key=lambda x: (common_file_types.index(x[0]) if x[0] in common_file_types else len(common_file_types)), ) return res def get_supported_file_types_concat() -> tuple[tuple[str, str], ...]: return 
(("Audio", " ".join(sf.available_formats().keys())),) def validate_output_file_type(output_path: Path) -> bool: supported_file_types = sorted([f".{extension.lower()}" for extension in sf.available_formats().keys()]) if not output_path.suffix: sg.popup_ok("Error: Output path missing file type extension, enter " + "one of the following manually:\n\n" + "\n".join(supported_file_types)) return False if output_path.suffix.lower() not in supported_file_types: sg.popup_ok( f"Error: {output_path.suffix.lower()} is not a supported " + "extension; use one of the following:\n\n" + "\n".join(supported_file_types) ) return False return True def get_devices( update: bool = True, ) -> tuple[list[str], list[str], list[int], list[int]]: if update: sd._terminate() sd._initialize() devices = sd.query_devices() hostapis = sd.query_hostapis() for hostapi in hostapis: for device_idx in hostapi["devices"]: devices[device_idx]["hostapi_name"] = hostapi["name"] input_devices = [f"{d['name']} ({d['hostapi_name']})" for d in devices if d["max_input_channels"] > 0] output_devices = [f"{d['name']} ({d['hostapi_name']})" for d in devices if d["max_output_channels"] > 0] input_devices_indices = [d["index"] for d in devices if d["max_input_channels"] > 0] output_devices_indices = [d["index"] for d in devices if d["max_output_channels"] > 0] return input_devices, output_devices, input_devices_indices, output_devices_indices def after_inference(window: sg.Window, path: Path, auto_play: bool, output_path: Path): try: LOG.info(f"Finished inference for {path.stem}{path.suffix}") window["infer"].update(disabled=False) if auto_play: play_audio(output_path) except Exception as e: LOG.exception(e) def main(): LOG.info(f"version: {__version__}") # sg.theme("Dark") sg.theme_add_new( "Very Dark", { "BACKGROUND": "#111111", "TEXT": "#FFFFFF", "INPUT": "#444444", "TEXT_INPUT": "#FFFFFF", "SCROLL": "#333333", "BUTTON": ("white", "#112233"), "PROGRESS": ("#111111", "#333333"), "BORDER": 2, "SLIDER_DEPTH": 2, 
"PROGRESS_DEPTH": 2, }, ) sg.theme("Very Dark") model_candidates = sorted(Path("./logs/44k/").glob("G_*.pth")) frame_contents = { "Paths": [ [ sg.Text("Model path"), sg.Push(), sg.InputText( key="model_path", default_text=(model_candidates[-1].absolute().as_posix() if model_candidates else ""), enable_events=True, ), sg.FileBrowse( initial_folder=(Path("./logs/44k/").absolute if Path("./logs/44k/").exists() else Path(".").absolute().as_posix()), key="model_path_browse", file_types=( ("PyTorch", "G_*.pth G_*.pt"), ("Pytorch", "*.pth *.pt"), ), ), ], [ sg.Text("Config path"), sg.Push(), sg.InputText( key="config_path", default_text=(Path("./configs/44k/config.json").absolute().as_posix() if Path("./configs/44k/config.json").exists() else ""), enable_events=True, ), sg.FileBrowse( initial_folder=(Path("./configs/44k/").as_posix() if Path("./configs/44k/").exists() else Path(".").absolute().as_posix()), key="config_path_browse", file_types=(("JSON", "*.json"),), ), ], [ sg.Text("Cluster model path (Optional)"), sg.Push(), sg.InputText( key="cluster_model_path", default_text=(Path("./logs/44k/kmeans.pt").absolute().as_posix() if Path("./logs/44k/kmeans.pt").exists() else ""), enable_events=True, ), sg.FileBrowse( initial_folder=("./logs/44k/" if Path("./logs/44k/").exists() else "."), key="cluster_model_path_browse", file_types=(("PyTorch", "*.pt"), ("Pickle", "*.pt *.pth *.pkl")), ), ], ], "Common": [ [ sg.Text("Speaker"), sg.Push(), sg.Combo(values=[], key="speaker", size=(20, 1)), ], [ sg.Text("Silence threshold"), sg.Push(), sg.Slider( range=(-60.0, 0), orientation="h", key="silence_threshold", resolution=0.1, ), ], [ sg.Text( "Pitch (12 = 1 octave)\nADJUST THIS based on your voice\nwhen Auto predict F0 is turned off.", size=(None, 4), ), sg.Push(), sg.Slider( range=(-36, 36), orientation="h", key="transpose", tick_interval=12, ), ], [ sg.Checkbox( key="auto_predict_f0", text="Auto predict F0 (Pitch may become unstable when turned on in real-time inference.)", ) ], 
[
                # Pitch estimator used for inference.
                sg.Text("F0 prediction method"),
                sg.Push(),
                sg.Combo(
                    ["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                    key="f0_method",
                ),
            ],
            [
                sg.Text("Cluster infer ratio"),
                sg.Push(),
                sg.Slider(
                    range=(0, 1.0),
                    orientation="h",
                    key="cluster_infer_ratio",
                    resolution=0.01,
                ),
            ],
            [
                sg.Text("Noise scale"),
                sg.Push(),
                sg.Slider(
                    range=(0.0, 1.0),
                    orientation="h",
                    key="noise_scale",
                    resolution=0.01,
                ),
            ],
            [
                sg.Text("Pad seconds"),
                sg.Push(),
                sg.Slider(
                    range=(0.0, 1.0),
                    orientation="h",
                    key="pad_seconds",
                    resolution=0.01,
                ),
            ],
            [
                sg.Text("Chunk seconds"),
                sg.Push(),
                sg.Slider(
                    range=(0.0, 3.0),
                    orientation="h",
                    key="chunk_seconds",
                    resolution=0.01,
                ),
            ],
            [
                sg.Text("Max chunk seconds (set lower if Out Of Memory, 0 to disable)"),
                sg.Push(),
                sg.Slider(
                    range=(0.0, 240.0),
                    orientation="h",
                    key="max_chunk_seconds",
                    resolution=1.0,
                ),
            ],
            [
                sg.Checkbox(
                    key="absolute_thresh",
                    text="Absolute threshold (ignored (True) in realtime inference)",
                )
            ],
        ],
        # Widgets used only for file-based (offline) inference.
        "File": [
            [
                sg.Text("Input audio path"),
                sg.Push(),
                sg.InputText(key="input_path", enable_events=True),
                sg.FileBrowse(
                    initial_folder=".",
                    key="input_path_browse",
                    # On Windows ("nt") one combined "Audio" entry is used
                    # instead of one entry per format.
                    file_types=(get_supported_file_types_concat() if os.name == "nt" else get_supported_file_types()),
                ),
                sg.FolderBrowse(
                    button_text="Browse(Folder)",
                    initial_folder=".",
                    key="input_path_folder_browse",
                    target="input_path",
                ),
                sg.Button("Play", key="play_input"),
            ],
            [
                sg.Text("Output audio path"),
                sg.Push(),
                sg.InputText(key="output_path"),
                sg.FileSaveAs(
                    initial_folder=".",
                    key="output_path_browse",
                    file_types=get_supported_file_types(),
                ),
            ],
            [sg.Checkbox(key="auto_play", text="Auto play", default=True)],
        ],
        # Widgets used only by the realtime voice changer.
        "Realtime": [
            [
                sg.Text("Crossfade seconds"),
                sg.Push(),
                sg.Slider(
                    range=(0, 0.6),
                    orientation="h",
                    key="crossfade_seconds",
                    resolution=0.001,
                ),
            ],
            [
                sg.Text(
                    "Block seconds",  # \n(big -> more robust, slower, (the same) latency)"
                    tooltip="Big -> more robust, slower, (the same) latency",
                ),
                sg.Push(),
                sg.Slider(
                    range=(0, 3.0),
                    orientation="h",
                    key="block_seconds",
resolution=0.001,
                ),
            ],
            [
                sg.Text(
                    "Additional Infer seconds (before)",  # \n(big -> more robust, slower)"
                    tooltip="Big -> more robust, slower, additional latency",
                ),
                sg.Push(),
                sg.Slider(
                    range=(0, 2.0),
                    orientation="h",
                    key="additional_infer_before_seconds",
                    resolution=0.001,
                ),
            ],
            [
                sg.Text(
                    "Additional Infer seconds (after)",  # \n(big -> more robust, slower, additional latency)"
                    tooltip="Big -> more robust, slower, additional latency",
                ),
                sg.Push(),
                sg.Slider(
                    range=(0, 2.0),
                    orientation="h",
                    key="additional_infer_after_seconds",
                    resolution=0.001,
                ),
            ],
            [
                # The leading digit of the selected entry is parsed later as
                # the realtime algorithm version.
                sg.Text("Realtime algorithm"),
                sg.Push(),
                sg.Combo(
                    ["2 (Divide by speech)", "1 (Divide constantly)"],
                    default_value="1 (Divide constantly)",
                    key="realtime_algorithm",
                ),
            ],
            [
                sg.Text("Input device"),
                sg.Push(),
                sg.Combo(
                    key="input_device",
                    values=[],
                    size=(60, 1),
                ),
            ],
            [
                sg.Text("Output device"),
                sg.Push(),
                sg.Combo(
                    key="output_device",
                    values=[],
                    size=(60, 1),
                ),
            ],
            [
                sg.Checkbox(
                    "Passthrough original audio (for latency check)",
                    key="passthrough_original",
                    default=False,
                ),
                sg.Push(),
                sg.Button("Refresh devices", key="refresh_devices"),
            ],
            [
                sg.Frame(
                    "Notes",
                    [
                        [
                            sg.Text(
                                "In Realtime Inference:\n"
                                " - Setting F0 prediction method to 'crepe` may cause performance degradation.\n"
                                " - Auto Predict F0 must be turned off.\n"
                                "If the audio sounds mumbly and choppy:\n"
                                " Case: The inference has not been made in time (Increase Block seconds)\n"
                                " Case: Mic input is low (Decrease Silence threshold)\n"
                            )
                        ]
                    ],
                ),
            ],
        ],
        "Presets": [
            [
                sg.Text("Presets"),
                sg.Push(),
                sg.Combo(
                    key="presets",
                    values=list(load_presets().keys()),
                    size=(40, 1),
                    enable_events=True,
                ),
                sg.Button("Delete preset", key="delete_preset"),
            ],
            [
                sg.Text("Preset name"),
                sg.Stretch(),
                sg.InputText(key="preset_name", size=(26, 1)),
                sg.Button("Add current settings as a preset", key="add_preset"),
            ],
        ],
    }

    # frames
    # Wrap every group of rows in a horizontally expanding sg.Frame.
    frames = {}
    for name, items in frame_contents.items():
        frame = sg.Frame(name, items)
        frame.expand_x = True
        frames[name] = [frame]

    bottoms = [
        [
            # The GPU checkbox is disabled when the optimal device is the CPU.
            sg.Checkbox(
                key="use_gpu",
                default=get_optimal_device() != torch.device("cpu"),
                text="Use GPU"
                + (
                    " (not available; if your device has GPU, make sure you installed PyTorch with CUDA support)"
                    if get_optimal_device() == torch.device("cpu")
                    else ""
                ),
                disabled=get_optimal_device() == torch.device("cpu"),
            )
        ],
        [
            sg.Button("Infer", key="infer"),
            sg.Button("(Re)Start Voice Changer", key="start_vc"),
            sg.Button("Stop Voice Changer", key="stop_vc"),
            sg.Push(),
            # sg.Button("ONNX Export", key="onnx_export"),
        ],
    ]
    column1 = sg.Column(
        [
            frames["Paths"],
            frames["Common"],
        ],
        vertical_alignment="top",
    )
    column2 = sg.Column(
        [
            frames["File"],
            frames["Realtime"],
            frames["Presets"],
        ]
        + bottoms
    )
    # columns
    layout = [[column1, column2]]
    # get screen size
    screen_width, screen_height = sg.Window.get_screen_size()
    # On screens lower than 720 px the whole layout is wrapped in one extra column.
    if screen_height < 720:
        layout = [
            [
                sg.Column(
                    layout,
                    vertical_alignment="top",
                    scrollable=False,
                    expand_x=True,
                    expand_y=True,
                    vertical_scroll_only=True,
                    key="main_column",
                )
            ]
        ]
    window = sg.Window(
        # Title: top-level package name with "_" replaced by "-", plus the version.
        f"{__name__.split('.')[0].replace('_', '-')} v{__version__}",
        layout,
        grab_anywhere=True,
        finalize=True,
        scaling=1,
        font=("Yu Gothic UI", 11) if os.name == "nt" else None,
        # resizable=True,
        # size=(1280, 720),
        # Below disables taskbar, which may be not useful for some users
        # use_custom_titlebar=True, no_titlebar=False
        # Keep on top
        # keep_on_top=True
    )

    # event, values = window.read(timeout=0.01)
    # window["main_column"].Scrollable = True

    # make slider height smaller
    try:
        for v in window.element_list():
            if isinstance(v, sg.Slider):
                v.Widget.configure(sliderrelief="flat", width=10, sliderlength=20)
    except Exception as e:
        LOG.exception(e)

    # for n in ["input_device", "output_device"]:
    #     window[n].Widget.configure(justify="right")
    # One short read so that ``values`` exists for the helpers defined below.
    event, values = window.read(timeout=0.01)

    def update_speaker() -> None:
        """Fill the speaker combo from the "spk" entry of the selected config file."""
        from . import utils

        config_path = Path(values["config_path"])
        if config_path.exists() and config_path.is_file():
            hp = utils.get_hparams(values["config_path"])
            LOG.debug(f"Loaded config from {values['config_path']}")
            window["speaker"].update(values=list(hp.__dict__["spk"].keys()), set_to_index=0)

    def update_devices() -> None:
        """Refresh both device combos, keeping the current selection when still present."""
        (
            input_devices,
            output_devices,
            input_device_indices,
            output_device_indices,
        ) = get_devices()
        # Map a sounddevice index to its position in the combo's value list.
        input_device_indices_reversed = {v: k for k, v in enumerate(input_device_indices)}
        output_device_indices_reversed = {v: k for k, v in enumerate(output_device_indices)}
        window["input_device"].update(values=input_devices, value=values["input_device"])
        window["output_device"].update(values=output_devices, value=values["output_device"])
        input_default, output_default = sd.default.device
        # Fall back to the sounddevice default (or the first entry) when the
        # previously selected device is no longer listed.
        if values["input_device"] not in input_devices:
            window["input_device"].update(
                values=input_devices,
                set_to_index=input_device_indices_reversed.get(input_default, 0),
            )
        if values["output_device"] not in output_devices:
            window["output_device"].update(
                values=output_devices,
                set_to_index=output_device_indices_reversed.get(output_default, 0),
            )

    # Every widget key except those containing "preset" or "browse" is stored in a preset.
    PRESET_KEYS = [key for key in values.keys() if not any(exclude in key for exclude in ["preset", "browse"])]

    def apply_preset(name: str) -> None:
        """Copy the stored values of preset ``name`` into the widgets and ``values``."""
        for key, value in load_presets()[name].items():
            if key in PRESET_KEYS:
                window[key].update(value)
                values[key] = value

    # Start with the first preset applied.
    default_name = list(load_presets().keys())[0]
    apply_preset(default_name)
    window["presets"].update(default_name)
    del default_name
    update_speaker()
    update_devices()
    # with ProcessPool(max_workers=1) as pool:
    # to support Linux
    with ProcessPool(
        max_workers=min(2, multiprocessing.cpu_count()),
        context=multiprocessing.get_context("spawn"),
    ) as pool:
        # ``future`` holds the realtime voice-changer task; ``infer_futures``
        # holds the file-inference tasks that have been scheduled.
        future: None | ProcessFuture = None
        infer_futures: set[ProcessFuture] = set()
        while True:
            event, values = window.read(200)
            if event == sg.WIN_CLOSED:
                break
            if not event == sg.EVENT_TIMEOUT:
                LOG.info(f"Event {event}, values {values}")
            # When a "*_path" field changes, point every browse button at the
            # parent folder of the new value.
            if event.endswith("_path"):
                for name in window.AllKeysDict:
                    if str(name).endswith("_browse"):
                        browser = window[name]
                        if isinstance(browser, sg.Button):
                            LOG.info(f"Updating browser {browser} to {Path(values[event]).parent}")
                            browser.InitialFolder = Path(values[event]).parent
                            browser.update()
                        else:
                            LOG.warning(f"Browser {browser} is not a FileBrowse")
            # The transpose slider is hidden while automatic F0 prediction is on.
            window["transpose"].update(
                disabled=values["auto_predict_f0"],
                visible=not values["auto_predict_f0"],
            )

            input_path = Path(values["input_path"])
            output_path = Path(values["output_path"])

            if event == "add_preset":
                presets = add_preset(values["preset_name"], {key: values[key] for key in PRESET_KEYS})
                window["presets"].update(values=list(presets.keys()))
            elif event == "delete_preset":
                presets = delete_preset(values["presets"])
                window["presets"].update(values=list(presets.keys()))
            elif event == "presets":
                apply_preset(values["presets"])
                update_speaker()
            elif event == "refresh_devices":
                update_devices()
            elif event == "config_path":
                update_speaker()
            elif event == "input_path":
                # Don't change the output path if it's already set
                # if values["output_path"]:
                #     continue
                # Set a sensible default output path
                window.Element("output_path").Update(str(get_output_path(input_path)))
            elif event == "infer":
                # File inference switches from the "Default VC" preset to "Default File".
                if "Default VC" in values["presets"]:
                    window["presets"].update(set_to_index=list(load_presets().keys()).index("Default File"))
                    apply_preset("Default File")
                if values["input_path"] == "":
                    LOG.warning("Input path is empty.")
                    continue
                if not input_path.exists():
                    LOG.warning(f"Input path {input_path} does not exist.")
                    continue
                # if not validate_output_file_type(output_path):
                #     continue

                try:
                    from so_vits_svc_fork.inference.main import infer

                    LOG.info("Starting inference...")
                    window["infer"].update(disabled=True)
                    infer_future = pool.schedule(
                        infer,
                        kwargs=dict(
                            # paths
                            model_path=Path(values["model_path"]),
                            output_path=output_path,
                            input_path=input_path,
                            config_path=Path(values["config_path"]),
                            recursive=True,
                            # svc config
speaker=values["speaker"], cluster_model_path=(Path(values["cluster_model_path"]) if values["cluster_model_path"] else None), transpose=values["transpose"], auto_predict_f0=values["auto_predict_f0"], cluster_infer_ratio=values["cluster_infer_ratio"], noise_scale=values["noise_scale"], f0_method=values["f0_method"], # slice config db_thresh=values["silence_threshold"], pad_seconds=values["pad_seconds"], chunk_seconds=values["chunk_seconds"], absolute_thresh=values["absolute_thresh"], max_chunk_seconds=values["max_chunk_seconds"], device=("cpu" if not values["use_gpu"] else get_optimal_device()), ), ) infer_future.add_done_callback(lambda _future: after_inference(window, input_path, values["auto_play"], output_path)) infer_futures.add(infer_future) except Exception as e: LOG.exception(e) elif event == "play_input": if Path(values["input_path"]).exists(): pool.schedule(play_audio, args=[Path(values["input_path"])]) elif event == "start_vc": _, _, input_device_indices, output_device_indices = get_devices(update=False) from so_vits_svc_fork.inference.main import realtime if future: LOG.info("Canceling previous task") future.cancel() future = pool.schedule( realtime, kwargs=dict( # paths model_path=Path(values["model_path"]), config_path=Path(values["config_path"]), speaker=values["speaker"], # svc config cluster_model_path=(Path(values["cluster_model_path"]) if values["cluster_model_path"] else None), transpose=values["transpose"], auto_predict_f0=values["auto_predict_f0"], cluster_infer_ratio=values["cluster_infer_ratio"], noise_scale=values["noise_scale"], f0_method=values["f0_method"], # slice config db_thresh=values["silence_threshold"], pad_seconds=values["pad_seconds"], chunk_seconds=values["chunk_seconds"], # realtime config crossfade_seconds=values["crossfade_seconds"], additional_infer_before_seconds=values["additional_infer_before_seconds"], additional_infer_after_seconds=values["additional_infer_after_seconds"], block_seconds=values["block_seconds"], 
version=int(values["realtime_algorithm"][0]), input_device=input_device_indices[window["input_device"].widget.current()], output_device=output_device_indices[window["output_device"].widget.current()], device=get_optimal_device() if values["use_gpu"] else "cpu", passthrough_original=values["passthrough_original"], ), ) elif event == "stop_vc": if future: future.cancel() future = None elif event == "onnx_export": try: raise NotImplementedError("ONNX export is not implemented yet.") from so_vits_svc_fork.modules.onnx._export import onnx_export onnx_export( input_path=Path(values["model_path"]), output_path=Path(values["model_path"]).with_suffix(".onnx"), config_path=Path(values["config_path"]), device="cpu", ) except Exception as e: LOG.exception(e) if future is not None and future.done(): try: future.result() except Exception as e: LOG.error("Error in realtime: ") LOG.exception(e) future = None for future in copy(infer_futures): if future.done(): try: future.result() except Exception as e: LOG.error("Error in inference: ") LOG.exception(e) infer_futures.remove(future) if future: future.cancel() window.close() ================================================ FILE: src/so_vits_svc_fork/hparams.py ================================================ from __future__ import annotations from typing import Any class HParams: def __init__(self, **kwargs: Any) -> None: for k, v in kwargs.items(): if type(v) == dict: # noqa v = HParams(**v) self[k] = v def keys(self): return self.__dict__.keys() def items(self): return self.__dict__.items() def values(self): return self.__dict__.values() def get(self, key: str, default: Any = None): return self.__dict__.get(key, default) def __len__(self): return len(self.__dict__) def __getitem__(self, key): return getattr(self, key) def __setitem__(self, key, value): return setattr(self, key, value) def __contains__(self, key): return key in self.__dict__ def __repr__(self): return self.__dict__.__repr__() 
================================================
FILE: src/so_vits_svc_fork/inference/__init__.py
================================================


================================================
FILE: src/so_vits_svc_fork/inference/core.py
================================================
from __future__ import annotations

from collections.abc import Iterable
from copy import deepcopy
from logging import getLogger
from pathlib import Path
from typing import Any, Callable, Literal

import attrs
import librosa
import numpy as np
import torch
from cm_time import timer
from numpy import dtype, float32, ndarray

import so_vits_svc_fork.f0
from so_vits_svc_fork import cluster, utils

from ..modules.synthesizers import SynthesizerTrn
from ..utils import get_optimal_device

LOG = getLogger(__name__)


def pad_array(array_, target_length: int):
    """Center-crop or zero-pad ``array_`` along axis 0 to ``target_length``.

    Longer input is cropped around its centre; shorter input is padded with
    zeros, with the extra element going to the right when the padding is odd.
    """
    current_length = array_.shape[0]
    if current_length >= target_length:
        return array_[
            (current_length - target_length) // 2 : (current_length - target_length) // 2 + target_length,
            ...,
        ]
    else:
        pad_width = target_length - current_length
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        padded_arr = np.pad(array_, (pad_left, pad_right), "constant", constant_values=(0, 0))
        return padded_arr


@attrs.frozen(kw_only=True)
class Chunk:
    """A contiguous slice of audio labelled as speech or silence."""

    # True when the slice was classified as non-silent.
    is_speech: bool
    # Samples of this slice.
    audio: ndarray[Any, dtype[float32]]
    # Sample offsets of the slice in the audio it was cut from.
    start: int
    end: int

    @property
    def duration(self) -> float32:
        """Number of samples in ``audio`` (as ``float32``), not seconds."""
        # return self.end - self.start
        return float32(self.audio.shape[0])

    def __repr__(self) -> str:
        return f"Chunk(Speech: {self.is_speech}, {self.duration})"


def split_silence(
    audio: ndarray[Any, dtype[float32]],
    top_db: int = 40,
    ref: float | Callable[[ndarray[Any, dtype[float32]]], float] = 1,
    frame_length: int = 2048,
    hop_length: int = 512,
    aggregate: Callable[[ndarray[Any, dtype[float32]]], float] = np.mean,
    max_chunk_length: int = 0,
) -> Iterable[Chunk]:
    """Yield consecutive speech and silence chunks that together cover ``audio``.

    Non-silent intervals come from ``librosa.effects.split``. Speech intervals
    longer than ``max_chunk_length`` samples are cut into pieces of at most
    that length; ``max_chunk_length <= 0`` disables the cutting.
    """
    non_silence_indices = librosa.effects.split(
        audio,
        top_db=top_db,
        ref=ref,
        frame_length=frame_length,
        hop_length=hop_length,
aggregate=aggregate, ) last_end = 0 for start, end in non_silence_indices: if start != last_end: yield Chunk(is_speech=False, audio=audio[last_end:start], start=last_end, end=start) while max_chunk_length > 0 and end - start > max_chunk_length: yield Chunk( is_speech=True, audio=audio[start : start + max_chunk_length], start=start, end=start + max_chunk_length, ) start += max_chunk_length if end - start > 0: yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end) last_end = end if last_end != len(audio): yield Chunk(is_speech=False, audio=audio[last_end:], start=last_end, end=len(audio)) class Svc: def __init__( self, *, net_g_path: Path | str, config_path: Path | str, device: torch.device | str | None = None, cluster_model_path: Path | str | None = None, half: bool = False, ): self.net_g_path = net_g_path if device is None: self.device = (get_optimal_device(),) else: self.device = torch.device(device) self.hps = utils.get_hparams(config_path) self.target_sample = self.hps.data.sampling_rate self.hop_size = self.hps.data.hop_length self.spk2id = self.hps.spk self.hubert_model = utils.get_hubert_model(self.device, self.hps.data.get("contentvec_final_proj", True)) self.dtype = torch.float16 if half else torch.float32 self.contentvec_final_proj = self.hps.data.__dict__.get("contentvec_final_proj", True) self.load_model() if cluster_model_path is not None and Path(cluster_model_path).exists(): self.cluster_model = cluster.get_cluster_model(cluster_model_path) def load_model(self): self.net_g = SynthesizerTrn( self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, **self.hps.model, ) _ = utils.load_checkpoint(self.net_g_path, self.net_g, None) _ = self.net_g.eval() for m in self.net_g.modules(): utils.remove_weight_norm_if_exists(m) _ = self.net_g.to(self.device, dtype=self.dtype) self.net_g = self.net_g def get_unit_f0( self, audio: ndarray[Any, dtype[float32]], tran: int, cluster_infer_ratio: float, speaker: 
int | str, f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", ): f0 = so_vits_svc_fork.f0.compute_f0( audio, sampling_rate=self.target_sample, hop_length=self.hop_size, method=f0_method, ) f0, uv = so_vits_svc_fork.f0.interpolate_f0(f0) f0 = torch.as_tensor(f0, dtype=self.dtype, device=self.device) uv = torch.as_tensor(uv, dtype=self.dtype, device=self.device) f0 = f0 * 2 ** (tran / 12) f0 = f0.unsqueeze(0) uv = uv.unsqueeze(0) c = utils.get_content( self.hubert_model, audio, self.device, self.target_sample, self.contentvec_final_proj, ).to(self.dtype) c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1]) if cluster_infer_ratio != 0: cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T cluster_c = torch.FloatTensor(cluster_c).to(self.device) c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c c = c.unsqueeze(0) return c, f0, uv def infer( self, speaker: int | str, transpose: int, audio: ndarray[Any, dtype[float32]], cluster_infer_ratio: float = 0, auto_predict_f0: bool = False, noise_scale: float = 0.4, f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", ) -> tuple[torch.Tensor, int]: audio = audio.astype(np.float32) # get speaker id if isinstance(speaker, int): if len(self.spk2id.__dict__) >= speaker: speaker_id = speaker else: raise ValueError(f"Speaker id {speaker} >= number of speakers {len(self.spk2id.__dict__)}") else: if speaker in self.spk2id.__dict__: speaker_id = self.spk2id.__dict__[speaker] else: LOG.warning(f"Speaker {speaker} is not found. Use speaker 0 instead.") speaker_id = 0 speaker_candidates = list(filter(lambda x: x[1] == speaker_id, self.spk2id.__dict__.items())) if len(speaker_candidates) > 1: raise ValueError(f"Speaker_id {speaker_id} is not unique. 
Candidates: {speaker_candidates}") elif len(speaker_candidates) == 0: raise ValueError(f"Speaker_id {speaker_id} is not found.") speaker = speaker_candidates[0][0] sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0) # get unit f0 c, f0, uv = self.get_unit_f0(audio, transpose, cluster_infer_ratio, speaker, f0_method) # inference with torch.no_grad(): with timer() as t: audio = self.net_g.infer( c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noise_scale, )[0, 0].data.float() audio_duration = audio.shape[-1] / self.target_sample LOG.info(f"Inference time: {t.elapsed:.2f}s, RTF: {t.elapsed / audio_duration:.2f}") torch.cuda.empty_cache() return audio, audio.shape[-1] def infer_silence( self, audio: np.ndarray[Any, np.dtype[np.float32]], *, # svc config speaker: int | str, transpose: int = 0, auto_predict_f0: bool = False, cluster_infer_ratio: float = 0, noise_scale: float = 0.4, f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio", # slice config db_thresh: int = -40, pad_seconds: float = 0.5, chunk_seconds: float = 0.5, absolute_thresh: bool = False, max_chunk_seconds: float = 40, # fade_seconds: float = 0.0, ) -> np.ndarray[Any, np.dtype[np.float32]]: sr = self.target_sample result_audio = np.array([], dtype=np.float32) chunk_length_min = chunk_length_min = ( int( min( sr / so_vits_svc_fork.f0.f0_min * 20 + 1, chunk_seconds * sr, ) ) // 2 ) for chunk in split_silence( audio, top_db=-db_thresh, frame_length=chunk_length_min * 2, hop_length=chunk_length_min, ref=1 if absolute_thresh else np.max, max_chunk_length=int(max_chunk_seconds * sr), ): LOG.info(f"Chunk: {chunk}") if not chunk.is_speech: audio_chunk_infer = np.zeros_like(chunk.audio) else: # pad pad_len = int(sr * pad_seconds) audio_chunk_pad = np.concatenate( [ np.zeros([pad_len], dtype=np.float32), chunk.audio, np.zeros([pad_len], dtype=np.float32), ] ) audio_chunk_pad_infer_tensor, _ = self.infer( speaker, transpose, audio_chunk_pad, 
cluster_infer_ratio=cluster_infer_ratio, auto_predict_f0=auto_predict_f0, noise_scale=noise_scale, f0_method=f0_method, ) audio_chunk_pad_infer = audio_chunk_pad_infer_tensor.cpu().numpy() pad_len = int(self.target_sample * pad_seconds) cut_len_2 = (len(audio_chunk_pad_infer) - len(chunk.audio)) // 2 audio_chunk_infer = audio_chunk_pad_infer[cut_len_2 : cut_len_2 + len(chunk.audio)] # add fade # fade_len = int(self.target_sample * fade_seconds) # _audio[:fade_len] = _audio[:fade_len] * np.linspace(0, 1, fade_len) # _audio[-fade_len:] = _audio[-fade_len:] * np.linspace(1, 0, fade_len) # empty cache torch.cuda.empty_cache() result_audio = np.concatenate([result_audio, audio_chunk_infer]) result_audio = result_audio[: audio.shape[0]] return result_audio def sola_crossfade( first: ndarray[Any, dtype[float32]], second: ndarray[Any, dtype[float32]], crossfade_len: int, sola_search_len: int, ) -> ndarray[Any, dtype[float32]]: cor_nom = np.convolve( second[: sola_search_len + crossfade_len], np.flip(first[-crossfade_len:]), "valid", ) cor_den = np.sqrt( np.convolve( second[: sola_search_len + crossfade_len] ** 2, np.ones(crossfade_len), "valid", ) + 1e-8 ) sola_shift = np.argmax(cor_nom / cor_den) LOG.info(f"SOLA shift: {sola_shift}") second = second[sola_shift : sola_shift + len(second) - sola_search_len] return np.concatenate( [ first[:-crossfade_len], first[-crossfade_len:] * np.linspace(1, 0, crossfade_len) + second[:crossfade_len] * np.linspace(0, 1, crossfade_len), second[crossfade_len:], ] ) class Crossfader: def __init__( self, *, additional_infer_before_len: int, additional_infer_after_len: int, crossfade_len: int, sola_search_len: int = 384, ) -> None: if additional_infer_before_len < 0: raise ValueError("additional_infer_len must be >= 0") if crossfade_len < 0: raise ValueError("crossfade_len must be >= 0") if additional_infer_after_len < 0: raise ValueError("additional_infer_len must be >= 0") if additional_infer_before_len < 0: raise 
ValueError("additional_infer_len must be >= 0") self.additional_infer_before_len = additional_infer_before_len self.additional_infer_after_len = additional_infer_after_len self.crossfade_len = crossfade_len self.sola_search_len = sola_search_len self.last_input_left = np.zeros( sola_search_len + crossfade_len + additional_infer_before_len + additional_infer_after_len, dtype=np.float32, ) self.last_infered_left = np.zeros(crossfade_len, dtype=np.float32) def process(self, input_audio: ndarray[Any, dtype[float32]], *args, **kwargs: Any) -> ndarray[Any, dtype[float32]]: """ Chunks : ■■■■■■□□□□□□ add last input:□■■■■■■ ■□□□□□□ infer :□■■■■■■ ■□□□□□□ crossfade :▲■■■■■ ▲□□□□□ """ # check input if input_audio.ndim != 1: raise ValueError("Input audio must be 1-dimensional.") if input_audio.shape[0] + self.additional_infer_before_len <= self.crossfade_len: raise ValueError( f"Input audio length ({input_audio.shape[0]}) + additional_infer_len ({self.additional_infer_before_len}) must be greater than crossfade_len ({self.crossfade_len})." 
) input_audio = input_audio.astype(np.float32) input_audio_len = len(input_audio) # concat last input and infer input_audio_concat = np.concatenate([self.last_input_left, input_audio]) del input_audio pad_len = 0 if pad_len: infer_audio_concat = self.infer( np.pad(input_audio_concat, (pad_len, pad_len), mode="reflect"), *args, **kwargs, )[pad_len:-pad_len] else: infer_audio_concat = self.infer(input_audio_concat, *args, **kwargs) # debug SOLA (using copy synthesis with a random shift) """ rs = int(np.random.uniform(-200,200)) LOG.info(f"Debug random shift: {rs}") infer_audio_concat = np.roll(input_audio_concat, rs) """ if len(infer_audio_concat) != len(input_audio_concat): raise ValueError(f"Inferred audio length ({len(infer_audio_concat)}) should be equal to input audio length ({len(input_audio_concat)}).") infer_audio_to_use = infer_audio_concat[ -(self.sola_search_len + self.crossfade_len + input_audio_len + self.additional_infer_after_len) : -self.additional_infer_after_len ] assert len(infer_audio_to_use) == input_audio_len + self.sola_search_len + self.crossfade_len, ( f"{len(infer_audio_to_use)} != {input_audio_len + self.sola_search_len + self.cross_fade_len}" ) _audio = sola_crossfade( self.last_infered_left, infer_audio_to_use, self.crossfade_len, self.sola_search_len, ) result_audio = _audio[: -self.crossfade_len] assert len(result_audio) == input_audio_len, f"{len(result_audio)} != {input_audio_len}" # update last input and inferred self.last_input_left = input_audio_concat[ -(self.sola_search_len + self.crossfade_len + self.additional_infer_before_len + self.additional_infer_after_len) : ] self.last_infered_left = _audio[-self.crossfade_len :] return result_audio def infer(self, input_audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]: return input_audio class RealtimeVC(Crossfader): def __init__( self, *, svc_model: Svc, crossfade_len: int = 3840, additional_infer_before_len: int = 7680, additional_infer_after_len: int = 7680, split: 
class RealtimeVC2:
    """Block-wise voice conversion that buffers audio across ``process`` calls.

    State kept between calls:

    * ``input_audio_store`` - raw input samples that have not been inferred
      yet (the trailing speech chunk of the previous call, if any).
    * ``chunk_store`` - chunks that are ready to be emitted: speech chunks
      hold already-inferred audio, non-speech chunks hold the original audio.
    """

    # Chunks waiting to be written to the output, oldest first.
    chunk_store: list[Chunk]

    def __init__(self, svc_model: Svc) -> None:
        """Create an empty buffer around ``svc_model``."""
        self.input_audio_store = np.array([], dtype=np.float32)
        self.chunk_store = []
        self.svc_model = svc_model

    def process(
        self,
        input_audio: np.ndarray[Any, np.dtype[np.float32]],
        # svc config
        speaker: int | str,
        transpose: int,
        cluster_infer_ratio: float = 0,
        auto_predict_f0: bool = False,
        noise_scale: float = 0.4,
        f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
        # slice config
        db_thresh: int = -40,
        chunk_seconds: float = 0.5,
    ) -> ndarray[Any, dtype[float32]]:
        """Consume one block of input and return one block of output.

        The returned array always has ``input_audio.shape[0]`` samples: the
        generated audio is truncated and then zero-padded to that length at
        the end of the method.

        Args:
            input_audio: New input samples; appended to ``input_audio_store``.
            speaker, transpose, cluster_infer_ratio, auto_predict_f0,
            noise_scale, f0_method: Forwarded unchanged to
                ``self.svc_model.infer``.
            db_thresh: Negated and passed to ``split_silence`` as its second
                positional argument.
            chunk_seconds: Upper bound (in seconds) used when deriving the
                frame/hop length handed to ``split_silence``.

        Returns:
            Output audio with the same number of samples as ``input_audio``.
        """

        def infer(audio: ndarray[Any, dtype[float32]]) -> ndarray[Any, dtype[float32]]:
            # Run the model with the settings captured from the enclosing call
            # and move the result to a NumPy array on the CPU.
            infered_audio_c, _ = self.svc_model.infer(
                speaker=speaker,
                transpose=transpose,
                audio=audio,
                cluster_infer_ratio=cluster_infer_ratio,
                auto_predict_f0=auto_predict_f0,
                noise_scale=noise_scale,
                f0_method=f0_method,
            )
            return infered_audio_c.cpu().numpy()

        # Append the new block to whatever was left over from the last call.
        self.input_audio_store = np.concatenate([self.input_audio_store, input_audio])
        LOG.info(f"input_audio_store: {self.input_audio_store.shape}")
        sr = self.svc_model.target_sample
        # Hop length for silence detection: half of the smaller of
        # (20 periods of f0_min, plus one sample) and chunk_seconds, in samples.
        chunk_length_min = int(min(sr / so_vits_svc_fork.f0.f0_min * 20 + 1, chunk_seconds * sr)) // 2
        LOG.info(f"Chunk length min: {chunk_length_min}")
        chunk_list = list(
            split_silence(
                self.input_audio_store,
                -db_thresh,
                frame_length=chunk_length_min * 2,
                hop_length=chunk_length_min,
                ref=1,  # use absolute threshold
            )
        )
        # NOTE(review): assumes split_silence always yields at least one chunk
        # for the buffered audio - confirm against its implementation.
        assert len(chunk_list) > 0
        LOG.info(f"Chunk list: {chunk_list}")
        # do not infer LAST incomplete is_speech chunk and save to store
        if chunk_list[-1].is_speech:
            self.input_audio_store = chunk_list.pop().audio
        else:
            self.input_audio_store = np.array([], dtype=np.float32)

        # infer complete is_speech chunk and save to store
        self.chunk_store.extend([attrs.evolve(c, audio=infer(c.audio) if c.is_speech else c.audio) for c in chunk_list])

        # calculate lengths and determine compress rate
        total_speech_len = sum([c.duration if c.is_speech else 0 for c in self.chunk_store])
        total_silence_len = sum([c.duration if not c.is_speech else 0 for c in self.chunk_store])
        input_audio_len = input_audio.shape[0]
        # Ratio of buffered silence to the output room left after all speech.
        # NOTE(review): max(0, ...) clamps a negative denominator to zero but
        # does not prevent a zero denominator; whether that raises or yields
        # inf/nan depends on the numeric type of Chunk.duration, which is not
        # visible here - confirm the intended behaviour.
        silence_compress_rate = total_silence_len / max(0, input_audio_len - total_speech_len)
        LOG.info(f"Total speech len: {total_speech_len}, silence len: {total_silence_len}, silence compress rate: {silence_compress_rate}")

        # generate output audio
        output_audio = np.array([], dtype=np.float32)
        break_flag = False
        LOG.info(f"Chunk store: {self.chunk_store}")
        # Iterate over a copy because self.chunk_store is mutated in the loop.
        for chunk in deepcopy(self.chunk_store):
            # Speech is emitted 1:1; silence is shortened by the compress rate.
            compress_rate = 1 if chunk.is_speech else silence_compress_rate
            left_len = input_audio_len - output_audio.shape[0]
            # calculate chunk duration
            chunk_duration_output = int(min(chunk.duration / compress_rate, left_len))
            chunk_duration_input = int(min(chunk.duration, left_len * compress_rate))
            LOG.info(f"Chunk duration output: {chunk_duration_output}, input: {chunk_duration_input}, left len: {left_len}")

            # remove chunk from store
            self.chunk_store.pop(0)
            if chunk.duration > chunk_duration_input:
                # Only part of the chunk fits: emit the head now and push the
                # tail back to the front of the store for the next call.
                left_chunk = attrs.evolve(chunk, audio=chunk.audio[chunk_duration_input:])
                chunk = attrs.evolve(chunk, audio=chunk.audio[:chunk_duration_input])

                self.chunk_store.insert(0, left_chunk)
                break_flag = True

            if chunk.is_speech:
                # if is_speech, just concat
                output_audio = np.concatenate([output_audio, chunk.audio])
            else:
                # if is_silence, concat with zeros and compress with silence_compress_rate
                output_audio = np.concatenate(
                    [
                        output_audio,
                        np.zeros(
                            chunk_duration_output,
                            dtype=np.float32,
                        ),
                    ]
                )

            if break_flag:
                break
        LOG.info(f"Chunk store: {self.chunk_store}, output_audio: {output_audio.shape}")
        # make same length (errors)
        # Truncate any overshoot, then zero-pad any shortfall.
        output_audio = output_audio[:input_audio_len]
        output_audio = np.concatenate(
            [
                output_audio,
                np.zeros(input_audio_len - output_audio.shape[0], dtype=np.float32),
            ]
        )
        return output_audio
def infer(
    *,
    # paths
    input_path: Path | str | Sequence[Path | str],
    output_path: Path | str | Sequence[Path | str],
    model_path: Path | str,
    config_path: Path | str,
    recursive: bool = False,
    # svc config
    speaker: int | str,
    cluster_model_path: Path | str | None = None,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
    # slice config
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
    max_chunk_seconds: float = 40,
    device: str | torch.device = get_optimal_device(),
):
    """Convert one or more audio files (or directories of files) with an Svc model.

    ``input_path`` and ``output_path`` are paired element by element. When an
    input is a directory, every file found below it is converted and written
    to the same relative location below the paired output path.

    Args:
        input_path: A file/directory or a sequence of them.
        output_path: Destination(s); must have as many entries as ``input_path``.
        model_path: Generator checkpoint passed to ``Svc``.
        config_path: Config file passed to ``Svc``.
        recursive: Must be true for directory inputs to be accepted.
        speaker, transpose, auto_predict_f0, cluster_infer_ratio, noise_scale,
        f0_method, db_thresh, pad_seconds, chunk_seconds, absolute_thresh,
        max_chunk_seconds: Forwarded unchanged to ``Svc.infer_silence``.
        cluster_model_path: Optional cluster model passed to ``Svc``.
        device: Device passed to ``Svc``.

    Raises:
        ValueError: If the numbers of inputs and outputs differ, or an input
            is a directory while ``recursive`` is false.
    """
    if isinstance(input_path, (str, Path)):
        input_path = [input_path]
    if isinstance(output_path, (str, Path)):
        output_path = [output_path]
    if len(input_path) != len(output_path):
        raise ValueError(f"input_path and output_path must have same length, but got {len(input_path)} and {len(output_path)}")

    model_path = Path(model_path)
    config_path = Path(config_path)
    output_roots = [Path(p) for p in output_path]
    input_roots = [Path(p) for p in input_path]

    # Expand directory inputs into a flat, index-aligned list of file pairs.
    input_paths: list[Path] = []
    output_paths: list[Path] = []
    for in_root, out_root in zip(input_roots, output_roots):
        if in_root.is_dir():
            if not recursive:
                raise ValueError(f"input_path is a directory, but recursive is False: {in_root}")
            # Map ONLY the files found under this directory. Iterating the
            # accumulated list instead would re-map earlier inputs, making the
            # two lists differ in length or making relative_to() raise.
            found = [p for p in in_root.rglob("*.*") if p.is_file()]
            input_paths.extend(found)
            output_paths.extend(out_root / p.relative_to(in_root) for p in found)
            continue
        input_paths.append(in_root)
        output_paths.append(out_root)

    cluster_model_path = Path(cluster_model_path) if cluster_model_path else None
    svc_model = Svc(
        net_g_path=model_path.as_posix(),
        config_path=config_path.as_posix(),
        cluster_model_path=(cluster_model_path.as_posix() if cluster_model_path else None),
        device=device,
    )

    try:
        # The progress bar is suppressed when there is a single file.
        pbar = tqdm(list(zip(input_paths, output_paths)), disable=len(input_paths) == 1)
        for src, dst in pbar:
            pbar.set_description(f"{src}")
            try:
                audio, _ = librosa.load(str(src), sr=svc_model.target_sample)
            except Exception as e:
                # Best effort: an unreadable file is logged and skipped so the
                # rest of the batch is still processed.
                LOG.error(f"Failed to load {src}")
                LOG.exception(e)
                continue
            dst.parent.mkdir(parents=True, exist_ok=True)
            audio = svc_model.infer_silence(
                audio.astype(np.float32),
                speaker=speaker,
                transpose=transpose,
                auto_predict_f0=auto_predict_f0,
                cluster_infer_ratio=cluster_infer_ratio,
                noise_scale=noise_scale,
                f0_method=f0_method,
                db_thresh=db_thresh,
                pad_seconds=pad_seconds,
                chunk_seconds=chunk_seconds,
                absolute_thresh=absolute_thresh,
                max_chunk_seconds=max_chunk_seconds,
            )
            soundfile.write(str(dst), audio, svc_model.target_sample)
    finally:
        # Drop the model reference and release cached CUDA memory even when a
        # conversion fails.
        del svc_model
        torch.cuda.empty_cache()
crossfade_len=int(crossfade_seconds * svc_model.target_sample), additional_infer_before_len=int(additional_infer_before_seconds * svc_model.target_sample), additional_infer_after_len=int(additional_infer_after_seconds * svc_model.target_sample), ) else: model = RealtimeVC2( svc_model=svc_model, ) # LOG all device info devices = sd.query_devices() LOG.info(f"Device: {devices}") if isinstance(input_device, str): input_device_candidates = [i for i, d in enumerate(devices) if d["name"] == input_device] if len(input_device_candidates) == 0: LOG.warning(f"Input device {input_device} not found, using default") input_device = None else: input_device = input_device_candidates[0] if isinstance(output_device, str): output_device_candidates = [i for i, d in enumerate(devices) if d["name"] == output_device] if len(output_device_candidates) == 0: LOG.warning(f"Output device {output_device} not found, using default") output_device = None else: output_device = output_device_candidates[0] if input_device is None or input_device >= len(devices): input_device = sd.default.device[0] if output_device is None or output_device >= len(devices): output_device = sd.default.device[1] LOG.info(f"Input Device: {devices[input_device]['name']}, Output Device: {devices[output_device]['name']}") # the model RTL is somewhat significantly high only in the first inference # there could be no better way to warm up the model than to do a dummy inference # (there are not differences in the behavior of the model between the first and the later inferences) # so we do a dummy inference to warm up the model (1 second of audio) LOG.info("Warming up the model...") svc_model.infer( speaker=speaker, transpose=transpose, auto_predict_f0=auto_predict_f0, cluster_infer_ratio=cluster_infer_ratio, noise_scale=noise_scale, f0_method=f0_method, audio=np.zeros(svc_model.target_sample, dtype=np.float32), ) def callback( indata: np.ndarray, outdata: np.ndarray, frames: int, time: int, status: sd.CallbackFlags, ) -> None: 
def is_notebook() -> bool:
    """Return True when running under an IPython kernel outside VS Code.

    The result is False when IPython cannot be imported, when there is no
    active IPython shell, when the shell's config has no ``IPKernelApp``
    entry, or when the ``VSCODE_PID`` environment variable is set.
    """
    try:
        from IPython import get_ipython

        shell = get_ipython()
        # get_ipython() yields None when no interactive shell is active.
        if shell is None or "IPKernelApp" not in shell.config:
            return False
    except Exception:
        # Deliberately broad: this is best-effort environment detection that
        # runs during logger set-up and must never prevent start-up.
        return False
    return "VSCODE_PID" not in os.environ  # pragma: no cover
class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder layers.

    Each layer applies, in order: masked self-attention, attention over the
    encoder output, and a feed-forward network. Every sub-layer is followed
    by dropout, a residual connection and layer normalisation.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        proximal_bias=False,
        proximal_init=True,
        **kwargs,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        # Sub-modules are created layer by layer, in the order they are used.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        x_mask / h_mask: masks multiplied into x and used to build the
        attention masks for x and h respectively
        """
        target_len = x_mask.size(2)
        causal_mask = commons.subsequent_mask(target_len).to(device=x.device, dtype=x.dtype)
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)

        hidden = x * x_mask
        for idx in range(self.n_layers):
            # masked self-attention
            residual = self.drop(self.self_attn_layers[idx](hidden, hidden, causal_mask))
            hidden = self.norm_layers_0[idx](hidden + residual)

            # attention over the encoder output
            residual = self.drop(self.encdec_attn_layers[idx](hidden, h, cross_mask))
            hidden = self.norm_layers_1[idx](hidden + residual)

            # feed-forward network
            residual = self.drop(self.ffn_layers[idx](hidden, x_mask))
            hidden = self.norm_layers_2[idx](hidden + residual)
        return hidden * x_mask
= None self.k_channels = channels // n_heads self.conv_q = nn.Conv1d(channels, channels, 1) self.conv_k = nn.Conv1d(channels, channels, 1) self.conv_v = nn.Conv1d(channels, channels, 1) self.conv_o = nn.Conv1d(channels, out_channels, 1) self.drop = nn.Dropout(p_dropout) if window_size is not None: n_heads_rel = 1 if heads_share else n_heads rel_stddev = self.k_channels**-0.5 self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) nn.init.xavier_uniform_(self.conv_q.weight) nn.init.xavier_uniform_(self.conv_k.weight) nn.init.xavier_uniform_(self.conv_v.weight) if proximal_init: with torch.no_grad(): self.conv_k.weight.copy_(self.conv_q.weight) self.conv_k.bias.copy_(self.conv_q.bias) def forward(self, x, c, attn_mask=None): q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) x, self.attn = self.attention(q, k, v, mask=attn_mask) x = self.conv_o(x) return x def attention(self, query, key, value, mask=None): # reshape [b, d, t] -> [b, n_h, t, d_k] b, d, t_s, t_t = (*key.size(), query.size(2)) query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) if self.window_size is not None: assert t_s == t_t, "Relative attention is only available for self-attention." key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) scores_local = self._relative_position_to_absolute_position(rel_logits) scores = scores + scores_local if self.proximal_bias: assert t_s == t_t, "Proximal bias is only available for self-attention." 
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) if mask is not None: scores = scores.masked_fill(mask == 0, -1e4) if self.block_length is not None: assert t_s == t_t, "Local attention is only available for self-attention." block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) scores = scores.masked_fill(block_mask == 0, -1e4) p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] p_attn = self.drop(p_attn) output = torch.matmul(p_attn, value) if self.window_size is not None: relative_weights = self._absolute_position_to_relative_position(p_attn) value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] return output, p_attn def _matmul_with_relative_values(self, x, y): """ x: [b, h, l, m] y: [h or 1, m, d] ret: [b, h, l, d] """ ret = torch.matmul(x, y.unsqueeze(0)) return ret def _matmul_with_relative_keys(self, x, y): """ x: [b, h, l, d] y: [h or 1, m, d] ret: [b, h, l, m] """ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) return ret def _get_relative_embeddings(self, relative_embeddings, length): 2 * self.window_size + 1 # Pad first before slice to avoid using cond ops. 
pad_length = max(length - (self.window_size + 1), 0) slice_start_position = max((self.window_size + 1) - length, 0) slice_end_position = slice_start_position + 2 * length - 1 if pad_length > 0: padded_relative_embeddings = F.pad( relative_embeddings, commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), ) else: padded_relative_embeddings = relative_embeddings used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] return used_relative_embeddings def _relative_position_to_absolute_position(self, x): """ x: [b, h, l, 2*l-1] ret: [b, h, l, l] """ batch, heads, length, _ = x.size() # Concat columns of pad to shift from relative to absolute indexing. x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) # Concat extra elements so to add up to shape (len+1, 2*len-1). x_flat = x.view([batch, heads, length * 2 * length]) x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])) # Reshape and slice out the padded elements. x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :] return x_final def _absolute_position_to_relative_position(self, x): """ x: [b, h, l, l] ret: [b, h, l, 2*l-1] """ batch, heads, length, _ = x.size() # pad along column x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])) x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) # add 0's in the beginning that will skew the elements after reshape x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] return x_final def _attention_bias_proximal(self, length): """ Bias for self-attention to encourage attention to close positions. Args: length: an integer scalar. 
def slice_segments(x: Tensor, starts: Tensor, length: int | None) -> Tensor:
    """Cut one fixed-length window out of the last axis of every batch item.

    Args:
        x: Tensor whose first axis is the batch and whose last axis is sliced.
        starts: One start index per batch item.
        length: Window length. ``None`` returns ``x`` itself; values larger
            than ``x.size(-1)`` are clamped to ``x.size(-1)``.

    Returns:
        A new tensor shaped like ``x`` with the last axis replaced by the
        (clamped) ``length``. A window that runs past the end of ``x`` is
        zero-padded on the right.
    """
    if length is None:
        return x
    length = min(length, x.size(-1))
    x_slice = torch.zeros(x.size()[:-1] + (length,), dtype=x.dtype, device=x.device)
    ends = starts + length
    for i, (start, end) in enumerate(zip(starts, ends)):
        segment = x[i, ..., start:end]
        # The segment is shorter than `length` when it reaches the end of x.
        # Writing only the available samples leaves the remainder at zero,
        # i.e. right-pads it. (Padding by `length - x.size(-1)` could never
        # help: that difference is <= 0 after the clamp above.)
        x_slice[i, ..., : segment.size(-1)] = segment
    return x_slice
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and report their overall norm.

    Args:
        parameters: A single tensor or an iterable of tensors; entries whose
            ``grad`` is ``None`` are ignored.
        clip_value: Gradients are clamped to ``[-clip_value, clip_value]``;
            ``None`` disables clamping.
        norm_type: Order of the norm that is reported.

    Returns:
        The ``norm_type``-norm over all gradients, measured BEFORE clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    gradients = [p.grad.data for p in parameters if p.grad is not None]
    order = float(norm_type)
    bound = None if clip_value is None else float(clip_value)

    accumulated = 0
    for gradient in gradients:
        # Measure first, then clamp, so the result reflects unclipped values.
        accumulated += gradient.norm(order).item() ** order
        if bound is not None:
            gradient.clamp_(min=-bound, max=bound)
    return accumulated ** (1.0 / order)
class F0Decoder(nn.Module):
    """Decode features plus a normalised F0 contour into ``out_channels``.

    The input features (and the optional speaker embedding) are detached, so
    no gradient flows back into whatever produced them.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        spk_channels=0,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.spk_channels = spk_channels

        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
        self.decoder = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)

    def forward(self, x, norm_f0, x_mask, spk_emb=None):
        """Run the decoder.

        Args:
            x: Input features with ``hidden_channels`` channels; not modified.
            norm_f0: Single-channel F0 contour fed to ``f0_prenet``.
            x_mask: Mask multiplied into the activations.
            spk_emb: Optional speaker embedding with ``spk_channels`` channels.

        Returns:
            The projected, masked decoder output.
        """
        x = torch.detach(x)
        if spk_emb is not None:
            spk_emb = torch.detach(spk_emb)
            x = x + self.cond(spk_emb)
        # Out-of-place add: the detached tensor shares storage with the
        # caller's tensor, so `x += ...` would overwrite the caller's data
        # whenever spk_emb is None.
        x = x + self.f0_prenet(norm_f0)
        x = self.prenet(x) * x_mask
        x = self.decoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x
class SineGen(torch.nn.Module):
    """
    Sine-wave generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)

    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # number of generated components: the fundamental plus the overtones
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse

    def _f02uv(self, f0):
        """Return a float32 mask that is 1 where ``f0 > voiced_threshold`` and 0 elsewhere."""
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """
        Convert per-component F0 values into sine (or cosine) waveforms.

        f0_values: (batchsize, length, dim)
            where dim indicates fundamental tone and overtones
        """
        # convert to F0 in rad. The integer part n can be ignored
        # because 2 * np.pi * n doesn't affect phase
        rad_values = (f0_values / self.sampling_rate) % 1

        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
        if not self.flag_for_pulse:
            # for normal case

            # To prevent torch.cumsum numerical overflow,
            # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
            # Buffer tmp_over_one_idx indicates the time step to add -1.
            # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
            tmp_over_one = torch.cumsum(rad_values, 1) % 1
            # a negative step-to-step difference marks a wrap-around of the
            # wrapped cumulative sum
            tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0

            sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
        else:
            # If necessary, make sure that the first time step of every
            # voiced segments is sin(pi) or cos(0)
            # This is used for pulse-train generation

            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # get the instantaneous phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch needs to be processed differently
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of i.phase within
                # each voiced segments
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # rad_values - tmp_cumsum: remove the accumulation of i.phase
            # within the previous voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            # get the sines
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """
        sine_tensor, uv, noise = forward(f0)

        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: the additive noise that was mixed into sine_tensor
                      (created with randn_like(sine_tensor))
        """
        with torch.no_grad():
            # fundamental component and overtones: f0 multiplied by
            # 1 .. harmonic_num + 1
            fn = torch.multiply(f0, torch.arange(1, self.harmonic_num + 2).to(f0.device).to(f0.dtype))

            # generate sine waveforms
            sine_waves = self._f02sine(fn) * self.sine_amp

            # generate uv signal
            uv = self._f02uv(f0)

            # noise: for unvoiced should be similar to sine_amp
            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
            #        for voiced regions is self.noise_std
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # first: set the unvoiced part to 0 by uv
            # then: additive noise
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
class NSFHifiGANGenerator(torch.nn.Module):
    """HiFi-GAN style generator that mixes a harmonic source signal into every upsampling stage.

    ``h`` is a mapping read with the keys ``resblock``,
    ``resblock_kernel_sizes``, ``resblock_dilation_sizes``, ``upsample_rates``,
    ``upsample_kernel_sizes``, ``upsample_initial_channel``, ``inter_channels``,
    ``gin_channels`` and ``sampling_rate``.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h

        self.num_kernels = len(h["resblock_kernel_sizes"])
        self.num_upsamples = len(h["upsample_rates"])
        # F0 is upsampled by the product of all upsample rates before it
        # drives the source module.
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
        self.m_source = SourceModuleHnNSF(sampling_rate=h["sampling_rate"], harmonic_num=8)
        self.noise_convs = nn.ModuleList()
        self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
        resblock = ResBlock1 if h["resblock"] == "1" else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
            c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        h["upsample_initial_channel"] // (2**i),
                        c_cur,
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            if i + 1 < len(h["upsample_rates"]):
                # Product of the remaining upsample rates, used as the stride
                # of the source convolution for this stage. Cast to a plain
                # int so Conv1d is not configured with NumPy scalars.
                stride_f0 = int(np.prod(h["upsample_rates"][i + 1 :]))
                self.noise_convs.append(
                    Conv1d(
                        1,
                        c_cur,
                        kernel_size=stride_f0 * 2,
                        stride=stride_f0,
                        padding=stride_f0 // 2,
                    )
                )
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h["upsample_initial_channel"] // (2 ** (i + 1))
            for k, d in zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"]):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.cond = nn.Conv1d(h["gin_channels"], h["upsample_initial_channel"], 1)

    def forward(self, x, f0, g=None):
        """Generate a waveform from features ``x`` and an F0 contour.

        Args:
            x: Input features passed to ``conv_pre``.
            f0: F0 contour; a channel axis is inserted and it is upsampled
                before being passed to the source module.
            g: Optional conditioning tensor; when given, ``cond(g)`` is added
                after ``conv_pre``. ``None`` skips the conditioning instead of
                failing inside the convolution.

        Returns:
            ``tanh`` of the final convolution output.
        """
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        har_source, noi_source, uv = self.m_source(f0)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # inject the harmonic source at this stage's resolution
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            # average the parallel residual blocks of this stage
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers, residual blocks and pre/post convs."""
        LOG.info("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
class iSTFT_Generator(torch.nn.Module):
    """Upsampling generator whose last layer predicts magnitude and phase for an inverse STFT.

    ``conv_post`` emits ``gen_istft_n_fft + 2`` channels: the first
    ``n_fft // 2 + 1`` are exponentiated into a magnitude, the remainder are
    mapped to a phase via ``pi * sin(.)``. ``gin_channels`` and the ``g``
    argument of ``forward`` are accepted but not used.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gen_istft_n_fft,
        gen_istft_hop_size,
        gin_channels=0,
    ):
        super().__init__()
        self.gen_istft_n_fft = gen_istft_n_fft
        self.gen_istft_hop_size = gen_istft_hop_size

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        # Each stage halves the channel count while upsampling in time.
        self.ups = nn.ModuleList()
        for stage, (rate, width) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            upsampler = ConvTranspose1d(in_ch, out_ch, width, rate, padding=(width - rate) // 2)
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, k, d))

        self.post_n_fft = self.gen_istft_n_fft
        self.conv_post = weight_norm(Conv1d(ch, self.post_n_fft + 2, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
        self.stft = TorchSTFT(
            filter_length=self.gen_istft_n_fft,
            hop_length=self.gen_istft_hop_size,
            win_length=self.gen_istft_n_fft,
        )

    def forward(self, x, g=None):
        """Return ``(waveform, None)`` for input features ``x``; ``g`` is ignored."""
        x = self.conv_pre(x)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, modules.LRELU_SLOPE))
            first = stage * self.num_kernels
            total = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                total = branch if total is None else total + branch
            # average over the parallel residual blocks of this stage
            x = total / self.num_kernels
        x = self.conv_post(self.reflection_pad(F.leaky_relu(x)))

        cut = self.post_n_fft // 2 + 1
        spec = torch.exp(x[:, :cut, :])
        phase = math.pi * torch.sin(x[:, cut:, :])
        out = self.stft.inverse(spec, phase).to(x.device)
        return out, None

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers, residual blocks and pre/post convs."""
        print("Removing weight norm...")
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
        for conv in (self.conv_pre, self.conv_post):
            remove_weight_norm(conv)
upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), k, u, padding=(k - u) // 2, ) ) ) self.resblocks = nn.ModuleList() for i in range(len(self.ups)): ch = upsample_initial_channel // (2 ** (i + 1)) for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): self.resblocks.append(resblock(ch, k, d)) self.post_n_fft = gen_istft_n_fft self.ups.apply(init_weights) self.reflection_pad = torch.nn.ReflectionPad1d((1, 0)) self.reshape_pixelshuffle = [] self.subband_conv_post = weight_norm(Conv1d(ch, self.subbands * (self.post_n_fft + 2), 7, 1, padding=3)) self.subband_conv_post.apply(init_weights) self.gen_istft_n_fft = gen_istft_n_fft self.gen_istft_hop_size = gen_istft_hop_size def forward(self, x, g=None): stft = TorchSTFT( filter_length=self.gen_istft_n_fft, hop_length=self.gen_istft_hop_size, win_length=self.gen_istft_n_fft, ).to(x.device) pqmf = PQMF(x.device, subbands=self.subbands).to(x.device, dtype=x.dtype) x = self.conv_pre(x) # [B, ch, length] for i in range(self.num_upsamples): x = F.leaky_relu(x, modules.LRELU_SLOPE) x = self.ups[i](x) xs = None for j in range(self.num_kernels): if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) x = xs / self.num_kernels x = F.leaky_relu(x) x = self.reflection_pad(x) x = self.subband_conv_post(x) x = torch.reshape(x, (x.shape[0], self.subbands, x.shape[1] // self.subbands, x.shape[-1])) spec = torch.exp(x[:, :, : self.post_n_fft // 2 + 1, :]) phase = math.pi * torch.sin(x[:, :, self.post_n_fft // 2 + 1 :, :]) y_mb_hat = stft.inverse( torch.reshape( spec, ( spec.shape[0] * self.subbands, self.gen_istft_n_fft // 2 + 1, spec.shape[-1], ), ), torch.reshape( phase, ( phase.shape[0] * self.subbands, self.gen_istft_n_fft // 2 + 1, phase.shape[-1], ), ), ) y_mb_hat = torch.reshape(y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1])) y_mb_hat = y_mb_hat.squeeze(-2) y_g_hat = pqmf.synthesis(y_mb_hat) return 
class Multistream_iSTFT_Generator(torch.nn.Module):
    """Sub-band iSTFT generator that merges its streams with a trainable convolution.

    ``subband_conv_post`` predicts magnitude and phase for ``subbands``
    streams. Each stream is inverted with an iSTFT, zero-stuffed by a factor
    of ``subbands`` through a fixed transposed convolution, and the streams
    are combined by ``multistream_conv_post``. ``gin_channels`` and the ``g``
    argument of ``forward`` are accepted but not used.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gen_istft_n_fft,
        gen_istft_hop_size,
        subbands,
        gin_channels=0,
    ):
        super().__init__()
        self.subbands = subbands
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
        block_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for stage, (rate, width) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            upsampler = ConvTranspose1d(in_ch, out_ch, width, rate, padding=(width - rate) // 2)
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, k, d))

        self.post_n_fft = gen_istft_n_fft
        self.ups.apply(init_weights)
        self.reflection_pad = torch.nn.ReflectionPad1d((1, 0))
        self.reshape_pixelshuffle = []

        self.subband_conv_post = weight_norm(Conv1d(ch, self.subbands * (self.post_n_fft + 2), 7, 1, padding=3))
        self.subband_conv_post.apply(init_weights)

        self.gen_istft_n_fft = gen_istft_n_fft
        self.gen_istft_hop_size = gen_istft_hop_size

        # Fixed filter with a single 1.0 per sub-band; used with a strided
        # transposed convolution to upsample every stream independently.
        updown_filter = torch.zeros((self.subbands, self.subbands, self.subbands)).float()
        band = torch.arange(self.subbands)
        updown_filter[band, band, 0] = 1.0
        self.register_buffer("updown_filter", updown_filter)

        self.multistream_conv_post = weight_norm(
            Conv1d(self.subbands, 1, kernel_size=63, bias=False, padding=get_padding(63, 1))
        )
        self.multistream_conv_post.apply(init_weights)

    def forward(self, x, g=None):
        """Return ``(merged waveform, upsampled sub-band signals)``; ``g`` is ignored."""
        stft = TorchSTFT(
            filter_length=self.gen_istft_n_fft,
            hop_length=self.gen_istft_hop_size,
            win_length=self.gen_istft_n_fft,
        ).to(x.device)

        x = self.conv_pre(x)  # [B, ch, length]
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, modules.LRELU_SLOPE))
            first = stage * self.num_kernels
            total = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                total = branch if total is None else total + branch
            x = total / self.num_kernels

        x = self.subband_conv_post(self.reflection_pad(F.leaky_relu(x)))
        # split the channel axis into (subbands, channels per sub-band)
        x = torch.reshape(x, (x.shape[0], self.subbands, x.shape[1] // self.subbands, x.shape[-1]))

        cut = self.post_n_fft // 2 + 1
        spec = torch.exp(x[:, :, :cut, :])
        phase = math.pi * torch.sin(x[:, :, cut:, :])

        # fold the sub-band axis into the batch axis for the inverse STFT
        bins = self.gen_istft_n_fft // 2 + 1
        flat_spec = torch.reshape(spec, (spec.shape[0] * self.subbands, bins, spec.shape[-1]))
        flat_phase = torch.reshape(phase, (phase.shape[0] * self.subbands, bins, phase.shape[-1]))
        y_mb_hat = stft.inverse(flat_spec, flat_phase)
        y_mb_hat = torch.reshape(y_mb_hat, (x.shape[0], self.subbands, 1, y_mb_hat.shape[-1]))
        y_mb_hat = y_mb_hat.squeeze(-2)

        y_mb_hat = F.conv_transpose1d(
            y_mb_hat,
            self.updown_filter.to(x.device) * self.subbands,
            stride=self.subbands,
        )
        y_g_hat = self.multistream_conv_post(y_mb_hat)
        return y_g_hat, y_mb_hat

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsamplers and residual blocks."""
        print("Removing weight norm...")
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
    """
    Design prototype filter for PQMF.

    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.

    Args:
        taps (int): The number of filter taps; must be even.
        cutoff_ratio (float): Cut-off frequency ratio, strictly between 0 and 1.
        beta (float): Beta coefficient for kaiser window.

    Returns:
        ndarray: Impulse response of prototype filter (taps + 1,).

    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
        https://ieeexplore.ieee.org/abstract/document/681427

    """
    # check the arguments are valid
    assert taps % 2 == 0, "The number of taps mush be even number."
    assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."

    # ideal low-pass response sampled symmetrically around the centre tap
    omega_c = np.pi * cutoff_ratio
    centred = np.arange(taps + 1) - 0.5 * taps
    with np.errstate(invalid="ignore"):
        ideal = np.sin(omega_c * centred) / (np.pi * centred)
    # the centre tap is 0 / 0 above; replace the resulting nan
    ideal[taps // 2] = np.cos(0) * cutoff_ratio

    # apply kaiser window
    return ideal * kaiser(taps + 1, beta)


class PQMF(torch.nn.Module):
    """
    PQMF module.

    This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.

    .. _`Near-perfect-reconstruction pseudo-QMF banks`:
        https://ieeexplore.ieee.org/document/258122

    """

    def __init__(self, device, subbands=8, taps=62, cutoff_ratio=0.15, beta=9.0):
        """
        Initialize PQMF module.

        Args:
            device: Device on which the filter tensors are created.
            subbands (int): The number of subbands.
            taps (int): The number of filter taps.
            cutoff_ratio (float): Cut-off frequency ratio.
            beta (float): Beta coefficient for kaiser window.

        """
        super().__init__()

        # cosine-modulate the prototype filter once per sub-band
        h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
        h_analysis = np.zeros((subbands, len(h_proto)))
        h_synthesis = np.zeros((subbands, len(h_proto)))
        positions = np.arange(taps + 1) - ((taps - 1) / 2)
        for k in range(subbands):
            modulation = (2 * k + 1) * (np.pi / (2 * subbands)) * positions
            shift = (-1) ** k * np.pi / 4
            h_analysis[k] = 2 * h_proto * np.cos(modulation + shift)
            h_synthesis[k] = 2 * h_proto * np.cos(modulation - shift)

        # convert to tensor and register coefficients as buffer
        analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1).to(device)
        synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0).to(device)
        self.register_buffer("analysis_filter", analysis_filter)
        self.register_buffer("synthesis_filter", synthesis_filter)

        # filter for downsampling & upsampling: a single 1.0 per sub-band
        updown_filter = torch.zeros((subbands, subbands, subbands)).float().to(device)
        band = torch.arange(subbands)
        updown_filter[band, band, 0] = 1.0
        self.register_buffer("updown_filter", updown_filter)
        self.subbands = subbands

        # keep padding info
        self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)

    def analysis(self, x):
        """
        Analysis with PQMF.

        Args:
            x (Tensor): Input tensor (B, 1, T).

        Returns:
            Tensor: Output tensor (B, subbands, T // subbands).

        """
        filtered = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(filtered, self.updown_filter, stride=self.subbands)

    def synthesis(self, x):
        """
        Synthesis with PQMF.

        Args:
            x (Tensor): Input tensor (B, subbands, T // subbands).

        Returns:
            Tensor: Output tensor (B, 1, T).

        """
        # NOTE(kan-bayashi): Power will be decreased so here multiply by
        #   subbands.
        #   Not sure this is the correct way, it is better to check again.
        # TODO(kan-bayashi): Understand the reconstruction procedure
        upsampled = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
        return F.conv1d(self.pad_fn(upsampled), self.synthesis_filter)
def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    norm : [optional]
        Passed through to `librosa.util.normalize`.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function

    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` is passed by keyword: newer librosa releases reject it positionally.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    The forward and inverse transforms are implemented as (transposed) 1-D
    convolutions with a fixed Fourier basis stored in buffers.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window="hann"):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # keep the non-redundant half; stack real rows above imaginary rows
        cutoff = int(self.filter_length / 2 + 1)
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert filter_length >= win_length
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # `size` is passed by keyword: newer librosa releases reject it positionally.
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        """Return ``(magnitude, phase)`` of a 2-D input ``(batch, samples)``."""
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode="reflect",
        )
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            self.forward_basis,
            stride=self.hop_length,
            padding=0,
        )

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        # `.data` keeps the phase out of the autograd graph, as before
        phase = torch.atan2(imag_part.data, real_part.data)

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct a signal ``(batch, 1, samples)`` from magnitude and phase."""
        recombine_magnitude_phase = torch.cat([magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            # `device` is an attribute, not a method; calling it raised TypeError.
            device = inverse_transform.device
            approx_nonzero_indices = torch.from_numpy(np.where(window_sum > tiny(window_sum))[0]).to(device)
            window_sum = torch.from_numpy(window_sum).to(device)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # drop the half-frame of padding added on each side by `transform`
        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2)]

        return inverse_transform

    def forward(self, input_data):
        """Transform and immediately invert ``input_data``; caches magnitude and phase on ``self``."""
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction


class TorchSTFT(torch.nn.Module):
    """STFT / inverse STFT built on ``torch.stft`` and ``torch.istft``."""

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window="hann"):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        # Plain tensor attribute (not a buffer), so it does not follow
        # `.to(device)`; it is moved explicitly where it is used.
        self.window = torch.from_numpy(get_window(window, win_length, fftbins=True).astype(np.float32))

    def transform(self, input_data):
        """Return ``(magnitude, phase)`` of ``input_data``."""
        forward_transform = torch.stft(
            input_data,
            self.filter_length,
            self.hop_length,
            self.win_length,
            # match the input's device, mirroring what `inverse` already does
            window=self.window.to(input_data.device),
            return_complex=True,
        )

        return torch.abs(forward_transform), torch.angle(forward_transform)

    def inverse(self, magnitude, phase):
        """Reconstruct a signal from magnitude and phase; output has a singleton channel axis."""
        inverse_transform = torch.istft(
            magnitude * torch.exp(phase * 1j),
            self.filter_length,
            self.hop_length,
            self.win_length,
            window=self.window.to(magnitude.device),
        )

        return inverse_transform.unsqueeze(-2)  # unsqueeze to stay consistent with conv_transpose1d implementation

    def forward(self, input_data):
        """Transform and immediately invert ``input_data``; caches magnitude and phase on ``self``."""
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
def stft(x, fft_size, hop_size, win_length, window):
    """
    Perform STFT and convert to magnitude spectrogram.

    Args:
        x (Tensor): Input signal tensor (B, T).
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length.
        window (Tensor): Window tensor; it is moved to the device of ``x``.

    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).

    """
    # ``return_complex=False`` is deprecated in PyTorch; take the real and
    # imaginary parts from the complex result instead.
    x_stft = torch.stft(x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True)
    real = x_stft.real
    imag = x_stft.imag

    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
    return torch.sqrt(torch.clamp(real**2 + imag**2, min=1e-7)).transpose(2, 1)


class SpectralConvergengeLoss(torch.nn.Module):
    """Spectral convergence loss module."""

    def __init__(self):
        """Initialize spectral convergence loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """
        Calculate forward propagation.

        Args:
            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).

        Returns:
            Tensor: Spectral convergence loss value, i.e. the norm of the
                difference divided by the norm of ``y_mag``.

        """
        return torch.norm(y_mag - x_mag) / torch.norm(y_mag)  # MB-iSTFT-VITS changed here due to codespell
class STFTLoss(torch.nn.Module):
    """Single-resolution STFT loss (spectral convergence + log magnitude)."""

    def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
        """
        Set up the STFT parameters and the two sub-losses.

        Args:
            fft_size (int): FFT size.
            shift_size (int): Hop size between frames.
            win_length (int): Window length.
            window (str): Name of a window factory looked up on ``torch``.

        """
        super().__init__()
        self.fft_size, self.shift_size, self.win_length = fft_size, shift_size, win_length
        window_factory = getattr(torch, window)
        self.window = window_factory(win_length)
        self.spectral_convergenge_loss = SpectralConvergengeLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """
        Compute both loss terms for a predicted / ground-truth pair.

        Args:
            x (Tensor): Predicted signal (B, T).
            y (Tensor): Groundtruth signal (B, T).

        Returns:
            Tensor: Spectral convergence loss value.
            Tensor: Log STFT magnitude loss value.

        """
        stft_args = (self.fft_size, self.shift_size, self.win_length, self.window)
        x_mag, y_mag = (stft(signal, *stft_args) for signal in (x, y))
        sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
        return sc_loss, mag_loss
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform into (T // period, period) and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Equality (not truthiness) test kept on purpose so non-bool inputs behave as before.
        if use_spectral_norm == False:  # noqa: E712
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # Four strided layers widen the channels, then one stride-1 layer keeps 1024.
        widths = (1, 32, 128, 512, 1024)
        layers = [
            norm_f(
                Conv2d(
                    c_in,
                    c_out,
                    (kernel_size, 1),
                    (stride, 1),
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ]
        layers.append(
            norm_f(
                Conv2d(
                    1024,
                    1024,
                    (kernel_size, 1),
                    1,
                    padding=(get_padding(kernel_size, 1), 0),
                )
            )
        )
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for a (B, C, T) input."""
        fmap = []

        # 1d to 2d: right-pad with reflection so T becomes a multiple of the period
        batch, channels, length = x.shape
        remainder = length % self.period
        if remainder != 0:
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            length = length + n_pad
        x = x.view(batch, channels, length // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per period in (2, 3, 5, 7, 11)."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(DiscriminatorP(period, use_spectral_norm=use_spectral_norm) for period in periods)
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """
        Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns four lists, each with one entry per sub-discriminator:
        scores for ``y``, scores for ``y_hat``, feature maps for ``y``,
        feature maps for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            real_score, real_fmap = disc(y)
            fake_score, fake_fmap = disc(y_hat)
            y_d_rs.append(real_score)
            y_d_gs.append(fake_score)
            fmap_rs.append(real_fmap)
            fmap_gs.append(fake_fmap)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class SpeakerEncoder(torch.nn.Module):
    """LSTM speaker encoder producing L2-normalised utterance embeddings.

    Args:
        mel_n_channels: Feature size of each input frame (LSTM input size).
        model_num_layers: Number of stacked LSTM layers.
        model_hidden_size: LSTM hidden size.
        model_embedding_size: Size of the output embedding.
    """

    # Floor for the embedding norm: ReLU can yield an all-zero vector, and
    # dividing by a zero norm would turn the whole embedding into NaN.
    _NORM_EPS = 1e-12

    def __init__(
        self,
        mel_n_channels=80,
        model_num_layers=3,
        model_hidden_size=256,
        model_embedding_size=256,
    ):
        super().__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        """Embed a batch of frame sequences shaped (batch, frames, mel_n_channels).

        Returns a (batch, model_embedding_size) tensor whose rows have unit L2
        norm, or are all zeros when the ReLU output is entirely zero.
        """
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        # hidden[-1] is the final hidden state of the last LSTM layer.
        embeds_raw = self.relu(self.linear(hidden[-1]))
        norm = torch.norm(embeds_raw, dim=1, keepdim=True)
        return embeds_raw / norm.clamp_min(self._NORM_EPS)

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        """Return index tensors for windows of ``partial_frames`` frames, ``partial_hop`` apart.

        Window starts run over ``range(0, total_frames - partial_frames, partial_hop)``.
        """
        return [
            torch.arange(start, start + partial_frames)
            for start in range(0, total_frames - partial_frames, partial_hop)
        ]

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        """Embed one utterance, averaging over overlapping partial windows when it is long.

        ``mel`` is indexed as (batch, frames, ...); the stacking/squeeze below
        assumes a leading batch dimension of 1 -- TODO confirm against callers.
        Runs under ``torch.no_grad()``.
        """
        mel_len = mel.size(1)
        last_mel = mel[:, -partial_frames:]

        if mel_len > partial_frames:
            mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
            mels = [mel[:, s] for s in mel_slices]
            # Always include the trailing window so the end of the utterance is covered.
            mels.append(last_mel)
            mels = torch.stack(tuple(mels), 0).squeeze(1)

            with torch.no_grad():
                partial_embeds = self(mels)
            embed = torch.mean(partial_embeds, dim=0).unsqueeze(0)
            # embed = embed / torch.linalg.norm(embed, 2)
        else:
            with torch.no_grad():
                embed = self(last_mel)

        return embed
class TextEncoder(nn.Module):
    """Prior encoder: adds a coarse-f0 embedding, runs an attention encoder, samples a latent."""

    def __init__(
        self,
        out_channels,
        hidden_channels,
        kernel_size,
        n_layers,
        gin_channels=0,
        filter_channels=None,
        n_heads=None,
        p_dropout=None,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        # Projects to 2 * out_channels: the first half is the mean, the second the log-scale.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
        # Embedding table with 256 entries, indexed by the f0 values passed to forward.
        self.f0_emb = nn.Embedding(256, hidden_channels)
        self.enc_ = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)

    def forward(self, x, x_mask, f0=None, noice_scale=1):
        """
        Return ``(z, m, logs, x_mask)``.

        ``f0`` defaults to None in the signature but is fed to the embedding
        unconditionally, so callers have to supply it. ``noice_scale`` scales
        the Gaussian noise added to the mean when sampling ``z``.
        """
        pitch_embedding = self.f0_emb(f0).transpose(1, 2)
        hidden = self.enc_((x + pitch_embedding) * x_mask, x_mask)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        noise = torch.randn_like(m) * torch.exp(logs) * noice_scale
        z = (m + noise) * x_mask

        return z, m, logs, x_mask
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """
    Least-squares discriminator loss summed over paired discriminator outputs.

    For each pair the real term is ``mean((1 - real) ** 2)`` and the generated
    term is ``mean(generated ** 2)``.

    Returns:
        total loss (the int ``0`` when no pairs are given, otherwise a tensor),
        list of per-discriminator real terms as Python floats,
        list of per-discriminator generated terms as Python floats.
    """
    total = 0
    real_terms, generated_terms = [], []
    for real_out, generated_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out.float()) ** 2)
        generated_term = torch.mean(generated_out.float() ** 2)
        total += real_term + generated_term
        real_terms.append(real_term.item())
        generated_terms.append(generated_term.item())

    return total, real_terms, generated_terms
def spectrogram_torch(y, hps, center=False):
    """
    Compute a linear magnitude spectrogram of ``y``.

    PARAMS
    ------
    y: waveform tensor; assumed to be (batch, samples) since it is unsqueezed to
        3-D for reflect padding and squeezed back -- TODO confirm against callers
    hps: hyper-parameters; ``hps.data.filter_length``, ``hps.data.hop_length`` and
        ``hps.data.win_length`` are read
    center: forwarded to ``torch.stft``

    Returns ``sqrt(real**2 + imag**2 + 1e-6)`` of the one-sided STFT.
    """
    # Values outside [-1, 1] are only reported, not clipped.
    y_min = torch.min(y)
    if y_min < -1.0:
        # Lazy %-formatting: the previous call passed the value as an argument
        # with no placeholder, so logging reported a formatting error instead.
        LOG.info("min value is %s", y_min)
    y_max = torch.max(y)
    if y_max > 1.0:
        LOG.info("max value is %s", y_max)

    n_fft = hps.data.filter_length
    hop_size = hps.data.hop_length
    win_size = hps.data.win_length

    # Hann windows are cached per (window size, dtype, device) in the module-level
    # ``hann_window`` dict; the key format is shared with mel_spectrogram_torch.
    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    # Manual reflect padding of (n_fft - hop_size) / 2 samples on each side.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # ``return_complex=False`` is deprecated; view_as_real gives the same
    # trailing (real, imag) layout the magnitude computation below expects.
    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[wnsize_dtype_device],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
class LayerNorm(nn.Module):
    """Layer normalisation applied over dimension 1 of the input.

    Dimension 1 is swapped to the last position, normalised with learnable
    ``gamma`` (scale) and ``beta`` (shift), then swapped back.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        channels_last = x.transpose(1, -1)
        normalised = F.layer_norm(
            channels_last,
            (self.channels,),
            weight=self.gamma,
            bias=self.beta,
            eps=self.eps,
        )
        return normalised.transpose(1, -1)
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer; padding keeps the length unchanged.
            dilation = kernel_size**layer_idx
            padding = dilation * (kernel_size - 1) // 2
            depthwise = nn.Conv1d(
                channels,
                channels,
                kernel_size,
                groups=channels,
                dilation=dilation,
                padding=padding,
            )
            pointwise = nn.Conv1d(channels, channels, 1)
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(pointwise)
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Apply ``n_layers`` residual depthwise + pointwise blocks; ``g`` (if given) is added to ``x`` first."""
        if g is not None:
            x = x + g
        for idx in range(self.n_layers):
            branch = self.convs_sep[idx](x * x_mask)
            branch = F.gelu(self.norms_1[idx](branch))
            branch = self.convs_1x1[idx](branch)
            branch = F.gelu(self.norms_2[idx](branch))
            x = x + self.drop(branch)
        return x * x_mask
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, dilation-1 conv) pairs with leaky-ReLU activations."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # Weight-normalised stride-1 conv padded via get_padding for this dilation.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        # Only the first three dilation entries are used.
        self.convs1 = nn.ModuleList([make_conv(dilation[0]), make_conv(dilation[1]), make_conv(dilation[2])])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Run the residual pairs; when ``x_mask`` is given it is applied before each conv and to the result."""
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            branch = F.leaky_relu(dilated_conv(branch), LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            x = plain_conv(branch) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every conv in both lists."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
class Flip(nn.Module):
    """Flow step that reverses the order of dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """
        Flip ``x`` along dimension 1.

        Forward mode returns ``(flipped, logdet)`` where ``logdet`` is a zero
        vector with one entry per batch item, matching ``x`` in dtype and
        device. Reverse mode returns only the flipped tensor.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0), dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together a speaker embedding, a content pre-net, a prior encoder
    (``enc_p``), a posterior encoder (``enc_q``), a flow, an f0 decoder and a
    waveform decoder selected by ``type_`` ("hifi-gan" or one of the iSTFT
    variants). Unrecognised keyword arguments are reported with a warning and
    otherwise ignored.
    """

    def __init__(
        self,
        spec_channels: int,
        segment_size: int,
        inter_channels: int,
        hidden_channels: int,
        filter_channels: int,
        n_heads: int,
        n_layers: int,
        kernel_size: int,
        p_dropout: float,
        resblock: str,
        resblock_kernel_sizes: Sequence[int],
        resblock_dilation_sizes: Sequence[Sequence[int]],
        upsample_rates: Sequence[int],
        upsample_initial_channel: int,
        upsample_kernel_sizes: Sequence[int],
        gin_channels: int,
        ssl_dim: int,
        n_speakers: int,
        sampling_rate: int = 44100,
        type_: Literal["hifi-gan", "istft", "ms-istft", "mb-istft"] = "hifi-gan",
        gen_istft_n_fft: int = 16,
        gen_istft_hop_size: int = 4,
        subbands: int = 4,
        **kwargs: Any,
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.ssl_dim = ssl_dim
        self.n_speakers = n_speakers
        self.sampling_rate = sampling_rate
        self.type_ = type_
        self.gen_istft_n_fft = gen_istft_n_fft
        self.gen_istft_hop_size = gen_istft_hop_size
        self.subbands = subbands
        # Extra config keys are tolerated but surfaced so typos do not go unnoticed.
        if kwargs:
            warnings.warn(f"Unused arguments: {kwargs}")

        # Speaker-id -> conditioning vector of size gin_channels.
        self.emb_g = nn.Embedding(n_speakers, gin_channels)

        # ssl_dim may be None (despite the annotation); the input channel count
        # is then inferred lazily on the first call.
        if ssl_dim is None:
            self.pre = nn.LazyConv1d(hidden_channels, kernel_size=5, padding=2)
        else:
            self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)

        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels=filter_channels,
            n_heads=n_heads,
            n_layers=n_layers,
            kernel_size=kernel_size,
            p_dropout=p_dropout,
        )

        LOG.info(f"Decoder type: {type_}")
        if type_ == "hifi-gan":
            hps = {
                "sampling_rate": sampling_rate,
                "inter_channels": inter_channels,
                "resblock": resblock,
                "resblock_kernel_sizes": resblock_kernel_sizes,
                "resblock_dilation_sizes": resblock_dilation_sizes,
                "upsample_rates": upsample_rates,
                "upsample_initial_channel": upsample_initial_channel,
                "upsample_kernel_sizes": upsample_kernel_sizes,
                "gin_channels": gin_channels,
            }
            self.dec = NSFHifiGANGenerator(h=hps)
            # self.mb tells forward/infer whether the decoder returns (o, o_mb).
            self.mb = False
        else:
            hps = {
                "initial_channel": inter_channels,
                "resblock": resblock,
                "resblock_kernel_sizes": resblock_kernel_sizes,
                "resblock_dilation_sizes": resblock_dilation_sizes,
                "upsample_rates": upsample_rates,
                "upsample_initial_channel": upsample_initial_channel,
                "upsample_kernel_sizes": upsample_kernel_sizes,
                "gin_channels": gin_channels,
                "gen_istft_n_fft": gen_istft_n_fft,
                "gen_istft_hop_size": gen_istft_hop_size,
                "subbands": subbands,
            }

            # gen_istft_n_fft, gen_istft_hop_size, subbands
            if type_ == "istft":
                # The "subbands" entry is removed before calling iSTFT_Generator.
                del hps["subbands"]
                self.dec = iSTFT_Generator(**hps)
            elif type_ == "ms-istft":
                self.dec = Multistream_iSTFT_Generator(**hps)
            elif type_ == "mb-istft":
                self.dec = Multiband_iSTFT_Generator(**hps)
            else:
                raise ValueError(f"Unknown type: {type_}")
            self.mb = True

        self.enc_q = Encoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
        self.f0_decoder = F0Decoder(
            1,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            spk_channels=gin_channels,
        )
        # Two-entry embedding indexed by the uv flag cast to long.
        self.emb_uv = nn.Embedding(2, hidden_channels)

    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
        """
        Training pass.

        ``g`` defaults to None in the signature but is passed to the speaker
        embedding unconditionally, so callers have to supply speaker ids.

        Returns a tuple ``(o, o_mb, ids_slice, spec_mask,
        (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0)``;
        ``o_mb`` is None for the HiFi-GAN decoder.
        """
        g = self.emb_g(g).transpose(1, 2)

        # ssl prenet
        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)

        # f0 predict
        # 2595 * log10(1 + f / 700) is the usual Hz -> mel formula; here it is further divided by 500.
        lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
        norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv)
        pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)

        # encoder
        # The sampled prior latent (z_ptemp) is not used below; only m_p / logs_p are returned.
        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)

        # flow
        z_p = self.flow(z, spec_mask, g=g)
        # Slice a random segment of the latent together with the matching pitch.
        z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)

        # MB-iSTFT-VITS
        if self.mb:
            o, o_mb = self.dec(z_slice, g=g)
        # HiFi-GAN
        else:
            o = self.dec(z_slice, g=g, f0=pitch_slice)
            o_mb = None
        return (
            o,
            o_mb,
            ids_slice,
            spec_mask,
            (z, z_p, m_p, logs_p, m_q, logs_q),
            pred_lf0,
            norm_lf0,
            lf0,
        )

    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
        """
        Inference pass returning the decoder's primary output.

        When ``predict_f0`` is true, ``f0`` is replaced by the f0 decoder's
        prediction (converted back with the inverse of the formula used for
        ``lf0``). ``noice_scale`` is forwarded to the prior encoder.
        """
        # Every item is treated as full length: c_lengths is filled with c.size(-1).
        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        g = self.emb_g(g).transpose(1, 2)
        x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2)

        if predict_f0:
            lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
            norm_lf0 = so_vits_svc_fork.f0.normalize_f0(lf0, x_mask, uv, random_scale=False)
            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
            # Invert the lf0 transform to get the predicted value back on the f0 scale.
            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)

        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
        z = self.flow(z_p, c_mask, g=g, reverse=True)

        # MB-iSTFT-VITS
        if self.mb:
            # The second decoder output is discarded at inference time.
            o, o_mb = self.dec(z * c_mask, g=g)
        else:
            o = self.dec(z * c_mask, g=g, f0=f0)
        return o
"max_speclen": 512, "port": "8001", "keep_ckpts": 3, "fft_sizes": [768, 1366, 342], "hop_sizes": [60, 120, 20], "win_lengths": [300, 600, 120], "window": "hann_window", "num_workers": 4, "log_version": 0, "ckpt_name_by_step": false, "accumulate_grad_batches": 1 }, "data": { "training_files": "filelists/44k/train.txt", "validation_files": "filelists/44k/val.txt", "max_wav_value": 32768.0, "sampling_rate": 44100, "filter_length": 2048, "hop_length": 512, "win_length": 2048, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": 22050, "contentvec_final_proj": false }, "model": { "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [ [1, 3, 5], [1, 3, 5], [1, 3, 5] ], "upsample_rates": [8, 4], "upsample_initial_channel": 512, "upsample_kernel_sizes": [32, 16], "n_layers_q": 3, "use_spectral_norm": false, "gin_channels": 256, "ssl_dim": 768, "n_speakers": 200, "type_": "ms-istft", "gen_istft_n_fft": 16, "gen_istft_hop_size": 4, "subbands": 4, "pretrained": { "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" } }, "spk": {} } ================================================ FILE: src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1-legacy.json ================================================ { "train": { "log_interval": 200, "eval_interval": 800, "seed": 1234, "epochs": 10000, "learning_rate": 0.0001, "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 16, "fp16_run": false, "bf16_run": false, "lr_decay": 0.999875, "segment_size": 10240, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, "use_sr": true, "max_speclen": 512, "port": "8001", "keep_ckpts": 3, 
"num_workers": 4, "log_version": 0, "ckpt_name_by_step": false, "accumulate_grad_batches": 1 }, "data": { "training_files": "filelists/44k/train.txt", "validation_files": "filelists/44k/val.txt", "max_wav_value": 32768.0, "sampling_rate": 44100, "filter_length": 2048, "hop_length": 512, "win_length": 2048, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": 22050 }, "model": { "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [ [1, 3, 5], [1, 3, 5], [1, 3, 5] ], "upsample_rates": [8, 8, 2, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 4, 4, 4], "n_layers_q": 3, "use_spectral_norm": false, "gin_channels": 256, "ssl_dim": 256, "n_speakers": 200, "pretrained": { "D_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/D_0.pth", "G_0.pth": "https://huggingface.co/therealvul/so-vits-svc-4.0-init/resolve/main/G_0.pth" } }, "spk": {} } ================================================ FILE: src/so_vits_svc_fork/preprocessing/config_templates/so-vits-svc-4.0v1.json ================================================ { "train": { "log_interval": 100, "eval_interval": 200, "seed": 1234, "epochs": 10000, "learning_rate": 0.0001, "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 16, "fp16_run": false, "bf16_run": false, "lr_decay": 0.999875, "segment_size": 10240, "init_lr_ratio": 1, "warmup_epochs": 0, "c_mel": 45, "c_kl": 1.0, "use_sr": true, "max_speclen": 512, "port": "8001", "keep_ckpts": 3, "num_workers": 4, "log_version": 0, "ckpt_name_by_step": false, "accumulate_grad_batches": 1 }, "data": { "training_files": "filelists/44k/train.txt", "validation_files": "filelists/44k/val.txt", "max_wav_value": 32768.0, "sampling_rate": 44100, "filter_length": 2048, "hop_length": 512, "win_length": 2048, "n_mel_channels": 80, "mel_fmin": 0.0, "mel_fmax": 22050, 
"contentvec_final_proj": false }, "model": { "inter_channels": 192, "hidden_channels": 192, "filter_channels": 768, "n_heads": 2, "n_layers": 6, "kernel_size": 3, "p_dropout": 0.1, "resblock": "1", "resblock_kernel_sizes": [3, 7, 11], "resblock_dilation_sizes": [ [1, 3, 5], [1, 3, 5], [1, 3, 5] ], "upsample_rates": [8, 8, 2, 2, 2], "upsample_initial_channel": 512, "upsample_kernel_sizes": [16, 16, 4, 4, 4], "n_layers_q": 3, "use_spectral_norm": false, "gin_channels": 256, "ssl_dim": 768, "n_speakers": 200, "type_": "hifi-gan", "pretrained": { "D_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_D_320000.pth", "G_0.pth": "https://huggingface.co/datasets/ms903/sovits4.0-768vec-layer12/resolve/main/sovits_768l12_pre_large_320k/clean_G_320000.pth" } }, "spk": {} } ================================================ FILE: src/so_vits_svc_fork/preprocessing/preprocess_classify.py ================================================ from __future__ import annotations from logging import getLogger from pathlib import Path import keyboard import librosa import sounddevice as sd import soundfile as sf from rich.console import Console from tqdm.rich import tqdm LOG = getLogger(__name__) def preprocess_classify(input_dir: Path | str, output_dir: Path | str, create_new: bool = True) -> None: # paths input_dir_ = Path(input_dir) output_dir_ = Path(output_dir) speed = 1 if not input_dir_.is_dir(): raise ValueError(f"{input_dir} is not a directory.") output_dir_.mkdir(exist_ok=True) console = Console() # get audio paths and folders audio_paths = list(input_dir_.glob("*.*")) last_folders = [x for x in output_dir_.glob("*") if x.is_dir()] console.print("Press ↑ or ↓ to change speed. 
Press any other key to classify.") console.print(f"Folders: {[x.name for x in last_folders]}") pbar_description = "" pbar = tqdm(audio_paths) for audio_path in pbar: # read file audio, sr = sf.read(audio_path) # update description duration = librosa.get_duration(y=audio, sr=sr) pbar_description = f"{duration:.1f} {pbar_description}" pbar.set_description(pbar_description) while True: # start playing sd.play(librosa.effects.time_stretch(audio, rate=speed), sr, loop=True) # wait for key press key = str(keyboard.read_key()) if key == "down": speed /= 1.1 console.print(f"Speed: {speed:.2f}") elif key == "up": speed *= 1.1 console.print(f"Speed: {speed:.2f}") else: break # stop playing sd.stop() # print if folder changed folders = [x for x in output_dir_.glob("*") if x.is_dir()] if folders != last_folders: console.print(f"Folders updated: {[x.name for x in folders]}") last_folders = folders # get folder folder_candidates = [x for x in folders if x.name.startswith(key)] if len(folder_candidates) == 0: if create_new: folder = output_dir_ / key else: console.print(f"No folder starts with {key}.") continue else: if len(folder_candidates) > 1: LOG.warning( f"Multiple folders ({[x.name for x in folder_candidates]}) start with {key}. Using first one ({folder_candidates[0].name})." 
def preprocess_config(
    input_dir: Path | str,
    train_list_path: Path | str,
    val_list_path: Path | str,
    test_list_path: Path | str,
    config_path: Path | str,
    config_name: str,
):
    """Write train/val/test file lists and a training config for a dataset.

    Every sub-directory of ``input_dir`` is treated as one speaker. Its ``*.wav``
    files (searched recursively) that are at least 0.3 seconds long are shuffled
    with a fixed seed; two go to validation, two to test and the rest to training.

    Args:
        input_dir: Directory containing one sub-directory per speaker.
        train_list_path: Destination of the training file list.
        val_list_path: Destination of the validation file list.
        test_list_path: Destination of the test file list.
        config_path: Destination of the generated JSON config.
        config_name: Name of a template in ``CONFIG_TEMPLATE_DIR``, with or
            without the ``.json`` suffix.

    Raises:
        ValueError: If a speaker directory has four or fewer usable files.
    """
    input_dir = Path(input_dir)
    train_list_path = Path(train_list_path)
    val_list_path = Path(val_list_path)
    test_list_path = Path(test_list_path)
    config_path = Path(config_path)

    train = []
    val = []
    test = []
    spk_dict = {}
    random = np.random.RandomState(1234)
    for speaker in os.listdir(input_dir):
        speaker_dir = input_dir / speaker
        # Stray files (e.g. .DS_Store, a README) are not speakers. Registering
        # them would reserve a speaker id and then abort with "too few files".
        if not speaker_dir.is_dir():
            LOG.warning(f"skip {speaker_dir} because it is not a directory.")
            continue
        spk_dict[speaker] = len(spk_dict)

        paths = []
        for path in tqdm(list(speaker_dir.rglob("*.wav"))):
            # NOTE(review): newer librosa releases rename `filename` to `path`;
            # confirm against the pinned librosa version before changing it.
            if get_duration(filename=path) < 0.3:
                LOG.warning(f"skip {path} because it is too short.")
                continue
            paths.append(path)
        random.shuffle(paths)
        if len(paths) <= 4:
            raise ValueError(f"too few files in {input_dir / speaker} (expected at least 5).")
        train += paths[2:-2]
        val += paths[:2]
        test += paths[-2:]

    for list_path, items in (
        (train_list_path, train),
        (val_list_path, val),
        (test_list_path, test),
    ):
        LOG.info(f"Writing {list_path}")
        list_path.parent.mkdir(parents=True, exist_ok=True)
        list_path.write_text("\n".join([x.as_posix() for x in items]), encoding="utf-8")

    template_name = config_name if config_name.endswith(".json") else config_name + ".json"
    # json.loads already returns a fresh object, so no defensive copy is needed.
    config = json.loads((CONFIG_TEMPLATE_DIR / template_name).read_text(encoding="utf-8"))
    config["spk"] = spk_dict
    config["data"]["training_files"] = train_list_path.as_posix()
    config["data"]["validation_files"] = val_list_path.as_posix()

    LOG.info(f"Writing {config_path}")
    config_path.parent.mkdir(parents=True, exist_ok=True)
    with config_path.open("w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
def preprocess_hubert_f0(
    input_dir: Path | str,
    config_path: Path | str,
    n_jobs: int | None = None,
    f0_method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "dio",
    force_rebuild: bool = False,
):
    """Run ``_process_batch`` over every ``*.wav`` file under ``input_dir`` in parallel.

    Args:
        input_dir: Directory searched recursively for ``*.wav`` files.
        config_path: Path of the config loaded with ``utils.get_hparams``.
        n_jobs: Number of worker processes. ``None`` derives it from the total
            GPU memory (capped by the CPU count). Negative values follow the
            joblib convention (``-1`` = all CPUs, ``-2`` = all but one, ...).
        f0_method: f0 estimator forwarded to the workers.
        force_rebuild: Forwarded to the workers.

    Raises:
        ValueError: If ``n_jobs`` is 0.
    """
    input_dir = Path(input_dir)
    config_path = Path(config_path)
    hps = utils.get_hparams(config_path)
    if n_jobs is None:
        # add cpu_count() to avoid SIGKILL
        memory = get_total_gpu_memory("total")
        memory_per_job = HUBERT_MEMORY_CREPE if f0_method == "crepe" else HUBERT_MEMORY
        n_jobs = min(
            max(memory // memory_per_job if memory is not None else 1, 1),
            cpu_count(),
        )
        LOG.info(f"n_jobs automatically set to {n_jobs}, memory: {memory} MiB")
    elif n_jobs < 0:
        # n_jobs is also used as the number of chunks below, and np.array_split
        # rejects non-positive section counts, so resolve the joblib-style
        # negative value to a real worker count first.
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning; use a positive value or -1 for all CPUs.")

    filepaths = list(input_dir.rglob("*.wav"))
    if not filepaths:
        # Nothing to do; avoid spawning a worker that would only load the model.
        LOG.warning(f"No wav files found in {input_dir}")
        return

    # Use at most one worker per 16 files.
    n_jobs = min(len(filepaths) // 16 + 1, n_jobs)
    shuffle(filepaths)
    filepath_chunks = np.array_split(filepaths, n_jobs)
    Parallel(n_jobs=n_jobs)(
        delayed(_process_batch)(
            filepaths=chunk,
            pbar_position=pbar_position,
            f0_method=f0_method,
            force_rebuild=force_rebuild,
            hps=hps,
        )
        for (pbar_position, chunk) in enumerate(filepath_chunks)
    )
def preprocess_resample(
    input_dir: Path | str,
    output_dir: Path | str,
    sampling_rate: int,
    n_jobs: int = -1,
    *,
    top_db: int = 30,
    frame_seconds: float = 0.1,
    hop_seconds: float = 0.05,
) -> None:
    """Preprocess audio files in input_dir and save them to output_dir.

    Each entry found under ``input_dir`` as ``<speaker>/.../<name>.<ext>`` is
    written to ``output_dir/<speaker>/<name>.wav``; name clashes get a numeric
    suffix. Entries directly under ``input_dir`` (no speaker folder) are skipped.

    Args:
        input_dir: Root of the raw dataset.
        output_dir: Root of the processed dataset.
        sampling_rate: Target sampling rate passed to ``_preprocess_one``.
        n_jobs: Number of joblib workers.
        top_db: Silence threshold passed to ``_preprocess_one``.
        frame_seconds: Frame length in seconds passed to ``_preprocess_one``.
        hop_seconds: Hop length in seconds passed to ``_preprocess_one``.

    Raises:
        ValueError: If nothing matching ``*.*`` exists under ``input_dir``.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    in_paths = list(input_dir.rglob("*.*"))
    if not in_paths:
        raise ValueError(f"No audio files found in {input_dir}")

    # Inputs and outputs are collected as pairs. Zipping two separate lists
    # after the loop pairs files with the wrong outputs as soon as one input
    # is skipped.
    in_and_out_paths = []
    used_out_paths = set()
    for in_path in in_paths:
        in_path_relative = in_path.relative_to(input_dir)
        if (
            not in_path.is_absolute()
            and is_relative_to(in_path, Path("dataset_raw") / "44k")
            # Only strip the legacy folder when it is really the leading part;
            # if input_dir itself is dataset_raw/44k there is nothing to strip
            # and relative_to("44k") would raise.
            and in_path_relative.parts[:1] == ("44k",)
        ):
            new_in_path_relative = in_path_relative.relative_to("44k")
            warnings.warn(
                f"Recommended folder structure has changed since v1.0.0. "
                "Please move your dataset directly under dataset_raw folder. "
                f"Recognized {in_path_relative} as {new_in_path_relative}"
            )
            in_path_relative = new_in_path_relative

        if len(in_path_relative.parts) < 2:
            # Not inside a speaker folder.
            continue
        speaker_name = in_path_relative.parts[0]
        file_name = in_path_relative.with_suffix(".wav").name
        out_path = _get_unique_filename(output_dir / speaker_name / file_name, used_out_paths)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        used_out_paths.add(out_path)
        in_and_out_paths.append((in_path, out_path))

    with tqdm_joblib(desc="Preprocessing", total=len(in_and_out_paths)):
        Parallel(n_jobs=n_jobs)(
            delayed(_preprocess_one)(
                *args,
                sr=sampling_rate,
                top_db=top_db,
                frame_seconds=frame_seconds,
                hop_seconds=hop_seconds,
            )
            for args in in_and_out_paths
        )
def preprocess_speaker_diarization(
    input_dir: Path | str,
    output_dir: Path | str,
    sr: int,
    *,
    min_speakers: int = 1,
    max_speakers: int = 1,
    huggingface_token: str | None = None,
    n_jobs: int = -1,
) -> None:
    """Run ``_process_one`` in parallel on every ``*.*`` entry under ``input_dir``.

    The results for ``<input_dir>/<sub>/<name>.<ext>`` are written to the
    directory ``<output_dir>/<sub>/<name>``. Both root directories are created
    if they do not exist.

    Args:
        input_dir: Root directory searched recursively.
        output_dir: Root directory for the per-file output folders.
        sr: Sampling rate forwarded to ``_process_one``.
        min_speakers: Forwarded to ``_process_one``.
        max_speakers: Forwarded to ``_process_one``.
        huggingface_token: Forwarded to ``_process_one``; a warning is logged
            if it does not start with ``hf_``.
        n_jobs: Number of joblib workers.
    """
    token_given = huggingface_token is not None
    if token_given and not huggingface_token.startswith("hf_"):
        LOG.warning("Huggingface token probably should start with hf_")
    if not torch.cuda.is_available():
        LOG.warning("CUDA is not available. This will be extremely slow.")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    for root in (input_dir, output_dir):
        root.mkdir(parents=True, exist_ok=True)

    input_paths = list(input_dir.rglob("*.*"))

    # One task per entry; each gets its own output folder named after the file stem.
    tasks = []
    for input_path in input_paths:
        target_dir = output_dir / input_path.relative_to(input_dir).parent / input_path.stem
        tasks.append(
            delayed(_process_one)(
                input_path,
                target_dir,
                sr,
                max_speakers=max_speakers,
                min_speakers=min_speakers,
                huggingface_token=huggingface_token,
            )
        )

    with tqdm_joblib(desc="Preprocessing speaker diarization", total=len(input_paths)):
        Parallel(n_jobs=n_jobs)(tasks)
def check_hubert_min_duration(audio: ndarray, sr: int, min_duration: float = 0.3) -> bool:
    """Return whether *audio* is long enough to be processed.

    Args:
        audio: Audio samples; only its length along the first axis is used.
        sr: Sampling rate of ``audio`` in samples per second.
        min_duration: Minimum accepted duration in seconds (inclusive).

    Returns:
        ``True`` if ``len(audio) / sr`` is at least ``min_duration``.
    """
    return len(audio) / sr >= min_duration
class VCDataModule(pl.LightningDataModule):
    """Lightning data module providing the training and validation loaders.

    Both ``TextAudioDataset`` instances are created eagerly in ``__init__``
    and share one ``TextAudioCollate`` instance as ``collate_fn``.
    """

    batch_size: int

    def __init__(self, hparams: Any):
        """Build the datasets from ``hparams``.

        Args:
            hparams: Hyperparameter object. ``hparams.train.batch_size`` is
                read here and the whole object is forwarded to
                ``TextAudioDataset``.
        """
        super().__init__()
        self.__hparams = hparams
        self.batch_size = hparams.train.batch_size
        # A non-integer setting (``train`` accepts strings such as "auto")
        # starts from a batch size of 1.
        if not isinstance(self.batch_size, int):
            self.batch_size = 1
        self.collate_fn = TextAudioCollate()

        # these should be called in setup(), but we need to calculate check_val_every_n_epoch
        self.train_dataset = TextAudioDataset(self.__hparams, is_validation=False)
        self.val_dataset = TextAudioDataset(self.__hparams, is_validation=True)

    def train_dataloader(self):
        """Return the training ``DataLoader``.

        The worker count is ``train.num_workers`` (default 8) capped at the
        CPU count.
        """
        num_workers = min(cpu_count(), self.__hparams.train.get("num_workers", 8))
        return DataLoader(
            self.train_dataset,
            num_workers=num_workers,
            batch_size=self.batch_size,
            collate_fn=self.collate_fn,
            # DataLoader raises ValueError for persistent_workers=True when
            # num_workers == 0, so only keep workers alive if there are any.
            persistent_workers=num_workers > 0,
        )

    def val_dataloader(self):
        """Return the validation ``DataLoader`` (one item per batch)."""
        return DataLoader(
            self.val_dataset,
            batch_size=1,
            collate_fn=self.collate_fn,
        )
def __init__(self, reset_optimizer: bool = False, **hparams: Any):
    """Build generator, discriminator, optimizers and schedulers, then resume.

    Args:
        reset_optimizer: Forwarded to ``self.load``, which passes it to
            ``utils.load_checkpoint`` as the skip-optimizer flag.
        **hparams: Hyperparameter sections; the ``train``, ``data`` and
            ``model`` sections are read here through ``self.hparams``.
    """
    super().__init__()
    self._temp_epoch = 0  # epoch to resume from; overwritten by load() when checkpoints exist
    # Register the constructor arguments; they are read back via self.hparams below.
    self.save_hyperparameters("reset_optimizer")
    self.save_hyperparameters(*[k for k in hparams.keys()])
    # Seed before any network is constructed.
    torch.manual_seed(self.hparams.train.seed)
    self.net_g = SynthesizerTrn(
        self.hparams.data.filter_length // 2 + 1,
        self.hparams.train.segment_size // self.hparams.data.hop_length,
        **self.hparams.model,
    )
    self.net_d = MultiPeriodDiscriminator(self.hparams.model.use_spectral_norm)
    # training_step drives both optimizers by hand (manual_backward / step).
    self.automatic_optimization = False
    self.learning_rate = self.hparams.train.learning_rate
    self.optim_g = torch.optim.AdamW(
        self.net_g.parameters(),
        self.learning_rate,
        betas=self.hparams.train.betas,
        eps=self.hparams.train.eps,
    )
    self.optim_d = torch.optim.AdamW(
        self.net_d.parameters(),
        self.learning_rate,
        betas=self.hparams.train.betas,
        eps=self.hparams.train.eps,
    )
    self.scheduler_g = torch.optim.lr_scheduler.ExponentialLR(self.optim_g, gamma=self.hparams.train.lr_decay)
    self.scheduler_d = torch.optim.lr_scheduler.ExponentialLR(self.optim_d, gamma=self.hparams.train.lr_decay)
    # Used by on_train_start to turn a batch index into a global step.
    self.optimizers_count = 2
    # Must run after the networks, optimizers and schedulers exist: load() writes into all of them.
    self.load(reset_optimizer)
    # Set to True by train() around batch-size tuning; disables checkpoint saving meanwhile.
    self.tuning = False
def save_checkpoints(self, adjust=1):
    """Write generator and discriminator checkpoints and prune old ones.

    Nothing is written while batch-size tuning or the sanity check is
    running, or on a device whose index is neither ``None`` nor 0.

    Args:
        adjust: Offset added to the current epoch and batch index.
            ``on_train_end`` passes 0 because the epoch counter is already
            final there.
    """
    if self.tuning or self.trainer.sanity_checking:
        return

    # only save checkpoints if we are on the main device
    device_index = getattr(self.device, "index", None)
    if device_index is not None and device_index != 0:
        return

    # `on_train_end` will be the actual epoch, not a -1, so we have to call it with `adjust = 0`
    current_epoch = self.current_epoch + adjust
    total_batch_idx = self.total_batch_idx - 1 + adjust

    # Both files share one suffix: the step count or the epoch number.
    name_by_step = self.hparams.train.get("ckpt_name_by_step", False)
    suffix = total_batch_idx if name_by_step else current_epoch
    model_dir = Path(self.hparams.model_dir)

    utils.save_checkpoint(
        self.net_g,
        self.optim_g,
        self.learning_rate,
        current_epoch,
        model_dir / f"G_{suffix}.pth",
    )
    utils.save_checkpoint(
        self.net_d,
        self.optim_d,
        self.learning_rate,
        current_epoch,
        model_dir / f"D_{suffix}.pth",
    )

    keep_ckpts = self.hparams.train.get("keep_ckpts", 0)
    if keep_ckpts > 0:
        utils.clean_checkpoints(
            path_to_models=self.hparams.model_dir,
            n_ckpts_to_keep=keep_ckpts,
            sort_by_time=True,
        )
def load(self, reset_optimizer: bool = False):
    """Resume from the newest ``G_*.pth`` / ``D_*.pth`` pair in the model dir.

    When both checkpoints exist they are loaded into the networks and
    optimizers, the stored epoch is kept in ``self._temp_epoch`` and both
    schedulers get ``last_epoch = epoch - 1``. When either is missing a
    warning is logged and nothing is changed.

    Args:
        reset_optimizer: Passed to ``utils.load_checkpoint`` as the
            skip-optimizer flag.

    Raises:
        RuntimeError: If loading fails; the original error is chained.
    """
    model_dir = self.hparams.model_dir
    latest_g_path = utils.latest_checkpoint_path(model_dir, "G_*.pth")
    latest_d_path = utils.latest_checkpoint_path(model_dir, "D_*.pth")

    if latest_g_path is None or latest_d_path is None:
        LOG.warning("No checkpoint found. Start from scratch.")
        return

    try:
        # Generator first, then discriminator; the epoch read last wins.
        for ckpt_path, net, optim in (
            (latest_g_path, self.net_g, self.optim_g),
            (latest_d_path, self.net_d, self.optim_d),
        ):
            _, _, _, epoch = utils.load_checkpoint(ckpt_path, net, optim, reset_optimizer)
        self._temp_epoch = epoch
        for scheduler in (self.scheduler_g, self.scheduler_d):
            scheduler.last_epoch = epoch - 1
    except Exception as e:
        raise RuntimeError("Failed to load checkpoint") from e
mel_spectrogram_torch(y_hat.squeeze(1), self.hparams) y_mel = y_mel[..., : y_hat_mel.shape[-1]] y = commons.slice_segments( y, ids_slice * self.hparams.data.hop_length, self.hparams.train.segment_size, ) y = y[..., : y_hat.shape[-1]] # generator loss y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = self.net_d(y, y_hat) with autocast(enabled=False): loss_mel = F.l1_loss(y_mel, y_hat_mel) * self.hparams.train.c_mel loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * self.hparams.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) loss_gen, losses_gen = generator_loss(y_d_hat_g) loss_lf0 = F.mse_loss(pred_lf0, lf0) loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0 # MB-iSTFT-VITS loss_subband = torch.tensor(0.0) if self.hparams.model.get("type_") == "mb-istft": from .modules.decoders.mb_istft import PQMF, subband_stft_loss y_mb = PQMF(y.device, self.hparams.model.subbands).analysis(y) loss_subband = subband_stft_loss(self.hparams, y_mb, y_hat_mb) loss_gen_all += loss_subband # log loss self.log_("lr", self.optim_g.param_groups[0]["lr"]) self.log_dict_( { "loss/g/total": loss_gen_all, "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl, "loss/g/lf0": loss_lf0, }, prog_bar=True, ) if self.hparams.model.get("type_") == "mb-istft": self.log_("loss/g/subband", loss_subband) if self.total_batch_idx % self.hparams.train.log_interval == 0: self.log_image_dict( { "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().float().numpy()), "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().float().numpy()), "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().float().numpy()), "all/lf0": so_vits_svc_fork.utils.plot_data_to_numpy( lf0[0, 0, :].cpu().float().numpy(), pred_lf0[0, 0, :].detach().cpu().float().numpy(), ), "all/norm_lf0": so_vits_svc_fork.utils.plot_data_to_numpy( lf0[0, 0, :].cpu().float().numpy(), norm_lf0[0, 0, :].detach().cpu().float().numpy(), ), } ) accumulate_grad_batches = 
def validation_step(self, batch, batch_idx):
    """Synthesize one validation batch and log audio and mel images.

    Args:
        batch: Eight-element batch; the third and seventh entries are unused.
        batch_idx: Index of the batch, used in the audio tag names.

    Returns:
        None. Nothing is logged while ``self.global_step`` is 0.
    """
    # avoid logging with wrong global step
    if self.global_step == 0:
        return None

    with torch.no_grad():
        self.net_g.eval()
        c, f0, _, mel, y, g, _, uv = batch
        y_hat = self.net_g.infer(c, f0, uv, g=g)
        y_hat_mel = mel_spectrogram_torch(y_hat.squeeze(1).float(), self.hparams)

        audio_samples = {
            f"gen/audio_{batch_idx}": y_hat[0],
            f"gt/audio_{batch_idx}": y[0],
        }
        self.log_audio_dict(audio_samples)

        generated_image = utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().float().numpy())
        reference_image = utils.plot_spectrogram_to_numpy(mel[0].cpu().float().numpy())
        self.log_image_dict({"gen/mel": generated_image, "gt/mel": reference_image})
def download_file(
    url: str,
    filepath: Path | str,
    chunk_size: int = 64 * 1024,
    tqdm_cls: type = tqdm,
    skip_if_exists: bool = False,
    overwrite: bool = False,
    **tqdm_kwargs: Any,
):
    """Stream ``url`` into ``filepath`` with a progress bar.

    Data is written to ``<name>.download`` next to the target and moved into
    place only after the whole body has been received, so an interrupted or
    failed download never leaves a truncated file under the final name.

    Args:
        url: Address to fetch.
        filepath: Destination file; parent directories are created.
        chunk_size: Number of bytes requested per chunk.
        tqdm_cls: Progress-bar class, called with the tqdm keyword arguments.
        skip_if_exists: Return without downloading when the file exists.
        overwrite: Replace an existing file instead of raising.
        **tqdm_kwargs: Override or extend the progress-bar arguments.

    Raises:
        ValueError: If both ``skip_if_exists`` and ``overwrite`` are set.
        FileExistsError: If the file exists and neither flag is set.
        requests.HTTPError: If the server answers with an error status.
    """
    if skip_if_exists is True and overwrite is True:
        raise ValueError("skip_if_exists and overwrite cannot be both True")
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    temppath = filepath.parent / f"{filepath.name}.download"
    if filepath.exists():
        if skip_if_exists:
            return
        if not overwrite:
            raise FileExistsError(f"{filepath} already exists")
        # overwrite=True: keep the old file until the new one is complete;
        # the final replace() below swaps it out.

    # Drop leftovers of an earlier, interrupted attempt.
    temppath.unlink(missing_ok=True)
    with requests.get(url, stream=True) as resp:
        # Without this an error page would be stored as if it were the file,
        # and callers that fall back to a mirror would never do so.
        resp.raise_for_status()
        total = int(resp.headers.get("content-length", 0))
        kwargs = dict(
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Downloading {filepath.name}",
        )
        kwargs.update(tqdm_kwargs)
        with temppath.open("wb") as f, tqdm_cls(**kwargs) as pbar:
            for data in resp.iter_content(chunk_size=chunk_size):
                size = f.write(data)
                pbar.update(size)
    # replace() also succeeds when the destination already exists.
    temppath.replace(filepath)
def ensure_pretrained_model(
    folder_path: Path | str, type_: str | dict[str, str], **tqdm_kwargs: Any
) -> tuple[Path, ...] | None:
    """Download pretrained model files into ``folder_path`` if missing.

    Args:
        folder_path: Directory that receives the files.
        type_: Either a key of ``PRETRAINED_MODEL_URLS`` or a mapping of
            file name to download URL.
        **tqdm_kwargs: Forwarded to ``download_file``.

    Returns:
        The local paths of the requested files, or ``None`` when the type is
        unknown or every download attempt failed. Failures are logged, not
        raised.
    """
    folder_path = Path(folder_path)

    # new code: explicit {filename: url} mapping
    if not isinstance(type_, str):
        if not type_:
            # Nothing to fetch; joblib rejects n_jobs=0.
            return ()
        try:
            Parallel(n_jobs=len(type_))(
                [
                    delayed(download_file)(
                        url,
                        folder_path / filename,
                        position=i,
                        skip_if_exists=True,
                        **tqdm_kwargs,
                    )
                    for i, (filename, url) in enumerate(type_.items())
                ]
            )
        except Exception as e:
            LOG.error(f"Failed to download {type_}")
            LOG.exception(e)
            # A mapping cannot be looked up in PRETRAINED_MODEL_URLS (it is
            # unhashable), so there is no fallback to try.
            return None
        # The mapping's keys are the file names; its values are URLs.
        return tuple(folder_path / filename for filename in type_)

    # old code: named model type with a list of mirror candidates
    models_candidates = PRETRAINED_MODEL_URLS.get(type_, None)
    if models_candidates is None:
        LOG.warning(f"Unknown pretrained model type: {type_}")
        return None
    for model_urls in models_candidates:
        paths = [folder_path / model_url.split("/")[-1] for model_url in model_urls]
        try:
            Parallel(n_jobs=len(paths))(
                [
                    delayed(download_file)(url, path, position=i, skip_if_exists=True, **tqdm_kwargs)
                    for i, (url, path) in enumerate(zip(model_urls, paths))
                ]
            )
            return tuple(paths)
        except Exception as e:
            # Try the next mirror.
            LOG.error(f"Failed to download {model_urls}")
            LOG.exception(e)
    return None
def remove_weight_norm_if_exists(module, name: str = "weight"):
    r"""
    Remove the weight-normalization hook for ``name`` from ``module``, if any.

    Args:
        module (Module): module to inspect
        name (str, optional): name of the weight parameter

    Returns:
        ``module`` when a matching hook was removed, otherwise ``None``.

    Example:
        >>> m = weight_norm(nn.Linear(20, 40))
        >>> remove_weight_norm_if_exists(m)
    """
    from torch.nn.utils.weight_norm import WeightNorm

    pre_hooks = module._forward_pre_hooks
    for hook_id, hook in pre_hooks.items():
        if not (isinstance(hook, WeightNorm) and hook.name == name):
            continue
        hook.remove(module)
        # Returning right after the deletion keeps the iteration valid.
        del pre_hooks[hook_id]
        return module
    return None
inference time : {t.elapsed:.3f}s, RTF: {t.elapsed / wav_len:.3f}") return c def _substitute_if_same_shape(to_: dict[str, Any], from_: dict[str, Any]) -> None: not_in_to = list(filter(lambda x: x not in to_, from_.keys())) not_in_from = list(filter(lambda x: x not in from_, to_.keys())) if not_in_to: warnings.warn(f"Keys not found in model state dict:{not_in_to}") if not_in_from: warnings.warn(f"Keys not found in checkpoint state dict:{not_in_from}") shape_missmatch = [] for k, v in from_.items(): if k not in to_: pass elif hasattr(v, "shape"): if not hasattr(to_[k], "shape"): raise ValueError(f"Key {k} is not a tensor") if to_[k].shape == v.shape: to_[k] = v else: shape_missmatch.append((k, to_[k].shape, v.shape)) elif isinstance(v, dict): assert isinstance(to_[k], dict) _substitute_if_same_shape(to_[k], v) else: to_[k] = v if shape_missmatch: warnings.warn(f"Shape mismatch: {[f'{k}: {v1} -> {v2}' for k, v1, v2 in shape_missmatch]}") def safe_load(model: torch.nn.Module, state_dict: dict[str, Any]) -> None: model_state_dict = model.state_dict() _substitute_if_same_shape(model_state_dict, state_dict) model.load_state_dict(model_state_dict) def load_checkpoint( checkpoint_path: Path | str, model: torch.nn.Module, optimizer: torch.optim.Optimizer | None = None, skip_optimizer: bool = False, ) -> tuple[torch.nn.Module, torch.optim.Optimizer | None, float, int]: if not Path(checkpoint_path).is_file(): raise FileNotFoundError(f"File {checkpoint_path} not found") with Path(checkpoint_path).open("rb") as f: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated") checkpoint_dict = torch.load(f, map_location="cpu", weights_only=True) iteration = checkpoint_dict["iteration"] learning_rate = checkpoint_dict["learning_rate"] # safe load module if hasattr(model, "module"): safe_load(model.module, checkpoint_dict["model"]) else: safe_load(model, checkpoint_dict["model"]) # safe load optim if optimizer is not 
def save_checkpoint(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    learning_rate: float,
    iteration: int,
    checkpoint_path: Path | str,
) -> None:
    """Serialize model and optimizer state to ``checkpoint_path``.

    The file holds a dict with the keys ``model``, ``iteration``,
    ``optimizer`` and ``learning_rate``. When ``model`` has a ``module``
    attribute, that inner module's state dict is stored.

    Args:
        model: Network to save.
        optimizer: Optimizer whose state dict is stored.
        learning_rate: Value stored under ``learning_rate``.
        iteration: Value stored under ``iteration``.
        checkpoint_path: Destination file.
    """
    LOG.info(f"Saving model and optimizer state at epoch {iteration} to {checkpoint_path}")

    # Unwrap models that expose the real network as ``.module``.
    source = model.module if hasattr(model, "module") else model
    state_dict = source.state_dict()

    with Path(checkpoint_path).open("wb") as f:
        payload = {
            "model": state_dict,
            "iteration": iteration,
            "optimizer": optimizer.state_dict(),
            "learning_rate": learning_rate,
        }
        torch.save(payload, f)
def plot_spectrogram_to_numpy(spectrogram: ndarray) -> ndarray:
    """Render ``spectrogram`` as an image and return its pixels.

    Args:
        spectrogram: 2-D array drawn with ``imshow`` (origin at the bottom).

    Returns:
        ``uint8`` array of shape ``(height, width, 4)`` holding the canvas
        bytes in the order produced by ``tostring_argb``.
        NOTE(review): callers log this with ``dataformats="HWC"``; confirm
        the consumer expects ARGB rather than RGBA channel order.
    """
    matplotlib.use("Agg")
    fig, ax = plt.subplots(figsize=(10, 2))
    try:
        im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
        plt.colorbar(im, ax=ax)
        plt.xlabel("Frames")
        plt.ylabel("Channels")
        plt.tight_layout()

        fig.canvas.draw()
        # np.fromstring's binary mode is deprecated in favour of frombuffer.
        # frombuffer returns a read-only view, so copy to hand back a
        # writable array as before.
        data = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
        data = data.reshape(fig.canvas.get_width_height()[::-1] + (4,)).copy()
    finally:
        # Close this figure even when plotting fails, so figures do not
        # accumulate across calls.
        plt.close(fig)
    return data
data.reshape(fig.canvas.get_width_height()[::-1] + (4,)) plt.close() return data def get_gpu_memory(type_: Literal["total", "free", "used"]) -> Sequence[int] | None: command = f"nvidia-smi --query-gpu=memory.{type_} --format=csv" try: memory_free_info = subprocess.check_output(command.split()).decode("ascii").split("\n")[:-1][1:] memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)] return memory_free_values except Exception: return def get_total_gpu_memory(type_: Literal["total", "free", "used"]) -> int | None: memories = get_gpu_memory(type_) if memories is None: return return sum(memories) ================================================ FILE: templates/CHANGELOG.md.j2 ================================================ # Changelog {%- for version, release in context.history.released.items() %} ## {{ version.as_tag() }} ({{ release.tagged_date.strftime("%Y-%m-%d") }}) {%- for category, commits in release["elements"].items() %}{% if category != "unknown" %} {# Category title: Breaking, Fix, Documentation #} ### {{ category | capitalize }} {# List actual changes in the category #} {%- for commit in commits %} - {{ commit.descriptions[0] | capitalize }} ([`{{ commit.short_hash }}`]({{ commit.hexsha | commit_hash_url }})) {%- endfor %}{# for commit #} {%- endif %}{% endfor %}{# for category, commits #} {%- endfor %}{# for version, release #} ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/test_main.py ================================================ import json import os from pathlib import Path from unittest import SkipTest, TestCase IS_CI = os.environ.get("GITHUB_ACTIONS", False) IS_COLAB = os.getenv("COLAB_RELEASE_TAG", False) class TestMain(TestCase): def test_import(self): import so_vits_svc_fork.cluster.train_cluster import so_vits_svc_fork.inference.main # import so_vits_svc_fork.modules.onnx._export 
import so_vits_svc_fork.preprocessing.preprocess_flist_config import so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import so_vits_svc_fork.preprocessing.preprocess_resample import so_vits_svc_fork.preprocessing.preprocess_split import so_vits_svc_fork.train # noqa def test_infer(self): if IS_CI: raise SkipTest("Skip inference test on CI") from so_vits_svc_fork.inference.main import infer # noqa # infer("tests/dataset_raw/34j/1.wav", "tests/configs/config.json", "tests/logs/44k") def test_preprocess(self): from so_vits_svc_fork.preprocessing.preprocess_resample import ( preprocess_resample, ) preprocess_resample("tests/dataset_raw", "tests/dataset/44k", 44100, n_jobs=1 if IS_CI else -1) from so_vits_svc_fork.preprocessing.preprocess_flist_config import ( preprocess_config, ) preprocess_config( "tests/dataset/44k", "tests/filelists/train.txt", "tests/filelists/val.txt", "tests/filelists/test.txt", "tests/configs/44k/config.json", "so-vits-svc-4.0v1", ) if IS_CI: raise SkipTest("Skip hubert and f0 test on CI") from so_vits_svc_fork.preprocessing.preprocess_hubert_f0 import ( preprocess_hubert_f0, ) preprocess_hubert_f0("tests/dataset/44k", "tests/configs/44k/config.json") def test_train(self): if not IS_COLAB: raise SkipTest("Skip training test on non-colab") # requires >10GB of GPU memory, can be only tested on colab from so_vits_svc_fork.train import train config_path = Path("tests/logs/44k/config.json") config_json = json.loads(config_path.read_text("utf-8")) config_json["train"]["epochs"] = 1 config_path.write_text(json.dumps(config_json), "utf-8") train(config_path, "tests/logs/44k")