[
  {
    "path": ".dockerignore",
    "content": "# .dockerignore\n\n# Git and version control\n.git\n.gitignore\n.gitattributes\n.gitmodules\n\n# IDE and editor files\n.vscode/\n.idea/\n*.swp\n*.swo\n*~\n.DS_Store\nThumbs.db\n\n# Python cache and build artifacts\n__pycache__/\n*.py[cod]\n*$py.class\n*.so\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# Virtual environments\nvenv/\nenv/\nENV/\n.venv/\n.env/\n\n# Testing\n.pytest_cache/\n.coverage\nhtmlcov/\n.tox/\n.nox/\ncoverage.xml\n*.cover\n.hypothesis/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n*.ipynb\n\n# Logs\n*.log\nlogs/\n\n# Temporary files\ntmp/\ntemp/\n*.tmp\n*.temp\n\n# OS generated files\n.DS_Store\n.DS_Store?\n._*\n.Spotlight-V100\n.Trashes\nehthumbs.db\nThumbs.db\n\n# Docker files (except the one being used)\ndocker/\nDockerfile*\ndocker-compose*.yml\n.dockerignore\n\n# Checkpoints and models (should be mounted)\ncheckpoints/\nmodels/\n*.pth\n*.ckpt\n*.safetensors\n*.bin\n\n# Reference voices (should be mounted)\nreferences/\n\n# Generated audio files\n*.wav\n*.mp3\n*.flac\n*.ogg\ngenerated_audio.wav\nfake.wav\nfake.npy\n\n# Cache directories\n.cache/\ncache/\n.uv_cache/\n\n# Development files\n.env\n.env.local\n.env.development\n.env.test\n.env.production\n\n# Test files\ntest_*.py\n*_test.py\ntests/\n\n# CI/CD\n.github/\n.gitlab-ci.yml\n.travis.yml\n.circleci/\nazure-pipelines.yml\n\n# Monitoring and profiling\n.prof\n*.prof\n\n# Backup files\n*.bak\n*.backup\n*.old\n\n# Large data files\n*.csv\n*.jsonl\n*.parquet\n*.h5\n*.hdf5\n\n# Audio processing temporary files\n*.tmp.wav\n*.temp.wav\n\n# OLD:\n# .github\n# results\n# data\n# *.filelist\n# /data_server/target\n# checkpoints\n# .venv\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "content": "name: \"🕷️ Bug report\"\ndescription: |\n  Please follow this template carefully to ensure we can address your issue quickly.\n  Make sure to provide as much detail as possible, including logs and screenshots.\nlabels:\n  - bug\nbody:\n  - type: checkboxes\n    attributes:\n      label: Self Checks\n      description: \"To ensure timely help, please confirm the following:\"\n      options:\n        - label: This template is only for bug reports. For questions, please visit [Discussions](https://github.com/fishaudio/fish-speech/discussions).\n          required: true\n        - label: I have thoroughly reviewed the project documentation (installation, training, inference) but couldn't find information to solve my problem. [English](https://speech.fish.audio/) [中文](https://speech.fish.audio/zh/) [日本語](https://speech.fish.audio/ja/) [Portuguese (Brazil)](https://speech.fish.audio/pt/)\n          required: true\n        - label: I have searched for existing issues, including closed ones. 
[Search issues](https://github.com/fishaudio/fish-speech/issues)\n          required: true\n        - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)).\n          required: true\n        - label: \"[FOR CHINESE USERS] 请务必使用英文提交 Issue，否则会被关闭。谢谢！:）\"\n          required: true\n        - label: \"Please do not modify this template and fill in all required fields.\"\n          required: true\n  - type: dropdown\n    attributes:\n      label: Cloud or Self Hosted\n      multiple: true\n      options:\n        - Cloud\n        - Self Hosted (Docker)\n        - Self Hosted (Source)\n    validations:\n      required: true\n  - type: textarea\n    attributes:\n      label: Environment Details\n      description: \"Provide details such as OS, Python version, and any relevant software or dependencies.\"\n      placeholder: e.g., macOS 13.5, Python 3.10, torch==2.4.1, Gradio 4.44.0\n    validations:\n      required: true\n  - type: textarea\n    attributes:\n      label: Steps to Reproduce\n      description: |\n        Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.\n      placeholder: |\n        1. Run the command `python -m tools.api_client -t \"xxxxx\"`\n        2. Observe the console output error: `ModuleNotFoundError: No module named 'pyaudio'` (with screenshots or logs will be better)\n    validations:\n      required: true\n  - type: textarea\n    attributes:\n      label: ✔️ Expected Behavior\n      placeholder: Describe what you expected to happen.\n    validations:\n      required: false\n  - type: textarea\n    attributes:\n      label: ❌ Actual Behavior\n      placeholder: Describe what actually happened.\n    validations:\n      required: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: false\ncontact_links:\n  - name: \"\\U0001F4E7 Discussions\"\n    url: https://github.com/fishaudio/fish-speech/discussions\n    about: General discussions and request help from the community\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.yml",
    "content": "name: \"⭐ Feature or enhancement request\"\ndescription: Propose something new.\nlabels:\n  - enhancement\nbody:\n  - type: checkboxes\n    attributes:\n      label: Self Checks\n      description: \"To make sure we get to you in time, please check the following :)\"\n      options:\n        - label: I have thoroughly reviewed the project documentation (installation, training, inference) but couldn't find any relevant information that meets my needs. [English](https://speech.fish.audio/) [中文](https://speech.fish.audio/zh/) [日本語](https://speech.fish.audio/ja/) [Portuguese (Brazil)](https://speech.fish.audio/pt/)\n          required: true\n        - label: I have searched for existing issues [search for existing issues](https://github.com/fishaudio/fish-speech/issues), including closed ones.\n          required: true\n        - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)).\n          required: true\n        - label: \"[FOR CHINESE USERS] 请务必使用英文提交 Issue，否则会被关闭。谢谢！:）\"\n          required: true\n        - label: \"Please do not modify this template :) and fill in all the required fields.\"\n          required: true\n\n  - type: textarea\n    attributes:\n      label: 1. Is this request related to a challenge you're experiencing? Tell us your story.\n      description: |\n        Describe the specific problem or scenario you’re facing in detail. For example:\n        *\"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because....\"*\n      placeholder: Please describe the situation in as much detail as possible.\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: 2. What is your suggested solution?\n      description: |\n        Provide a clear description of the feature or enhancement you'd like to propose. 
\n        How would this feature solve your issue or improve the project?\n      placeholder: Describe your idea or proposed solution here.\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: 3. Additional context or comments\n      description: |\n        Any other relevant information, links, documents, or screenshots that provide clarity. \n        Use this section for anything not covered above.\n      placeholder: Add any extra details here.\n    validations:\n      required: false\n\n  - type: checkboxes\n    attributes:\n      label: 4. Can you help us with this feature?\n      description: |\n        Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.\n      options:\n        - label: I am interested in contributing to this feature.\n          required: false\n\n  - type: markdown\n    attributes:\n      value: |\n        **Note:** Please submit only one request per issue to keep discussions focused and manageable.\n"
  },
  {
    "path": ".github/pull_request_template.md",
    "content": "**Is this PR adding a new feature or fixing a BUG?**\n\nAdd feature / Fix BUG.\n\n**Is this pull request related to any issue? If yes, please link the issue.**\n\n#xxx\n"
  },
  {
    "path": ".github/workflows/build-docker-image.yml",
    "content": "name: Build Docker Images\n\non:\n  push:\n    branches:\n      - main\n    tags:\n      - \"v*\"\n\njobs:\n  build:\n    runs-on: ubuntu-latest-16c64g\n    strategy:\n      matrix:\n        target: [webui, server]\n        backend: [cuda, cpu]\n    steps:\n      - uses: actions/checkout@v4\n      \n      - name: Set up Docker Buildx\n        uses: docker/setup-buildx-action@v3\n      \n      - name: Get Version\n        run: |\n          if [[ $GITHUB_REF == refs/tags/v* ]]; then\n            version=$(basename ${GITHUB_REF})\n          else\n            version=nightly\n          fi\n          echo \"version=${version}\" >> $GITHUB_ENV\n          echo \"Current version: ${version}\"\n\n      - name: Login to Docker Hub\n        uses: docker/login-action@v3\n        with:\n          username: ${{ secrets.DOCKER_USER }}\n          password: ${{ secrets.DOCKER_PAT }}\n\n      - name: Set platform for CPU builds\n        id: platform\n        run: |\n          if [ \"${{ matrix.backend }}\" = \"cpu\" ]; then\n            echo \"platforms=linux/amd64,linux/arm64\" >> $GITHUB_OUTPUT\n          else\n            echo \"platforms=linux/amd64\" >> $GITHUB_OUTPUT\n          fi\n\n      - name: Build and Push ${{ matrix.target }}-${{ matrix.backend }} Image\n        uses: docker/build-push-action@v6\n        with:\n          context: .\n          file: docker/Dockerfile\n          platforms: ${{ steps.platform.outputs.platforms }}\n          push: true\n          target: ${{ matrix.target }}\n          build-args: |\n            BACKEND=${{ matrix.backend }}\n            UV_EXTRA=${{ matrix.backend == 'cuda' && 'cu126' || 'cpu' }}\n          tags: |\n            fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }}-${{ env.version }}\n            fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }}\n            ${{ (matrix.target == 'webui' && matrix.backend == 'cuda') && format('fishaudio/fish-speech:{0}', env.version) || '' }}\n      
      ${{ (matrix.target == 'webui' && matrix.backend == 'cuda') && 'fishaudio/fish-speech:latest' || '' }}\n          outputs: type=image,oci-mediatypes=true,compression=zstd,compression-level=3,force-compression=true\n          cache-from: type=registry,ref=fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }}\n          cache-to: type=inline\n\n  update-readme:\n    runs-on: ubuntu-latest\n    needs: build\n    if: github.ref == 'refs/heads/main'\n    steps:\n      - name: Push README to Dockerhub\n        uses: peter-evans/dockerhub-description@v4\n        with:\n          username: ${{ secrets.DOCKER_USER }}\n          password: ${{ secrets.DOCKER_PAT }}\n          repository: fishaudio/fish-speech\n"
  },
  {
    "path": ".github/workflows/docs.yml",
    "content": "name: docs\non:\n  push:\n    branches:\n      - main\n    paths:\n      - 'docs/**'\n      - 'mkdocs.yml'\n\npermissions:\n  contents: write\n\njobs:\n  deploy:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n      - name: Configure Git Credentials\n        run: |\n          git config user.name github-actions[bot]\n          git config user.email 41898282+github-actions[bot]@users.noreply.github.com\n      - uses: actions/setup-python@v5\n        with:\n          python-version: 3.x\n      - run: echo \"cache_id=$(date --utc '+%V')\" >> $GITHUB_ENV \n      - uses: actions/cache@v4\n        with:\n          key: mkdocs-material-${{ env.cache_id }}\n          path: .cache\n          restore-keys: |\n            mkdocs-material-\n      - run: pip install -r docs/requirements.txt\n      - run: mkdocs gh-deploy --force\n"
  },
  {
    "path": ".github/workflows/stale.yml",
    "content": "name: Close inactive issues\non:\n  schedule:\n    - cron: \"0 0 * * *\"\n\njobs:\n  close-issues:\n    runs-on: ubuntu-latest\n    permissions:\n      issues: write\n      pull-requests: write\n    steps:\n      - uses: actions/stale@v9\n        with:\n          days-before-issue-stale: 30\n          days-before-issue-close: 14\n          stale-issue-label: \"stale\"\n          stale-issue-message: \"This issue is stale because it has been open for 30 days with no activity.\"\n          close-issue-message: \"This issue was closed because it has been inactive for 14 days since being marked as stale.\"\n          days-before-pr-stale: 30\n          days-before-pr-close: 30\n          stale-pr-label: \"stale\"\n          stale-pr-message: \"This PR is stale because it has been open for 30 days with no activity.\"\n          close-pr-message: \"This PR was closed because it has been inactive for 30 days since being marked as stale.\"\n          repo-token: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "# =============================================================================\n# Fish Speech - .gitignore\n# =============================================================================\n\n# Operating System Files\n# -----------------------\n.DS_Store\n.DS_Store?\n._*\n.Spotlight-V100\n.Trashes\nehthumbs.db\nThumbs.db\n\n# IDEs and Editors\n# ----------------\n.vscode/\n.idea/\n*.swp\n*.swo\n*~\n\n# Python\n# ------\n__pycache__/\n*.py[cod]\n*$py.class\n*.so\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# Virtual Environments\n# --------------------\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n/fishenv/\n\n# Project Dependencies\n# --------------------\n.pdm-python\n/fish_speech.egg-info\n\n# Data and Model Files\n# --------------------\ndata/\nresults/\ncheckpoints/\nreferences/\ndemo-audios/\nexample/\nfilelists/\n*.filelist\n\n# Audio Files\n# -----------\n*.wav\n*.mp3\n*.flac\n*.ogg\n*.m4a\n\n# Data Files\n# ----------\n*.npy\n*.npz\n*.pkl\n*.pickle\n*.lab\n/fish_speech/text/cmudict_cache.pickle\n\n# Cache and Temporary Files\n# --------------------------\n/.cache/\n/.gradio/\n/.locale/\n.pgx.*\n*log\n*.log\nsite/\n\n# External Tools\n# --------------\nffmpeg.exe\nffprobe.exe\n/faster_whisper/\n\n# Server Related\n# --------------\n/data_server/target/\n\n# Test Files\n# ----------\n/*.test.sh\nasr-label*\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "ci:\n  autoupdate_schedule: monthly\n\nrepos:\n  - repo: https://github.com/pycqa/isort\n    rev: 8.0.1\n    hooks:\n      - id: isort\n        args: [--profile=black]\n\n  - repo: https://github.com/psf/black-pre-commit-mirror\n    rev: 26.1.0\n    hooks:\n      - id: black\n\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    hooks:\n      - id: end-of-file-fixer\n      - id: check-yaml\n      - id: check-json\n      - id: mixed-line-ending\n        args: [\"--fix=lf\"]\n      - id: check-added-large-files\n        args: [\"--maxkb=5000\"]\n"
  },
  {
    "path": ".project-root",
    "content": ""
  },
  {
    "path": ".readthedocs.yaml",
    "content": "# Read the Docs configuration file for MkDocs projects\n# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details\n\n# Required\nversion: 2\n\n# Set the version of Python and other tools you might need\nbuild:\n  os: ubuntu-22.04\n  tools:\n    python: \"3.12\"\n\nmkdocs:\n  configuration: mkdocs.yml\n\n# Optionally declare the Python requirements required to build your docs\npython:\n  install:\n  - requirements: docs/requirements.txt\n"
  },
  {
    "path": "API_FLAGS.txt",
    "content": "# --infer\n--api\n--listen 0.0.0.0:8080 \\\n--llama-checkpoint-path \"checkpoints/openaudio-s1-mini\" \\\n--decoder-checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\" \\\n--decoder-config-name modded_dac_vq\n"
  },
  {
    "path": "LICENSE",
    "content": "# FISH AUDIO RESEARCH LICENSE AGREEMENT\n\n**Last Updated: March 7, 2026**\n\n## I. INTRODUCTION\n\nThis Agreement applies to any individual person or entity (\"You\", \"Your\" or \"Licensee\") that uses or distributes any portion or element of the Fish Audio Materials or Derivative Works thereof for any Research, Non-Commercial, or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.\n\nThis Agreement is intended to allow research and non-commercial uses of the Materials free of charge. Any Commercial use of the Materials requires a separate license from Fish Audio.\n\nBy clicking \"I Accept\" or by using, distributing, or accessing any portion or element of the Fish Audio Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then \"You\" includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity's behalf.\n\n## II. 
RESEARCH & NON-COMMERCIAL USE LICENSE\n\nSubject to the terms of this Agreement, Fish Audio grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Fish Audio's intellectual property or other rights owned by Fish Audio embodied in the Fish Audio Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Fish Audio Materials for any Research or Non-Commercial Purpose.\n\n\"Research Purpose\" means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others.\n\n\"Non-Commercial Purpose\" means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.\n\n## III. COMMERCIAL USE\n\n**Any use of the Fish Audio Materials or Derivative Works for a Commercial Purpose requires a separate written license agreement from Fish Audio.** No commercial rights are granted under this Agreement.\n\n\"Commercial Purpose\" means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for or directed toward commercial advantage or monetary compensation to You or others, including but not limited to: (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, (ii) Your business's or organization's internal operations, and (iii) any use in connection with a product or service for which You charge a fee or generate revenue, whether directly or indirectly.\n\nTo obtain a commercial license, please contact Fish Audio at:\n\n- **Website:** [https://fish.audio](https://fish.audio)\n- **Email:** business@fish.audio\n\n## IV. 
GENERAL TERMS\n\nYour Research and Non-Commercial License under this Agreement is subject to the following terms.\n\n### a. Distribution & Attribution\n\nIf You distribute or make available the Fish Audio Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a \"Notice\" text file distributed as a part of such copies: \"This model is licensed under the Fish Audio Research License, Copyright © 39 AI, INC. All Rights Reserved.\", and (iii) prominently display \"Built with Fish Audio\" on a related website, user interface, blogpost, about page, or product documentation.\n\nIf You create a Derivative Work, You may add your own attribution notice(s) to the \"Notice\" text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Fish Audio Materials and state in the \"Notice\" text file that You changed the Fish Audio Materials and how it was modified.\n\n### b. Use Restrictions\n\nYour use of the Fish Audio Materials and Derivative Works, including any output or results of the Fish Audio Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to Fish Audio's Acceptable Use Policy, which is hereby incorporated by reference.\n\nFurthermore, You will not use the Fish Audio Materials or Derivative Works, or any output or results of the Fish Audio Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).\n\n### c. 
Intellectual Property\n\n**(i) Trademark License.** No trademark licenses are granted under this Agreement, and in connection with the Fish Audio Materials or Derivative Works, You may not use any name or mark owned by or associated with Fish Audio or any of its Affiliates, except as required under Section IV(a) herein.\n\n**(ii) Ownership of Derivative Works.** As between You and Fish Audio, You are the owner of Derivative Works You create, subject to Fish Audio's ownership of the Fish Audio Materials and any Derivative Works made by or for Fish Audio.\n\n**(iii) Ownership of Outputs.** As between You and Fish Audio, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.\n\n**(iv) Disputes.** If You or Your Affiliate(s) institute litigation or other proceedings against Fish Audio (including a cross-claim or counterclaim in a lawsuit) alleging that the Fish Audio Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Fish Audio from and against any claim by any third party arising out of or related to Your use or distribution of the Fish Audio Materials or Derivative Works in violation of this Agreement.\n\n**(v) Feedback.** From time to time, You may provide Fish Audio with verbal and/or written suggestions, comments or other feedback related to Fish Audio's existing or prospective technology, products or services (collectively, \"Feedback\"). 
You are not obligated to provide Fish Audio with Feedback, but to the extent that You do, You hereby grant Fish Audio a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided \"AS IS\" and You make no warranties whatsoever about any Feedback.\n\n### d. Disclaimer of Warranty\n\nUNLESS REQUIRED BY APPLICABLE LAW, THE FISH AUDIO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN \"AS IS\" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE FISH AUDIO MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE FISH AUDIO MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.\n\n### e. Limitation of Liability\n\nIN NO EVENT WILL FISH AUDIO OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF FISH AUDIO OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.\n\n### f. Term and Termination\n\nThe term of this Agreement will commence upon Your acceptance of this Agreement or access to the Fish Audio Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Fish Audio may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Fish Audio Materials or Derivative Works. 
Sections IV(d), (e), and (g) shall survive the termination of this Agreement.\n\n### g. Governing Law\n\nThis Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.\n\n## V. DEFINITIONS\n\n**\"Affiliate(s)\"** means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, \"control\" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.\n\n**\"Agreement\"** means this Fish Audio Research License Agreement.\n\n**\"Derivative Work(s)\"** means (a) any derivative work of the Fish Audio Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including \"fine tune\" and \"low-rank adaptation\" models derived from a Model or a Model's output, but do not include the output of any Model.\n\n**\"Documentation\"** means any specifications, manuals, documentation, and other written information provided by Fish Audio related to the Software or Models.\n\n**\"Fish Audio\"** or **\"we\"** means 39 AI, INC. and its Affiliates.\n\n**\"Model(s)\"** means, collectively, Fish Audio's proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing.\n\n**\"Software\"** means Fish Audio's proprietary software made available under this Agreement now or in the future.\n\n**\"Fish Audio Materials\"** means, collectively, Fish Audio's proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.\n\n**\"Trade Control Laws\"** means any applicable U.S. and non-U.S. 
export control and trade sanctions laws and regulations.\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n**English** | [简体中文](docs/README.zh.md) | [Portuguese](docs/README.pt-BR.md) | [日本語](docs/README.ja.md) | [한국어](docs/README.ko.md) | [العربية](docs/README.ar.md) <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **License Notice**  \n> This codebase and its associated model weights are released under **[FISH AUDIO RESEARCH LICENSE](LICENSE)**. Please refer to [LICENSE](LICENSE) for more details. We will take action against any violation of the license.\n\n> [!WARNING]\n> **Legal Disclaimer**  \n> We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.\n\n## Quick Start\n\n### For Human\n\nHere are the official documents for Fish Audio S2. Follow the instructions to get started easily.\n\n- [Installation](https://speech.fish.audio/install/)\n- [Command Line Inference](https://speech.fish.audio/inference/#command-line-inference)\n- [WebUI Inference](https://speech.fish.audio/inference/#webui-inference)\n- [Server Inference](https://speech.fish.audio/server/)\n- [Docker Setup](https://speech.fish.audio/install/#docker-setup)\n\n> [!IMPORTANT]\n> **For SGLang server, please read [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### For LLM Agent\n\n```\nInstall and configure Fish-Audio S2 by following the instructions here: https://speech.fish.audio/install/\n```\n\n## Fish Audio S2 Pro\n**State-of-the-art multilingual text-to-speech (TTS) system, redefining the boundaries of voice generation.**\n\nFish Audio S2 Pro is the most advanced multimodal model developed by [Fish 
Audio](https://fish.audio/). Trained on over **10 million hours** of audio data covering more than **80 languages**, S2 Pro combines a **Dual-Autoregressive (Dual-AR)** architecture with reinforcement learning (RL) alignment to generate speech that is exceptionally natural, realistic, and emotionally rich, leading the competition among both open-source and closed-source systems.\n\nThe core strength of S2 Pro lies in its support for **sub-word level** fine-grained control of prosody and emotion using natural language tags (e.g., `[whisper]`, `[excited]`, `[angry]`), while natively supporting multi-speaker and multi-turn conversation generation.\n\nVisit the [Fish Audio website](https://fish.audio/) for a live playground, or read our [technical report](https://arxiv.org/abs/2603.08823) and [blog post](https://fish.audio/blog/fish-audio-open-sources-s2/) for more details.\n\n### Model Variants\n\n| Model | Size | Availability | Description |\n|------|------|-------------|-------------|\n| S2-Pro | 4B parameters | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Full-featured flagship model with maximum quality and stability |\n\nMore details of the model can be found in the [technical report](https://arxiv.org/abs/2603.08823).\n\n## Benchmark Results\n\n| Benchmark | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (Chinese) | **0.54%** (best overall) |\n| Seed-TTS Eval — WER (English) | **0.99%** (best overall) |\n| Audio Turing Test (with instruction) | **0.515** posterior mean |\n| EmergentTTS-Eval — Win Rate | **81.88%** (highest overall) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — Quality | **4.51 / 5.0** |\n| Multilingual (MiniMax Testset) — Best WER | **11 of 24** languages |\n| Multilingual (MiniMax Testset) — Best SIM | **17 of 24** languages |\n\nOn Seed-TTS Eval, S2 achieves the lowest WER among all evaluated models including closed-source systems: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), 
Seed-TTS (1.12/2.25). On the Audio Turing Test, 0.515 surpasses Seed-TTS (0.417) by 24% and MiniMax-Speech (0.387) by 33%. On EmergentTTS-Eval, S2 achieves particularly strong results in paralinguistics (91.61% win rate), questions (84.41%), and syntactic complexity (83.39%).\n\n## Highlights\n\n<img src=\"./docs/assets/totalability.png\" width=200%>\n\n### Fine-Grained Inline Control via Natural Language\n\nS2 Pro brings unprecedented \"soul\" to speech. Using simple `[tag]` syntax, you can precisely embed emotional instructions at any position in the text.\n- **15,000+ Unique Tags Supported**: Not limited to fixed presets; S2 supports **free-form text descriptions**. Try `[whisper in small voice]`, `[professional broadcast tone]`, or `[pitch up]`.\n- **Rich Emotion Library**:\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### Innovative Dual-Autoregressive (Dual-AR) Architecture\n\nS2 Pro adopts a master-slave Dual-AR architecture consisting of a decoder-only transformer and an RVQ audio codec (10 codebooks, ~21 Hz):\n\n- **Slow AR (4B parameters)**: Operates along the time axis, predicting the primary semantic codebook.\n- **Fast AR (400M parameters)**: Generates the remaining 9 residual codebooks at each time step, reconstructing exquisite acoustic details.\n\nThis asymmetric design achieves peak audio fidelity while significantly boosting inference speed.\n\n### Reinforcement Learning (RL) Alignment\n\nS2 Pro utilizes **Group Relative Policy Optimization (GRPO)** for post-training alignment. 
We directly reuse the model suite from data cleaning and annotation as Reward Models, perfectly resolving the distribution mismatch between pre-training data and post-training objectives.\n- **Multi-Dimensional Reward Signals**: Comprehensively evaluates semantic accuracy, instruction adherence, acoustic preference scoring, and timbre similarity to ensure every second of generated speech feels intuitive to humans.\n\n### Extreme Streaming Performance (Powered by SGLang)\n\nAs the Dual-AR architecture is structurally isomorphic to standard LLMs, S2 Pro natively supports all SGLang inference acceleration features, including Continuous Batching, Paged KV Cache, CUDA Graph, and RadixAttention-based Prefix Caching.\n\n**Performance on a single NVIDIA H200 GPU:**\n- **Real-Time Factor (RTF)**: 0.195\n- **Time-to-First-Audio (TTFA)**: ~100 ms\n- **Extreme Throughput**: 3,000+ acoustic tokens/s while maintaining RTF < 0.5\n\n### Robust Multilingual Support\n\nS2 Pro supports over 80 languages without requiring phonemes or language-specific preprocessing:\n\n- **Tier 1**: Japanese (ja), English (en), Chinese (zh)\n- **Tier 2**: Korean (ko), Spanish (es), Portuguese (pt), Arabic (ar), Russian (ru), French (fr), German (de)\n- **Global Coverage**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo, etc.\n\n### Native Multi-Speaker Generation\n\n<img src=\"./docs/assets/chattemplate.png\" width=200%>\n\nFish Audio S2 allows users to upload reference audio containing multiple speakers, and the model processes each speaker's features via the `<|speaker:i|>` token. You can then control the model's performance via speaker ID tokens, enabling a single generation to include multiple speakers. 
There is no longer a need to upload separate reference audio for each individual speaker.\n\n### Multi-Turn Generation\n\nThanks to the expansion of the model context, our model can now leverage previous information to improve the expressiveness of subsequent generated content, thereby increasing the naturalness of the dialogue.\n\n### Rapid Voice Cloning\n\nFish Audio S2 supports accurate voice cloning using short reference samples (typically 10-30 seconds). The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning.\nFor SGLang Server usage, please refer to the [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).\n\n---\n\n## Credits\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## Tech Report\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      
eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "awesome_webui/.gitignore",
    "content": "# Logs\nlogs\n*.log\nnpm-debug.log*\nyarn-debug.log*\nyarn-error.log*\npnpm-debug.log*\nlerna-debug.log*\n\nnode_modules\ndist\ndist-ssr\n*.local\n\n# Editor directories and files\n.vscode/*\n!.vscode/extensions.json\n.idea\n.DS_Store\n*.suo\n*.ntvs*\n*.njsproj\n*.sln\n*.sw?\n"
  },
  {
    "path": "awesome_webui/README.md",
    "content": "# React + TypeScript + Vite\n\nThis template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.\n\nCurrently, two official plugins are available:\n\n- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh\n- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh\n\n## React Compiler\n\nThe React Compiler is currently not compatible with SWC. See [this issue](https://github.com/vitejs/vite-plugin-react/issues/428) for tracking the progress.\n\n## Expanding the ESLint configuration\n\nIf you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:\n\n```js\nexport default defineConfig([\n  globalIgnores(['dist']),\n  {\n    files: ['**/*.{ts,tsx}'],\n    extends: [\n      // Other configs...\n\n      // Remove tseslint.configs.recommended and replace with this\n      tseslint.configs.recommendedTypeChecked,\n      // Alternatively, use this for stricter rules\n      tseslint.configs.strictTypeChecked,\n      // Optionally, add this for stylistic rules\n      tseslint.configs.stylisticTypeChecked,\n\n      // Other configs...\n    ],\n    languageOptions: {\n      parserOptions: {\n        project: ['./tsconfig.node.json', './tsconfig.app.json'],\n        tsconfigRootDir: import.meta.dirname,\n      },\n      // other options...\n    },\n  },\n])\n```\n\nYou can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:\n\n```js\n// eslint.config.js\nimport reactX from 
'eslint-plugin-react-x'\nimport reactDom from 'eslint-plugin-react-dom'\n\nexport default defineConfig([\n  globalIgnores(['dist']),\n  {\n    files: ['**/*.{ts,tsx}'],\n    extends: [\n      // Other configs...\n      // Enable lint rules for React\n      reactX.configs['recommended-typescript'],\n      // Enable lint rules for React DOM\n      reactDom.configs.recommended,\n    ],\n    languageOptions: {\n      parserOptions: {\n        project: ['./tsconfig.node.json', './tsconfig.app.json'],\n        tsconfigRootDir: import.meta.dirname,\n      },\n      // other options...\n    },\n  },\n])\n```\n"
  },
  {
    "path": "awesome_webui/eslint.config.js",
    "content": "import js from '@eslint/js'\nimport globals from 'globals'\nimport reactHooks from 'eslint-plugin-react-hooks'\nimport reactRefresh from 'eslint-plugin-react-refresh'\nimport tseslint from 'typescript-eslint'\nimport { defineConfig, globalIgnores } from 'eslint/config'\n\nexport default defineConfig([\n  globalIgnores(['dist']),\n  {\n    files: ['**/*.{ts,tsx}'],\n    extends: [\n      js.configs.recommended,\n      tseslint.configs.recommended,\n      reactHooks.configs.flat.recommended,\n      reactRefresh.configs.vite,\n    ],\n    languageOptions: {\n      ecmaVersion: 2020,\n      globals: globals.browser,\n    },\n  },\n])\n"
  },
  {
    "path": "awesome_webui/index.html",
    "content": "<!doctype html>\n<html lang=\"en\">\n\n<head>\n  <meta charset=\"UTF-8\" />\n  <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n  <title>Awesome WebUI</title>\n</head>\n\n<body>\n  <div id=\"root\"></div>\n  <script type=\"module\" src=\"/src/main.tsx\"></script>\n</body>\n\n</html>\n"
  },
  {
    "path": "awesome_webui/package.json",
    "content": "{\n  \"name\": \"awesome_webui\",\n  \"private\": true,\n  \"version\": \"0.0.0\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"dev\": \"vite\",\n    \"build\": \"tsc -b && vite build\",\n    \"lint\": \"eslint .\",\n    \"preview\": \"vite preview\"\n  },\n  \"dependencies\": {\n    \"@radix-ui/react-collapsible\": \"^1.1.12\",\n    \"@radix-ui/react-dialog\": \"^1.1.15\",\n    \"@radix-ui/react-label\": \"^2.1.8\",\n    \"@radix-ui/react-scroll-area\": \"^1.2.10\",\n    \"@radix-ui/react-separator\": \"^1.1.8\",\n    \"@radix-ui/react-slider\": \"^1.3.6\",\n    \"@radix-ui/react-slot\": \"^1.2.4\",\n    \"@radix-ui/react-switch\": \"^1.2.6\",\n    \"@radix-ui/react-toggle-group\": \"^1.1.11\",\n    \"@tailwindcss/vite\": \"^4.2.1\",\n    \"class-variance-authority\": \"^0.7.1\",\n    \"clsx\": \"^2.1.1\",\n    \"lucide-react\": \"^0.577.0\",\n    \"react\": \"^19.2.0\",\n    \"react-dom\": \"^19.2.0\",\n    \"tailwind-merge\": \"^3.5.0\",\n    \"tailwindcss\": \"^4.2.1\"\n  },\n  \"devDependencies\": {\n    \"@eslint/js\": \"^9.39.1\",\n    \"@types/node\": \"^24.10.1\",\n    \"@types/react\": \"^19.2.7\",\n    \"@types/react-dom\": \"^19.2.3\",\n    \"@vitejs/plugin-react-swc\": \"^4.2.2\",\n    \"eslint\": \"^9.39.1\",\n    \"eslint-plugin-react-hooks\": \"^7.0.1\",\n    \"eslint-plugin-react-refresh\": \"^0.4.24\",\n    \"globals\": \"^16.5.0\",\n    \"typescript\": \"~5.9.3\",\n    \"typescript-eslint\": \"^8.48.0\",\n    \"vite\": \"^7.3.1\"\n  }\n}\n"
  },
  {
    "path": "awesome_webui/src/App.tsx",
    "content": "import { useEffect, useRef, useState } from 'react'\nimport {\n  AudioLines,\n  ChevronDown,\n  CircleAlert,\n  Copy,\n  Download,\n  FileText,\n  Info,\n  LoaderCircle,\n  Plus,\n  Settings2,\n  Upload,\n} from 'lucide-react'\n\nimport { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert'\nimport { Badge } from '@/components/ui/badge'\nimport { Button } from '@/components/ui/button'\nimport {\n  Card,\n  CardContent,\n  CardDescription,\n  CardHeader,\n  CardTitle,\n} from '@/components/ui/card'\nimport {\n  Collapsible,\n  CollapsibleContent,\n  CollapsibleTrigger,\n} from '@/components/ui/collapsible'\nimport {\n  Dialog,\n  DialogContent,\n  DialogDescription,\n  DialogFooter,\n  DialogHeader,\n  DialogTitle,\n} from '@/components/ui/dialog'\nimport { Label } from '@/components/ui/label'\nimport { ScrollArea } from '@/components/ui/scroll-area'\nimport { Separator } from '@/components/ui/separator'\nimport { Slider } from '@/components/ui/slider'\nimport { Switch } from '@/components/ui/switch'\nimport { Textarea } from '@/components/ui/textarea'\nimport { ToggleGroup, ToggleGroupItem } from '@/components/ui/toggle-group'\n\ntype AudioFormat = 'mp3' | 'wav' | 'pcm' | 'opus'\ntype LatencyMode = 'normal' | 'balanced'\n\nconst defaultInputText = `[excited, joyful tone] We're going to DISNEY WORLD! [squeal of delight] I've been saving for [emphasis] three years [breathless] and finally, FINALLY we can go! The look on your face right now is worth every extra shift I worked!\n[angry] After everything we've been through [break] I can't believe you would [emphasize] betray me like this. I gave you EVERYTHING! 
And now I'm left with nothing but memories and broken promises!`\n\n\ntype ControlsState = {\n  chunkLength: number\n  maxNewTokens: number\n  temperature: number\n  topP: number\n  repetitionPenalty: number\n  normalize: boolean\n  format: AudioFormat\n  latency: LatencyMode\n}\n\ntype Metrics = {\n  textLength: number\n  ttftMs: number\n  receivedKb: number\n}\n\ntype StatusState = {\n  tone: 'error' | 'info'\n  message: string\n}\n\ntype ReferenceItem = {\n  id: number\n  name: string\n  audio: ArrayBuffer\n  text: string\n  previewUrl: string\n}\n\ntype SpeakerGroup = {\n  id: number\n  references: ReferenceItem[]\n}\n\ntype PendingReference = {\n  mode: 'create' | 'edit'\n  speakerId: number\n  referenceId?: number\n  name: string\n  audio?: ArrayBuffer\n  text: string\n}\n\nconst initialControls: ControlsState = {\n  chunkLength: 1000,\n  maxNewTokens: 2048,\n  temperature: 0.9,\n  topP: 0.9,\n  repetitionPenalty: 1.05,\n  normalize: false,\n  format: 'mp3',\n  latency: 'normal',\n}\n\nconst formatMimeMap: Record<AudioFormat, string> = {\n  mp3: 'audio/mpeg',\n  wav: 'audio/wav',\n  pcm: 'audio/pcm',\n  opus: 'audio/opus',\n}\n\nfunction createId() {\n  return Date.now() + Math.floor(Math.random() * 100000)\n}\n\nfunction arrayBufferToBase64(buffer: ArrayBuffer): string {\n  const bytes = new Uint8Array(buffer)\n  let binary = ''\n  for (let i = 0; i < bytes.byteLength; i++) {\n    binary += String.fromCharCode(bytes[i])\n  }\n  return btoa(binary)\n}\n\nfunction createSpeakerGroup(): SpeakerGroup {\n  return {\n    id: createId(),\n    references: [],\n  }\n}\n\nconst initialSpeakerGroup = createSpeakerGroup()\n\nfunction buildReferencesPayload(\n  speakerGroups: SpeakerGroup[],\n  includeBinaryAudio: boolean,\n) {\n  return speakerGroups.flatMap((speakerGroup) =>\n    speakerGroup.references.map((reference) => ({\n      text: reference.text,\n      audio: includeBinaryAudio\n        ? 
arrayBufferToBase64(reference.audio)\n        : '<audio binary data>',\n    })),\n  )\n}\n\nfunction buildPreviewPayload(\n  inputText: string,\n  controls: ControlsState,\n  speakerGroups: SpeakerGroup[],\n) {\n  return {\n    text: inputText,\n    chunk_length: controls.chunkLength,\n    max_new_tokens: controls.maxNewTokens,\n    format: controls.format,\n    latency: controls.latency,\n    normalize: controls.normalize,\n    references: buildReferencesPayload(speakerGroups, false),\n    temperature: controls.temperature,\n    top_p: controls.topP,\n    repetition_penalty: controls.repetitionPenalty,\n  }\n}\n\nfunction buildRequestPayload(\n  inputText: string,\n  controls: ControlsState,\n  speakerGroups: SpeakerGroup[],\n) {\n  return {\n    text: inputText,\n    chunk_length: controls.chunkLength,\n    max_new_tokens: controls.maxNewTokens,\n    format: controls.format,\n    latency: controls.latency,\n    normalize: controls.normalize,\n    references: buildReferencesPayload(speakerGroups, true),\n    temperature: controls.temperature,\n    top_p: controls.topP,\n    repetition_penalty: controls.repetitionPenalty,\n  }\n}\n\nfunction createFileName(inputText: string) {\n  const safePrefix = inputText.trim().replace(/\\s+/g, '-').slice(0, 24) || 'tts'\n  return safePrefix\n}\n\nfunction getErrorMessage(error: unknown) {\n  return error instanceof Error ? 
error.message : 'Unknown error'\n}\n\nfunction waitForSourceBuffer(sourceBuffer: SourceBuffer) {\n  if (!sourceBuffer.updating) {\n    return Promise.resolve()\n  }\n\n  return new Promise<void>((resolve) => {\n    const handleUpdateEnd = () => {\n      sourceBuffer.removeEventListener('updateend', handleUpdateEnd)\n      resolve()\n    }\n\n    sourceBuffer.addEventListener('updateend', handleUpdateEnd)\n  })\n}\n\nfunction canUseStreamingPlayback(format: AudioFormat) {\n  const mime = formatMimeMap[format]\n  return typeof window.MediaSource !== 'undefined' && MediaSource.isTypeSupported(mime)\n}\n\ntype SettingSliderProps = {\n  label: string\n  value: number\n  min: number\n  max: number\n  step?: number\n  onValueChange: (value: number) => void\n  formatValue?: (value: number) => string\n}\n\nfunction SettingSlider({\n  label,\n  value,\n  min,\n  max,\n  step = 1,\n  onValueChange,\n  formatValue,\n}: SettingSliderProps) {\n  return (\n    <div className=\"space-y-3\">\n      <div className=\"flex items-center justify-between gap-4\">\n        <Label>{label}</Label>\n        <span className=\"text-sm text-muted-foreground\">\n          {formatValue ? 
formatValue(value) : value}\n        </span>\n      </div>\n      <Slider\n        value={[value]}\n        min={min}\n        max={max}\n        step={step}\n        onValueChange={(nextValue) => {\n          const current = nextValue[0]\n          if (typeof current === 'number') {\n            onValueChange(current)\n          }\n        }}\n      />\n    </div>\n  )\n}\n\nfunction App() {\n  const [inputText, setInputText] = useState(defaultInputText)\n  const [controls, setControls] = useState(initialControls)\n  const [speakerGroups, setSpeakerGroups] = useState<SpeakerGroup[]>([initialSpeakerGroup])\n  const [pendingReference, setPendingReference] = useState<PendingReference | null>(null)\n  const [openSpeakerIds, setOpenSpeakerIds] = useState<number[]>([initialSpeakerGroup.id])\n  const [metrics, setMetrics] = useState<Metrics | null>(null)\n  const [isGenerating, setIsGenerating] = useState(false)\n  const [copyLabel, setCopyLabel] = useState('Copy')\n  const [isRequestPreviewOpen, setIsRequestPreviewOpen] = useState(false)\n  const [statusMessage, setStatusMessage] = useState<StatusState | null>(null)\n  const [downloadUrl, setDownloadUrl] = useState<string | null>(null)\n  const [downloadName, setDownloadName] = useState('generated-audio.mp3')\n\n  const audioRef = useRef<HTMLAudioElement | null>(null)\n  const fileInputRef = useRef<HTMLInputElement | null>(null)\n  const speakerGroupsRef = useRef<SpeakerGroup[]>([])\n  const uploadTargetSpeakerIdRef = useRef<number | null>(null)\n  const downloadUrlRef = useRef<string | null>(null)\n  const mediaSourceUrlRef = useRef<string | null>(null)\n\n  speakerGroupsRef.current = speakerGroups\n\n  useEffect(() => {\n    return () => {\n      speakerGroupsRef.current.forEach((speakerGroup) => {\n        speakerGroup.references.forEach((reference) => {\n          URL.revokeObjectURL(reference.previewUrl)\n        })\n      })\n\n      if (downloadUrlRef.current) {\n        
URL.revokeObjectURL(downloadUrlRef.current)\n      }\n\n      if (mediaSourceUrlRef.current) {\n        URL.revokeObjectURL(mediaSourceUrlRef.current)\n      }\n    }\n  }, [])\n\n  function addSpeaker() {\n    const nextSpeaker = createSpeakerGroup()\n    setSpeakerGroups((current) => [...current, nextSpeaker])\n    setOpenSpeakerIds((current) => [...current, nextSpeaker.id])\n  }\n\n  function removeSpeaker(speakerId: number) {\n    setSpeakerGroups((current) => {\n      const targetSpeaker = current.find((speakerGroup) => speakerGroup.id === speakerId)\n      if (targetSpeaker) {\n        targetSpeaker.references.forEach((reference) => {\n          URL.revokeObjectURL(reference.previewUrl)\n        })\n      }\n\n      const next = current.filter((speakerGroup) => speakerGroup.id !== speakerId)\n      return next.length > 0 ? next : [createSpeakerGroup()]\n    })\n    setOpenSpeakerIds((current) => current.filter((currentSpeakerId) => currentSpeakerId !== speakerId))\n\n    if (pendingReference?.speakerId === speakerId) {\n      setPendingReference(null)\n    }\n  }\n\n  function addReference(speakerId: number, name: string, audio: ArrayBuffer, text: string) {\n    const previewUrl = URL.createObjectURL(new Blob([audio], { type: formatMimeMap.mp3 }))\n\n    setSpeakerGroups((current) =>\n      current.map((speakerGroup) =>\n        speakerGroup.id === speakerId\n          ? 
{\n              ...speakerGroup,\n              references: [\n                ...speakerGroup.references,\n                {\n                  id: createId(),\n                  name,\n                  audio,\n                  text,\n                  previewUrl,\n                },\n              ],\n            }\n          : speakerGroup,\n      ),\n    )\n  }\n\n  function removeReference(speakerId: number, referenceId: number) {\n    setSpeakerGroups((current) =>\n      current.map((speakerGroup) => {\n        if (speakerGroup.id !== speakerId) {\n          return speakerGroup\n        }\n\n        return {\n          ...speakerGroup,\n          references: speakerGroup.references.filter((reference) => {\n            if (reference.id === referenceId) {\n              URL.revokeObjectURL(reference.previewUrl)\n              return false\n            }\n\n            return true\n          }),\n        }\n      }),\n    )\n  }\n\n  function updateReferenceText(speakerId: number, referenceId: number, text: string) {\n    setSpeakerGroups((current) =>\n      current.map((speakerGroup) =>\n        speakerGroup.id === speakerId\n          ? {\n              ...speakerGroup,\n              references: speakerGroup.references.map((reference) =>\n                reference.id === referenceId ? 
{ ...reference, text } : reference,\n              ),\n            }\n          : speakerGroup,\n      ),\n    )\n  }\n\n  function clearDownloadUrl() {\n    if (downloadUrlRef.current) {\n      URL.revokeObjectURL(downloadUrlRef.current)\n      downloadUrlRef.current = null\n    }\n\n    setDownloadUrl(null)\n  }\n\n  function clearMediaSourceUrl() {\n    if (mediaSourceUrlRef.current) {\n      URL.revokeObjectURL(mediaSourceUrlRef.current)\n      mediaSourceUrlRef.current = null\n    }\n  }\n\n  async function handleReferenceUpload(event: React.ChangeEvent<HTMLInputElement>) {\n    const file = event.target.files?.[0]\n    const speakerId = uploadTargetSpeakerIdRef.current\n    event.target.value = ''\n    uploadTargetSpeakerIdRef.current = null\n\n    if (!file || typeof speakerId !== 'number') {\n      return\n    }\n\n    const audio = await file.arrayBuffer()\n    setPendingReference({\n      mode: 'create',\n      speakerId,\n      name: file.name,\n      audio,\n      text: '',\n    })\n  }\n\n  function savePendingReference() {\n    if (!pendingReference) {\n      return\n    }\n\n    if (pendingReference.mode === 'create' && pendingReference.audio) {\n      addReference(\n        pendingReference.speakerId,\n        pendingReference.name,\n        pendingReference.audio,\n        pendingReference.text,\n      )\n    }\n\n    if (pendingReference.mode === 'edit' && typeof pendingReference.referenceId === 'number') {\n      updateReferenceText(\n        pendingReference.speakerId,\n        pendingReference.referenceId,\n        pendingReference.text,\n      )\n    }\n\n    setPendingReference(null)\n    setStatusMessage(null)\n  }\n\n  async function copyRequestPreview() {\n    const requestPreview = JSON.stringify(\n      buildPreviewPayload(inputText, controls, speakerGroups),\n      null,\n      2,\n    )\n\n    try {\n      await navigator.clipboard.writeText(requestPreview)\n      setCopyLabel('Copied')\n      window.setTimeout(() => 
setCopyLabel('Copy'), 2000)\n    } catch (error) {\n      setStatusMessage({\n        tone: 'error',\n        message: `Failed to copy request preview: ${getErrorMessage(error)}`,\n      })\n    }\n  }\n\n  async function handleGenerateAudio() {\n    const audioElement = audioRef.current\n    if (!audioElement) {\n      return\n    }\n\n    const mime = formatMimeMap[controls.format]\n    const useStreamingPlayback = canUseStreamingPlayback(controls.format)\n\n    clearDownloadUrl()\n    clearMediaSourceUrl()\n    setMetrics(null)\n    setStatusMessage(null)\n    setIsGenerating(true)\n\n    try {\n      const response = await fetch('/v1/tts', {\n        method: 'POST',\n        headers: {\n          'Content-Type': 'application/json',\n        },\n        body: JSON.stringify(buildRequestPayload(inputText, controls, speakerGroups)),\n      })\n\n      if (!response.ok || !response.body) {\n        throw new Error('Failed to generate audio')\n      }\n\n      const reader = response.body.getReader()\n      let mediaSource: MediaSource | null = null\n\n      if (useStreamingPlayback) {\n        mediaSource = new MediaSource()\n        const streamUrl = URL.createObjectURL(mediaSource)\n        mediaSourceUrlRef.current = streamUrl\n        audioElement.src = streamUrl\n      } else {\n        audioElement.removeAttribute('src')\n        audioElement.load()\n      }\n\n      const allChunks: ArrayBuffer[] = []\n      const playQueue: ArrayBuffer[] = []\n      let sourceBuffer: SourceBuffer | null = null\n      let readingDone = false\n      let receivedLength = 0\n      let ttftMs = -1\n      const startTime = performance.now()\n\n      if (mediaSource) {\n        const sourceReady = new Promise<void>((resolve, reject) => {\n          mediaSource.addEventListener(\n            'sourceopen',\n            () => {\n              try {\n                sourceBuffer = mediaSource.addSourceBuffer(mime)\n\n                const processQueue = async () => {\n                 
 if (!sourceBuffer || !mediaSource) {\n                    return\n                  }\n\n                  while (true) {\n                    if (readingDone && playQueue.length === 0) {\n                      await waitForSourceBuffer(sourceBuffer)\n                      if (mediaSource.readyState === 'open') {\n                        mediaSource.endOfStream()\n                      }\n                      break\n                    }\n\n                    const chunk = playQueue.shift()\n                    if (!chunk) {\n                      await new Promise<void>((resolveSleep) => {\n                        window.setTimeout(resolveSleep, 50)\n                      })\n                      continue\n                    }\n\n                    await waitForSourceBuffer(sourceBuffer)\n                    sourceBuffer.appendBuffer(chunk)\n                    await waitForSourceBuffer(sourceBuffer)\n                  }\n                }\n\n                void processQueue()\n                resolve()\n              } catch (error) {\n                reject(error)\n              }\n            },\n            { once: true },\n          )\n        })\n\n        await sourceReady\n      }\n\n      while (true) {\n        const { done, value } = await reader.read()\n        if (done) {\n          readingDone = true\n          break\n        }\n\n        receivedLength += value.byteLength\n\n        if (ttftMs < 0) {\n          ttftMs = performance.now() - startTime\n        }\n\n        setMetrics({\n          textLength: inputText.length,\n          ttftMs,\n          receivedKb: Math.round(receivedLength / 1024),\n        })\n\n        const chunk = value.buffer.slice(value.byteOffset, value.byteOffset + value.byteLength)\n        playQueue.push(chunk)\n        allChunks.push(chunk)\n\n        if (useStreamingPlayback && audioElement.paused) {\n          void audioElement.play().catch(() => undefined)\n        }\n      }\n\n      const audioBlob = new 
Blob(allChunks, { type: mime })\n      const nextDownloadUrl = URL.createObjectURL(audioBlob)\n      downloadUrlRef.current = nextDownloadUrl\n      setDownloadUrl(nextDownloadUrl)\n      setDownloadName(`${createFileName(inputText)}.${controls.format}`)\n\n      if (!useStreamingPlayback) {\n        audioElement.src = nextDownloadUrl\n        audioElement.load()\n        setStatusMessage({\n          tone: 'info',\n          message: `Format \"${controls.format}\" is not supported for in-browser playback. The file is ready to download after generation completes.`,\n        })\n      }\n    } catch (error) {\n      setStatusMessage({\n        tone: 'error',\n        message: `Audio generation failed: ${getErrorMessage(error)}`,\n      })\n    } finally {\n      setIsGenerating(false)\n    }\n  }\n\n  const requestPreview = JSON.stringify(\n    buildPreviewPayload(inputText, controls, speakerGroups),\n    null,\n    2,\n  )\n\n  const totalReferenceCount = speakerGroups.reduce(\n    (count, speakerGroup) => count + speakerGroup.references.length,\n    0,\n  )\n\n  return (\n    <main className=\"min-h-screen bg-zinc-50\">\n      <div className=\"mx-auto max-w-[1600px] px-3 py-3 sm:px-4 lg:px-5\">\n        <div className=\"grid gap-4 xl:h-[calc(100vh-1.5rem)] xl:grid-cols-[minmax(0,1fr)_460px]\">\n          <section className=\"grid gap-4 xl:min-h-0 xl:grid-rows-[minmax(0,1fr)_auto]\">\n            <Card className=\"rounded-xl border-zinc-200 bg-white shadow-none xl:min-h-0 xl:flex xl:flex-col\">\n              <CardHeader className=\"space-y-1 border-b border-zinc-100 px-4 py-4\">\n                <div className=\"flex items-center gap-2 text-zinc-700\">\n                  <FileText className=\"size-4\" />\n                  <CardTitle>Input</CardTitle>\n                </div>\n                <CardDescription>\n                  Enter the text to synthesize and inspect the outgoing request payload.\n                </CardDescription>\n              </CardHeader>\n  
            <CardContent className=\"space-y-4 px-4 pt-4 xl:min-h-0 xl:flex-1 xl:overflow-y-auto\">\n                <div className=\"space-y-2\">\n                  <Label htmlFor=\"inputText\">Input Text</Label>\n                  <Textarea\n                    id=\"inputText\"\n                    value={inputText}\n                    onChange={(event) => setInputText(event.target.value)}\n                    placeholder=\"Enter text to synthesize\"\n                    className=\"min-h-[220px] resize-y rounded-xl border-zinc-200 bg-white p-3 text-sm shadow-none focus-visible:ring-zinc-300 xl:min-h-[260px]\"\n                  />\n                </div>\n\n                <Collapsible open={isRequestPreviewOpen} onOpenChange={setIsRequestPreviewOpen}>\n                  <div className=\"rounded-xl border border-zinc-200 bg-zinc-50\">\n                    <div className=\"flex flex-col gap-2 p-3 sm:flex-row sm:items-center sm:justify-between\">\n                      <div>\n                        <div className=\"text-sm font-medium text-zinc-900\">Request Preview</div>\n                        <div className=\"text-xs text-zinc-500\">\n                          Live snapshot of the payload sent to the backend.\n                        </div>\n                      </div>\n                      <div className=\"flex items-center gap-2\">\n                        <Button\n                          type=\"button\"\n                          variant=\"ghost\"\n                          size=\"sm\"\n                          className=\"border border-zinc-200 bg-white text-zinc-700 hover:bg-zinc-100\"\n                          onClick={copyRequestPreview}\n                        >\n                          <Copy className=\"size-3.5\" />\n                          {copyLabel}\n                        </Button>\n                        <CollapsibleTrigger asChild>\n                          <Button\n                            type=\"button\"\n                   
         variant=\"ghost\"\n                            size=\"sm\"\n                            className=\"border border-zinc-200 bg-white text-zinc-700 hover:bg-zinc-100\"\n                          >\n                            {isRequestPreviewOpen ? 'Collapse' : 'Expand'}\n                            <ChevronDown\n                              className={`size-4 transition-transform ${\n                                isRequestPreviewOpen ? 'rotate-180' : ''\n                              }`}\n                            />\n                          </Button>\n                        </CollapsibleTrigger>\n                      </div>\n                    </div>\n                    <CollapsibleContent>\n                      <Separator className=\"bg-zinc-200\" />\n                      <div className=\"p-3 pt-3\">\n                        <ScrollArea className=\"h-56 min-w-0 rounded-lg border border-zinc-200 bg-white\">\n                          <pre className=\"max-w-full whitespace-pre-wrap break-all p-3 text-xs leading-5 text-zinc-700\">\n                            {requestPreview}\n                          </pre>\n                        </ScrollArea>\n                      </div>\n                    </CollapsibleContent>\n                  </div>\n                </Collapsible>\n\n                <div className=\"space-y-4\">\n                  <Button\n                    type=\"button\"\n                    size=\"lg\"\n                    className=\"h-11 rounded-lg bg-zinc-900 text-white hover:bg-zinc-800\"\n                    onClick={handleGenerateAudio}\n                    disabled={isGenerating}\n                  >\n                    {isGenerating ? (\n                      <LoaderCircle className=\"size-4 animate-spin\" />\n                    ) : (\n                      <AudioLines className=\"size-4\" />\n                    )}\n                    {isGenerating ? 'Generating Audio...' 
: 'Generate Audio'}\n                  </Button>\n\n                  {statusMessage ? (\n                    <Alert\n                      variant={statusMessage.tone === 'error' ? 'destructive' : 'warning'}\n                      className=\"rounded-lg\"\n                    >\n                      <div className=\"flex items-start gap-3\">\n                        {statusMessage.tone === 'error' ? (\n                          <CircleAlert className=\"mt-0.5 size-4 shrink-0\" />\n                        ) : (\n                          <Info className=\"mt-0.5 size-4 shrink-0\" />\n                        )}\n                        <div>\n                          <AlertTitle>\n                            {statusMessage.tone === 'error' ? 'Error' : 'Notice'}\n                          </AlertTitle>\n                          <AlertDescription>{statusMessage.message}</AlertDescription>\n                        </div>\n                      </div>\n                    </Alert>\n                  ) : null}\n                </div>\n              </CardContent>\n            </Card>\n\n            <Card className=\"rounded-xl border-zinc-200 bg-white shadow-none\">\n              <CardHeader className=\"space-y-1 border-b border-zinc-100 px-4 py-4\">\n                <div className=\"flex items-center gap-2 text-zinc-700\">\n                  <AudioLines className=\"size-4\" />\n                  <CardTitle>Output</CardTitle>\n                </div>\n                <CardDescription>\n                  Stream the result when supported, then preview or download the final file.\n                </CardDescription>\n              </CardHeader>\n              <CardContent className=\"space-y-3 px-4 pt-4\">\n                <audio\n                  ref={audioRef}\n                  controls\n                  className=\"w-full rounded-lg border border-zinc-200 bg-white\"\n                />\n\n                <div className=\"flex flex-wrap gap-2\">\n                  
{metrics ? (\n                    <>\n                      <Badge variant=\"outline\" className=\"border-zinc-200 bg-white text-zinc-700\">\n                        Text length: {metrics.textLength}\n                      </Badge>\n                      <Badge variant=\"outline\" className=\"border-zinc-200 bg-white text-zinc-700\">\n                        TTFT: {metrics.ttftMs.toFixed(2)} ms\n                      </Badge>\n                      <Badge variant=\"outline\" className=\"border-zinc-200 bg-white text-zinc-700\">\n                        Received: {metrics.receivedKb} KB\n                      </Badge>\n                    </>\n                  ) : (\n                    <Badge variant=\"outline\" className=\"border-zinc-200 bg-white text-zinc-500\">\n                      No output yet\n                    </Badge>\n                  )}\n                </div>\n\n                <div className=\"flex justify-end\">\n                  {downloadUrl ? (\n                    <Button\n                      asChild\n                      variant=\"outline\"\n                      className=\"border-zinc-200 bg-white text-zinc-800 hover:bg-zinc-100\"\n                    >\n                      <a href={downloadUrl} download={downloadName}>\n                        <Download className=\"size-4\" />\n                        Download\n                      </a>\n                    </Button>\n                  ) : null}\n                </div>\n              </CardContent>\n            </Card>\n          </section>\n\n          <aside className=\"grid gap-4 xl:min-h-0 xl:grid-rows-[minmax(0,1fr)_auto]\">\n            <Card className=\"rounded-xl border-zinc-200 bg-white shadow-none xl:min-h-0 xl:flex xl:flex-col\">\n              <CardHeader className=\"space-y-1 border-b border-zinc-100 px-4 py-4\">\n                <div className=\"flex items-center gap-2 text-zinc-700\">\n                  <Upload className=\"size-4\" />\n                  
<CardTitle>Reference Audio</CardTitle>\n                </div>\n                <CardDescription>\n                  Build one or more speaker groups. Each speaker can have multiple reference clips.\n                </CardDescription>\n              </CardHeader>\n              <CardContent className=\"space-y-3 px-4 pt-4 xl:min-h-0 xl:flex xl:flex-1 xl:flex-col\">\n                <div className=\"flex flex-wrap items-center justify-between gap-2\">\n                  <div className=\"flex items-center text-sm text-zinc-500\">\n                    {speakerGroups.length} speaker{speakerGroups.length === 1 ? '' : 's'} /{' '}\n                    {totalReferenceCount} reference{totalReferenceCount === 1 ? '' : 's'}\n                  </div>\n                  <Button\n                    type=\"button\"\n                    variant=\"outline\"\n                    className=\"border-zinc-200 bg-white hover:bg-zinc-100\"\n                    onClick={addSpeaker}\n                  >\n                    <Plus className=\"size-4\" />\n                    Add Speaker\n                  </Button>\n                  <input\n                    ref={fileInputRef}\n                    type=\"file\"\n                    accept=\"audio/*\"\n                    className=\"hidden\"\n                    onChange={handleReferenceUpload}\n                  />\n                </div>\n\n                <ScrollArea className=\"min-h-0 rounded-md xl:h-full xl:flex-1\">\n                  <div className=\"space-y-2\">\n                    {speakerGroups.length > 0 ? (\n                      speakerGroups.map((speakerGroup, speakerIndex) => (\n                        <Collapsible\n                          key={speakerGroup.id}\n                          open={openSpeakerIds.includes(speakerGroup.id)}\n                          onOpenChange={(open) => {\n                            setOpenSpeakerIds((current) =>\n                              open\n                                ? 
[...current, speakerGroup.id]\n                                : current.filter(\n                                    (currentSpeakerId) => currentSpeakerId !== speakerGroup.id,\n                                  ),\n                            )\n                          }}\n                        >\n                          <div className=\"rounded-lg border border-zinc-200 bg-white\">\n                            <div className=\"flex flex-col gap-2 px-3 py-3 sm:flex-row sm:items-center sm:justify-between\">\n                              <div className=\"min-w-0\">\n                                <div className=\"text-sm font-medium text-zinc-900\">\n                                  Speaker {speakerIndex}\n                                </div>\n                                <div className=\"text-xs text-zinc-500\">\n                                  {speakerGroup.references.length} reference\n                                  {speakerGroup.references.length === 1 ? '' : 's'}\n                                </div>\n                              </div>\n                              <div className=\"flex flex-wrap gap-2\">\n                                <Button\n                                  type=\"button\"\n                                  variant=\"outline\"\n                                  size=\"sm\"\n                                  className=\"h-8 border-zinc-200 bg-white px-2.5 hover:bg-zinc-100\"\n                                  onClick={() => {\n                                    uploadTargetSpeakerIdRef.current = speakerGroup.id\n                                    fileInputRef.current?.click()\n                                  }}\n                                >\n                                  <Upload className=\"size-4\" />\n                                  Upload\n                                </Button>\n                                {speakerGroups.length > 1 ? 
(\n                                  <Button\n                                    type=\"button\"\n                                    variant=\"ghost\"\n                                    size=\"sm\"\n                                    className=\"h-8 px-2.5 text-zinc-500 hover:bg-zinc-100 hover:text-zinc-900\"\n                                    onClick={() => removeSpeaker(speakerGroup.id)}\n                                  >\n                                    Remove\n                                  </Button>\n                                ) : null}\n                                <CollapsibleTrigger asChild>\n                                  <Button\n                                    type=\"button\"\n                                    variant=\"ghost\"\n                                    size=\"sm\"\n                                    className=\"h-8 px-2 text-zinc-500 hover:bg-zinc-100 hover:text-zinc-900\"\n                                  >\n                                    <ChevronDown\n                                      className={`size-4 transition-transform ${\n                                        openSpeakerIds.includes(speakerGroup.id) ? 'rotate-180' : ''\n                                      }`}\n                                    />\n                                  </Button>\n                                </CollapsibleTrigger>\n                              </div>\n                            </div>\n\n                            <CollapsibleContent>\n                              <Separator className=\"bg-zinc-200\" />\n                              <div className=\"space-y-2 px-3 py-2.5\">\n                                {speakerGroup.references.length > 0 ? 
(\n                                  speakerGroup.references.map((reference) => (\n                                    <div\n                                      key={reference.id}\n                                      className=\"flex flex-col gap-2 rounded-md border border-zinc-200 bg-zinc-50 p-2 sm:flex-row sm:items-center\"\n                                    >\n                                      <audio\n                                        controls\n                                        src={reference.previewUrl}\n                                        className=\"h-9 w-full min-w-0 rounded-md border border-zinc-200 bg-white sm:flex-1\"\n                                      />\n                                      <div className=\"flex gap-2 sm:shrink-0\">\n                                        <Button\n                                          type=\"button\"\n                                          variant=\"ghost\"\n                                          size=\"sm\"\n                                          className=\"h-8 border border-zinc-200 bg-white px-2.5 text-zinc-600 hover:bg-zinc-100 hover:text-zinc-900\"\n                                          onClick={() =>\n                                            setPendingReference({\n                                              mode: 'edit',\n                                              speakerId: speakerGroup.id,\n                                              referenceId: reference.id,\n                                              name: reference.name,\n                                              text: reference.text,\n                                            })\n                                          }\n                                        >\n                                          Edit Text\n                                        </Button>\n                                        <Button\n                                          type=\"button\"\n                
                          variant=\"ghost\"\n                                          size=\"sm\"\n                                          className=\"h-8 border border-zinc-200 bg-white px-2.5 text-zinc-500 hover:bg-zinc-100 hover:text-zinc-900\"\n                                          onClick={() =>\n                                            removeReference(speakerGroup.id, reference.id)\n                                          }\n                                        >\n                                          Remove\n                                        </Button>\n                                      </div>\n                                    </div>\n                                  ))\n                                ) : (\n                                  <div className=\"px-1 py-3 text-sm text-zinc-500\">\n                                    No references yet.\n                                  </div>\n                                )}\n                              </div>\n                            </CollapsibleContent>\n                          </div>\n                        </Collapsible>\n                      ))\n                    ) : (\n                      <div className=\"rounded-lg border border-dashed border-zinc-300 bg-white p-4 text-sm text-zinc-500\">\n                        No speaker groups configured yet.\n                      </div>\n                    )}\n                  </div>\n                </ScrollArea>\n              </CardContent>\n            </Card>\n\n            <Card className=\"rounded-xl border-zinc-200 bg-white shadow-none\">\n              <CardHeader className=\"space-y-1 border-b border-zinc-100 px-4 py-4\">\n                <div className=\"flex items-center gap-2 text-zinc-700\">\n                  <Settings2 className=\"size-4\" />\n                  <CardTitle>Generation Settings</CardTitle>\n                </div>\n                <CardDescription>Adjust sampling and output 
parameters.</CardDescription>\n              </CardHeader>\n              <CardContent className=\"space-y-4 px-4 pt-4\">\n                <div className=\"space-y-2\">\n                  <Label>Latency Mode</Label>\n                  <ToggleGroup\n                    type=\"single\"\n                    value={controls.latency}\n                    className=\"grid grid-cols-2 gap-2\"\n                    onValueChange={(value) => {\n                      if (value) {\n                        setControls((current) => ({\n                          ...current,\n                          latency: value as LatencyMode,\n                        }))\n                      }\n                    }}\n                  >\n                    <ToggleGroupItem value=\"balanced\" className=\"w-full\">\n                      balanced\n                    </ToggleGroupItem>\n                    <ToggleGroupItem value=\"normal\" className=\"w-full\">\n                      normal\n                    </ToggleGroupItem>\n                  </ToggleGroup>\n                  <p className=\"text-xs text-zinc-500\">\n                    Balanced uses incremental local decode for faster first audio. 
Normal waits for the\n                    full LLM result, then decodes once.\n                  </p>\n                </div>\n\n                <div className=\"space-y-2\">\n                  <Label>Format</Label>\n                  <ToggleGroup\n                    type=\"single\"\n                    value={controls.format}\n                    className=\"grid grid-cols-4 gap-2\"\n                    onValueChange={(value) => {\n                      if (value) {\n                        setControls((current) => ({\n                          ...current,\n                          format: value as AudioFormat,\n                        }))\n                      }\n                    }}\n                  >\n                    <ToggleGroupItem value=\"mp3\" className=\"w-full\">\n                      mp3\n                    </ToggleGroupItem>\n                    <ToggleGroupItem value=\"wav\" className=\"w-full\">\n                      wav\n                    </ToggleGroupItem>\n                    <ToggleGroupItem value=\"pcm\" className=\"w-full\">\n                      pcm\n                    </ToggleGroupItem>\n                    <ToggleGroupItem value=\"opus\" className=\"w-full\">\n                      opus\n                    </ToggleGroupItem>\n                  </ToggleGroup>\n                </div>\n\n                <div className=\"flex items-center justify-between rounded-lg border border-zinc-200 bg-zinc-50 px-3 py-2.5\">\n                  <div className=\"space-y-1\">\n                    <Label htmlFor=\"normalize\">Normalize</Label>\n                    <p className=\"text-xs text-zinc-500\">\n                      Normalize text before synthesis to keep input formatting consistent.\n                    </p>\n                  </div>\n                  <Switch\n                    id=\"normalize\"\n                    checked={controls.normalize}\n                    onCheckedChange={(checked) =>\n                      
setControls((current) => ({\n                        ...current,\n                        normalize: checked,\n                      }))\n                    }\n                  />\n                </div>\n\n                <Separator className=\"bg-zinc-200\" />\n\n                <SettingSlider\n                  label=\"Chunk Length\"\n                  value={controls.chunkLength}\n                  min={100}\n                  max={1000}\n                  onValueChange={(value) =>\n                    setControls((current) => ({\n                      ...current,\n                      chunkLength: value,\n                    }))\n                  }\n                />\n                <SettingSlider\n                  label=\"Max New Tokens\"\n                  value={controls.maxNewTokens}\n                  min={256}\n                  max={2048}\n                  onValueChange={(value) =>\n                    setControls((current) => ({\n                      ...current,\n                      maxNewTokens: value,\n                    }))\n                  }\n                />\n                <SettingSlider\n                  label=\"Temperature\"\n                  value={controls.temperature}\n                  min={0.8}\n                  max={1}\n                  step={0.01}\n                  formatValue={(value) => value.toFixed(2)}\n                  onValueChange={(value) =>\n                    setControls((current) => ({\n                      ...current,\n                      temperature: value,\n                    }))\n                  }\n                />\n                <SettingSlider\n                  label=\"Top P\"\n                  value={controls.topP}\n                  min={0.8}\n                  max={1}\n                  step={0.01}\n                  formatValue={(value) => value.toFixed(2)}\n                  onValueChange={(value) =>\n                    setControls((current) => ({\n                      
...current,\n                      topP: value,\n                    }))\n                  }\n                />\n                <SettingSlider\n                  label=\"Repetition Penalty\"\n                  value={controls.repetitionPenalty}\n                  min={1}\n                  max={1.2}\n                  step={0.01}\n                  formatValue={(value) => value.toFixed(2)}\n                  onValueChange={(value) =>\n                    setControls((current) => ({\n                      ...current,\n                      repetitionPenalty: value,\n                    }))\n                  }\n                />\n              </CardContent>\n            </Card>\n          </aside>\n        </div>\n      </div>\n\n      <Dialog open={pendingReference !== null} onOpenChange={(open) => !open && setPendingReference(null)}>\n        <DialogContent className=\"border-zinc-200 bg-white\">\n          <DialogHeader>\n            <DialogTitle>\n              {pendingReference?.mode === 'create' ? 'Save Reference Text' : 'Edit Reference Text'}\n            </DialogTitle>\n            <DialogDescription>\n              {pendingReference\n                ? `Speaker ${speakerGroups.findIndex(\n                    (speakerGroup) => speakerGroup.id === pendingReference.speakerId,\n                  )}`\n                : ''}\n            </DialogDescription>\n          </DialogHeader>\n          <div className=\"space-y-3\">\n            <div className=\"text-sm font-medium text-zinc-900\">{pendingReference?.name}</div>\n            <Textarea\n              value={pendingReference?.text ?? ''}\n              onChange={(event) =>\n                setPendingReference((current) =>\n                  current\n                    ? 
{\n                        ...current,\n                        text: event.target.value,\n                      }\n                    : current,\n                )\n              }\n              placeholder=\"Enter reference text\"\n              className=\"min-h-40 rounded-lg border-zinc-200 bg-white shadow-none focus-visible:ring-zinc-300\"\n            />\n          </div>\n          <DialogFooter>\n            <Button type=\"button\" variant=\"ghost\" onClick={() => setPendingReference(null)}>\n              Cancel\n            </Button>\n            <Button\n              type=\"button\"\n              variant=\"outline\"\n              className=\"border-zinc-200 bg-white hover:bg-zinc-100\"\n              onClick={savePendingReference}\n            >\n              Save\n            </Button>\n          </DialogFooter>\n        </DialogContent>\n      </Dialog>\n    </main>\n  )\n}\n\nexport default App\n"
  },
  {
    "path": "awesome_webui/src/components/ui/alert.tsx",
    "content": "import * as React from 'react'\nimport { cva, type VariantProps } from 'class-variance-authority'\n\nimport { cn } from '@/lib/utils'\n\nconst alertVariants = cva('relative w-full rounded-lg border px-4 py-3 text-sm', {\n  variants: {\n    variant: {\n      default: 'bg-card text-card-foreground',\n      destructive: 'border-destructive/20 bg-destructive/5 text-destructive',\n      warning: 'border-amber-200 bg-amber-50 text-amber-900',\n    },\n  },\n  defaultVariants: {\n    variant: 'default',\n  },\n})\n\nfunction Alert({\n  className,\n  variant,\n  ...props\n}: React.ComponentProps<'div'> & VariantProps<typeof alertVariants>) {\n  return <div role=\"alert\" className={cn(alertVariants({ variant }), className)} {...props} />\n}\n\nfunction AlertTitle({ className, ...props }: React.ComponentProps<'h5'>) {\n  return <h5 className={cn('mb-1 font-medium leading-none tracking-tight', className)} {...props} />\n}\n\nfunction AlertDescription({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('text-sm [&_p]:leading-relaxed', className)} {...props} />\n}\n\nexport { Alert, AlertDescription, AlertTitle }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/badge.tsx",
    "content": "/* eslint-disable react-refresh/only-export-components */\nimport * as React from 'react'\nimport { cva, type VariantProps } from 'class-variance-authority'\n\nimport { cn } from '@/lib/utils'\n\nconst badgeVariants = cva(\n  'inline-flex items-center rounded-md border px-2 py-0.5 text-xs font-medium transition-colors',\n  {\n    variants: {\n      variant: {\n        default: 'border-transparent bg-primary text-primary-foreground',\n        secondary: 'border-transparent bg-secondary text-secondary-foreground',\n        outline: 'text-foreground',\n      },\n    },\n    defaultVariants: {\n      variant: 'default',\n    },\n  },\n)\n\nfunction Badge({\n  className,\n  variant,\n  ...props\n}: React.ComponentProps<'div'> & VariantProps<typeof badgeVariants>) {\n  return <div className={cn(badgeVariants({ variant }), className)} {...props} />\n}\n\nexport { Badge, badgeVariants }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/button.tsx",
    "content": "/* eslint-disable react-refresh/only-export-components */\nimport * as React from 'react'\nimport { Slot } from '@radix-ui/react-slot'\nimport { cva, type VariantProps } from 'class-variance-authority'\n\nimport { cn } from '@/lib/utils'\n\nconst buttonVariants = cva(\n  'inline-flex items-center justify-center gap-2 whitespace-nowrap rounded-md text-sm font-medium transition-colors disabled:pointer-events-none disabled:opacity-50 outline-none focus-visible:ring-2 focus-visible:ring-ring/70 focus-visible:ring-offset-2 focus-visible:ring-offset-background',\n  {\n    variants: {\n      variant: {\n        default: 'bg-primary text-primary-foreground hover:bg-primary/90',\n        destructive: 'bg-destructive text-destructive-foreground hover:bg-destructive/90',\n        outline: 'border bg-card hover:bg-accent hover:text-accent-foreground',\n        secondary: 'bg-secondary text-secondary-foreground hover:bg-secondary/80',\n        ghost: 'hover:bg-accent hover:text-accent-foreground',\n      },\n      size: {\n        default: 'h-9 px-4 py-2',\n        sm: 'h-8 rounded-md px-3 text-xs',\n        lg: 'h-11 rounded-md px-6',\n        icon: 'size-9',\n      },\n    },\n    defaultVariants: {\n      variant: 'default',\n      size: 'default',\n    },\n  },\n)\n\ntype ButtonProps = React.ComponentProps<'button'> &\n  VariantProps<typeof buttonVariants> & {\n    asChild?: boolean\n  }\n\nfunction Button({ className, variant, size, asChild = false, ...props }: ButtonProps) {\n  const Comp = asChild ? Slot : 'button'\n\n  return <Comp className={cn(buttonVariants({ variant, size, className }))} {...props} />\n}\n\nexport { Button, buttonVariants }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/card.tsx",
    "content": "import * as React from 'react'\n\nimport { cn } from '@/lib/utils'\n\nfunction Card({ className, ...props }: React.ComponentProps<'div'>) {\n  return (\n    <div\n      data-slot=\"card\"\n      className={cn('rounded-xl border bg-card text-card-foreground shadow-sm', className)}\n      {...props}\n    />\n  )\n}\n\nfunction CardHeader({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('flex flex-col space-y-1.5 p-6', className)} {...props} />\n}\n\nfunction CardTitle({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('text-base font-semibold leading-none tracking-tight', className)} {...props} />\n}\n\nfunction CardDescription({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('text-sm text-muted-foreground', className)} {...props} />\n}\n\nfunction CardContent({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('p-6 pt-0', className)} {...props} />\n}\n\nexport { Card, CardContent, CardDescription, CardHeader, CardTitle }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/collapsible.tsx",
    "content": "import * as CollapsiblePrimitive from '@radix-ui/react-collapsible'\n\nconst Collapsible = CollapsiblePrimitive.Root\nconst CollapsibleTrigger = CollapsiblePrimitive.CollapsibleTrigger\nconst CollapsibleContent = CollapsiblePrimitive.CollapsibleContent\n\nexport { Collapsible, CollapsibleContent, CollapsibleTrigger }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/dialog.tsx",
    "content": "import * as React from 'react'\nimport * as DialogPrimitive from '@radix-ui/react-dialog'\nimport { X } from 'lucide-react'\n\nimport { cn } from '@/lib/utils'\n\nconst Dialog = DialogPrimitive.Root\nconst DialogTrigger = DialogPrimitive.Trigger\nconst DialogPortal = DialogPrimitive.Portal\nconst DialogClose = DialogPrimitive.Close\n\nfunction DialogOverlay({\n  className,\n  ...props\n}: React.ComponentProps<typeof DialogPrimitive.Overlay>) {\n  return (\n    <DialogPrimitive.Overlay\n      className={cn('fixed inset-0 z-50 bg-black/40', className)}\n      {...props}\n    />\n  )\n}\n\nfunction DialogContent({\n  className,\n  children,\n  ...props\n}: React.ComponentProps<typeof DialogPrimitive.Content>) {\n  return (\n    <DialogPortal>\n      <DialogOverlay />\n      <DialogPrimitive.Content\n        className={cn(\n          'fixed left-1/2 top-1/2 z-50 grid w-full max-w-lg -translate-x-1/2 -translate-y-1/2 gap-4 rounded-xl border bg-background p-6 shadow-lg duration-200',\n          className,\n        )}\n        {...props}\n      >\n        {children}\n        <DialogClose className=\"absolute right-4 top-4 rounded-sm opacity-70 transition-opacity hover:opacity-100 focus-visible:ring-2 focus-visible:ring-ring/70\">\n          <X className=\"size-4\" />\n          <span className=\"sr-only\">Close</span>\n        </DialogClose>\n      </DialogPrimitive.Content>\n    </DialogPortal>\n  )\n}\n\nfunction DialogHeader({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('flex flex-col space-y-1.5 text-left', className)} {...props} />\n}\n\nfunction DialogFooter({ className, ...props }: React.ComponentProps<'div'>) {\n  return <div className={cn('flex flex-col-reverse gap-2 sm:flex-row sm:justify-end', className)} {...props} />\n}\n\nfunction DialogTitle({ className, ...props }: React.ComponentProps<typeof DialogPrimitive.Title>) {\n  return (\n    <DialogPrimitive.Title\n      className={cn('text-lg font-semibold 
leading-none tracking-tight', className)}\n      {...props}\n    />\n  )\n}\n\nfunction DialogDescription({\n  className,\n  ...props\n}: React.ComponentProps<typeof DialogPrimitive.Description>) {\n  return (\n    <DialogPrimitive.Description\n      className={cn('text-sm text-muted-foreground', className)}\n      {...props}\n    />\n  )\n}\n\nexport {\n  Dialog,\n  DialogContent,\n  DialogDescription,\n  DialogFooter,\n  DialogHeader,\n  DialogTitle,\n  DialogTrigger,\n}\n"
  },
  {
    "path": "awesome_webui/src/components/ui/label.tsx",
    "content": "import * as React from 'react'\nimport * as LabelPrimitive from '@radix-ui/react-label'\n\nimport { cn } from '@/lib/utils'\n\nfunction Label({ className, ...props }: React.ComponentProps<typeof LabelPrimitive.Root>) {\n  return (\n    <LabelPrimitive.Root\n      className={cn('text-sm font-medium leading-none', className)}\n      {...props}\n    />\n  )\n}\n\nexport { Label }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/scroll-area.tsx",
    "content": "import * as React from 'react'\nimport * as ScrollAreaPrimitive from '@radix-ui/react-scroll-area'\n\nimport { cn } from '@/lib/utils'\n\nfunction ScrollArea({\n  className,\n  children,\n  ...props\n}: React.ComponentProps<typeof ScrollAreaPrimitive.Root>) {\n  return (\n    <ScrollAreaPrimitive.Root className={cn('relative overflow-hidden', className)} {...props}>\n      <ScrollAreaPrimitive.Viewport className=\"h-full w-full rounded-[inherit]\">\n        {children}\n      </ScrollAreaPrimitive.Viewport>\n      <ScrollBar />\n      <ScrollAreaPrimitive.Corner />\n    </ScrollAreaPrimitive.Root>\n  )\n}\n\nfunction ScrollBar({\n  className,\n  orientation = 'vertical',\n  ...props\n}: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>) {\n  return (\n    <ScrollAreaPrimitive.ScrollAreaScrollbar\n      orientation={orientation}\n      className={cn(\n        'flex touch-none select-none p-px transition-colors',\n        orientation === 'vertical' && 'h-full w-2.5 border-l border-l-transparent',\n        orientation === 'horizontal' && 'h-2.5 flex-col border-t border-t-transparent',\n        className,\n      )}\n      {...props}\n    >\n      <ScrollAreaPrimitive.ScrollAreaThumb className=\"relative flex-1 rounded-full bg-border\" />\n    </ScrollAreaPrimitive.ScrollAreaScrollbar>\n  )\n}\n\nexport { ScrollArea, ScrollBar }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/separator.tsx",
    "content": "import * as React from 'react'\nimport * as SeparatorPrimitive from '@radix-ui/react-separator'\n\nimport { cn } from '@/lib/utils'\n\nfunction Separator({\n  className,\n  orientation = 'horizontal',\n  decorative = true,\n  ...props\n}: React.ComponentProps<typeof SeparatorPrimitive.Root>) {\n  return (\n    <SeparatorPrimitive.Root\n      decorative={decorative}\n      orientation={orientation}\n      className={cn(\n        'shrink-0 bg-border',\n        orientation === 'horizontal' ? 'h-px w-full' : 'h-full w-px',\n        className,\n      )}\n      {...props}\n    />\n  )\n}\n\nexport { Separator }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/slider.tsx",
    "content": "import * as React from 'react'\nimport * as SliderPrimitive from '@radix-ui/react-slider'\n\nimport { cn } from '@/lib/utils'\n\nfunction Slider({\n  className,\n  ...props\n}: React.ComponentProps<typeof SliderPrimitive.Root>) {\n  return (\n    <SliderPrimitive.Root\n      className={cn('relative flex w-full touch-none select-none items-center', className)}\n      {...props}\n    >\n      <SliderPrimitive.Track className=\"relative h-1.5 w-full grow overflow-hidden rounded-full bg-muted\">\n        <SliderPrimitive.Range className=\"absolute h-full bg-primary\" />\n      </SliderPrimitive.Track>\n      <SliderPrimitive.Thumb className=\"block size-4 rounded-full border border-primary/20 bg-background shadow-sm transition-colors focus-visible:ring-2 focus-visible:ring-ring/70 focus-visible:ring-offset-2 focus-visible:ring-offset-background disabled:pointer-events-none disabled:opacity-50\" />\n    </SliderPrimitive.Root>\n  )\n}\n\nexport { Slider }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/switch.tsx",
    "content": "import * as React from 'react'\nimport * as SwitchPrimitive from '@radix-ui/react-switch'\n\nimport { cn } from '@/lib/utils'\n\nfunction Switch({\n  className,\n  ...props\n}: React.ComponentProps<typeof SwitchPrimitive.Root>) {\n  return (\n    <SwitchPrimitive.Root\n      className={cn(\n        'peer inline-flex h-6 w-11 shrink-0 cursor-pointer items-center rounded-full border border-transparent bg-input shadow-xs transition-colors outline-none focus-visible:ring-2 focus-visible:ring-ring/70 focus-visible:ring-offset-2 focus-visible:ring-offset-background data-[state=checked]:bg-primary data-[state=unchecked]:bg-muted-foreground/30 disabled:cursor-not-allowed disabled:opacity-50',\n        className,\n      )}\n      {...props}\n    >\n      <SwitchPrimitive.Thumb\n        className={cn(\n          'pointer-events-none block size-5 rounded-full bg-background shadow-sm ring-0 transition-transform data-[state=checked]:translate-x-5 data-[state=unchecked]:translate-x-0',\n        )}\n      />\n    </SwitchPrimitive.Root>\n  )\n}\n\nexport { Switch }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/textarea.tsx",
    "content": "import * as React from 'react'\n\nimport { cn } from '@/lib/utils'\n\nfunction Textarea({ className, ...props }: React.ComponentProps<'textarea'>) {\n  return (\n    <textarea\n      className={cn(\n        'flex min-h-16 w-full rounded-lg border border-input bg-background px-3 py-2 text-sm shadow-xs outline-none transition-[color,box-shadow] placeholder:text-muted-foreground focus-visible:ring-2 focus-visible:ring-ring/70 disabled:cursor-not-allowed disabled:opacity-50',\n        className,\n      )}\n      {...props}\n    />\n  )\n}\n\nexport { Textarea }\n"
  },
  {
    "path": "awesome_webui/src/components/ui/toggle-group.tsx",
    "content": "import * as React from 'react'\nimport * as ToggleGroupPrimitive from '@radix-ui/react-toggle-group'\nimport { cva, type VariantProps } from 'class-variance-authority'\n\nimport { cn } from '@/lib/utils'\n\nconst toggleGroupItemVariants = cva(\n  'inline-flex items-center justify-center rounded-md text-sm font-medium transition-colors hover:bg-accent hover:text-accent-foreground focus-visible:ring-2 focus-visible:ring-ring/70 focus-visible:ring-offset-2 focus-visible:ring-offset-background disabled:pointer-events-none disabled:opacity-50 data-[state=on]:bg-primary data-[state=on]:text-primary-foreground border border-border bg-card',\n  {\n    variants: {\n      size: {\n        default: 'h-9 px-3',\n        sm: 'h-8 px-2.5 text-xs',\n        lg: 'h-10 px-4',\n      },\n    },\n    defaultVariants: {\n      size: 'default',\n    },\n  },\n)\n\nfunction ToggleGroup({\n  className,\n  ...props\n}: React.ComponentProps<typeof ToggleGroupPrimitive.Root>) {\n  return (\n    <ToggleGroupPrimitive.Root\n      className={cn('flex items-center gap-2', className)}\n      {...props}\n    />\n  )\n}\n\nfunction ToggleGroupItem({\n  className,\n  size,\n  ...props\n}: React.ComponentProps<typeof ToggleGroupPrimitive.Item> &\n  VariantProps<typeof toggleGroupItemVariants>) {\n  return (\n    <ToggleGroupPrimitive.Item\n      className={cn(toggleGroupItemVariants({ size }), className)}\n      {...props}\n    />\n  )\n}\n\nexport { ToggleGroup, ToggleGroupItem }\n"
  },
  {
    "path": "awesome_webui/src/index.css",
    "content": "@import \"tailwindcss\";\n\n:root {\n  --background: 0 0% 96%;\n  --foreground: 240 10% 3.9%;\n  --card: 0 0% 100%;\n  --card-foreground: 240 10% 3.9%;\n  --popover: 0 0% 100%;\n  --popover-foreground: 240 10% 3.9%;\n  --primary: 240 5.9% 10%;\n  --primary-foreground: 0 0% 98%;\n  --secondary: 240 4.8% 95.9%;\n  --secondary-foreground: 240 5.9% 10%;\n  --muted: 240 4.8% 95.9%;\n  --muted-foreground: 240 3.8% 46.1%;\n  --accent: 240 4.8% 95.9%;\n  --accent-foreground: 240 5.9% 10%;\n  --destructive: 0 72.2% 50.6%;\n  --destructive-foreground: 0 0% 98%;\n  --border: 240 5.9% 88%;\n  --input: 240 5.9% 88%;\n  --ring: 240 5% 64.9%;\n  --radius: 0.75rem;\n}\n\n@theme inline {\n  --color-background: hsl(var(--background));\n  --color-foreground: hsl(var(--foreground));\n  --color-card: hsl(var(--card));\n  --color-card-foreground: hsl(var(--card-foreground));\n  --color-popover: hsl(var(--popover));\n  --color-popover-foreground: hsl(var(--popover-foreground));\n  --color-primary: hsl(var(--primary));\n  --color-primary-foreground: hsl(var(--primary-foreground));\n  --color-secondary: hsl(var(--secondary));\n  --color-secondary-foreground: hsl(var(--secondary-foreground));\n  --color-muted: hsl(var(--muted));\n  --color-muted-foreground: hsl(var(--muted-foreground));\n  --color-accent: hsl(var(--accent));\n  --color-accent-foreground: hsl(var(--accent-foreground));\n  --color-destructive: hsl(var(--destructive));\n  --color-destructive-foreground: hsl(var(--destructive-foreground));\n  --color-border: hsl(var(--border));\n  --color-input: hsl(var(--input));\n  --color-ring: hsl(var(--ring));\n  --radius-sm: calc(var(--radius) - 4px);\n  --radius-md: calc(var(--radius) - 2px);\n  --radius-lg: var(--radius);\n  --radius-xl: calc(var(--radius) + 4px);\n}\n\n@layer base {\n  * {\n    @apply border-border;\n  }\n\n  html {\n    min-width: 320px;\n  }\n\n  body {\n    @apply bg-background text-foreground antialiased;\n    font-family: \"Inter\", \"Avenir 
Next\", \"Segoe UI\", sans-serif;\n  }\n\n  button,\n  input,\n  textarea {\n    font: inherit;\n  }\n}\n"
  },
  {
    "path": "awesome_webui/src/main.tsx",
    "content": "import { StrictMode } from 'react'\nimport { createRoot } from 'react-dom/client'\nimport './index.css'\nimport App from './App.tsx'\n\ncreateRoot(document.getElementById('root')!).render(\n  <StrictMode>\n    <App />\n  </StrictMode>,\n)\n"
  },
  {
    "path": "awesome_webui/tsconfig.app.json",
    "content": "{\n  \"compilerOptions\": {\n    \"tsBuildInfoFile\": \"./node_modules/.tmp/tsconfig.app.tsbuildinfo\",\n    \"target\": \"ES2022\",\n    \"useDefineForClassFields\": true,\n    \"lib\": [\n      \"ES2022\",\n      \"DOM\",\n      \"DOM.Iterable\"\n    ],\n    \"module\": \"ESNext\",\n    \"types\": [\n      \"vite/client\"\n    ],\n    \"skipLibCheck\": true,\n    \"baseUrl\": \".\",\n    \"paths\": {\n      \"@/*\": [\n        \"./src/*\"\n      ]\n    },\n    \"moduleResolution\": \"bundler\",\n    \"allowImportingTsExtensions\": true,\n    \"verbatimModuleSyntax\": true,\n    \"moduleDetection\": \"force\",\n    \"noEmit\": true,\n    \"jsx\": \"react-jsx\",\n    \"strict\": true,\n    \"noUnusedLocals\": true,\n    \"noUnusedParameters\": true,\n    \"erasableSyntaxOnly\": true,\n    \"noFallthroughCasesInSwitch\": true,\n    \"noUncheckedSideEffectImports\": true\n  },\n  \"include\": [\n    \"src\"\n  ]\n}\n"
  },
  {
    "path": "awesome_webui/tsconfig.json",
    "content": "{\n  \"files\": [],\n  \"references\": [\n    { \"path\": \"./tsconfig.app.json\" },\n    { \"path\": \"./tsconfig.node.json\" }\n  ]\n}\n"
  },
  {
    "path": "awesome_webui/tsconfig.node.json",
    "content": "{\n  \"compilerOptions\": {\n    \"tsBuildInfoFile\": \"./node_modules/.tmp/tsconfig.node.tsbuildinfo\",\n    \"target\": \"ES2023\",\n    \"lib\": [\n      \"ES2023\"\n    ],\n    \"module\": \"ESNext\",\n    \"types\": [\n      \"node\"\n    ],\n    \"skipLibCheck\": true,\n    \"baseUrl\": \".\",\n    \"paths\": {\n      \"@/*\": [\n        \"./src/*\"\n      ]\n    },\n    \"moduleResolution\": \"bundler\",\n    \"allowImportingTsExtensions\": true,\n    \"verbatimModuleSyntax\": true,\n    \"moduleDetection\": \"force\",\n    \"noEmit\": true,\n    \"strict\": true,\n    \"noUnusedLocals\": true,\n    \"noUnusedParameters\": true,\n    \"erasableSyntaxOnly\": true,\n    \"noFallthroughCasesInSwitch\": true,\n    \"noUncheckedSideEffectImports\": true\n  },\n  \"include\": [\n    \"vite.config.ts\"\n  ]\n}\n"
  },
  {
    "path": "awesome_webui/vite.config.ts",
    "content": "import fs from 'node:fs'\nimport { defineConfig, type Plugin } from 'vite'\nimport react from '@vitejs/plugin-react-swc'\nimport tailwindcss from '@tailwindcss/vite'\nimport path from 'node:path'\n\nfunction inlineEntryAssets(): Plugin {\n  let resolvedOutDir = ''\n\n  return {\n    name: 'inline-entry-assets',\n    apply: 'build',\n    configResolved(config) {\n      resolvedOutDir = path.resolve(config.root, config.build.outDir)\n    },\n    closeBundle() {\n      const indexHtmlPath = path.join(resolvedOutDir, 'index.html')\n      if (!fs.existsSync(indexHtmlPath)) {\n        return\n      }\n\n      const filesToDelete = new Set<string>()\n      const escapeInlineScript = (code: string) => code.replace(/<\\/script/gi, '<\\\\/script')\n      const escapeInlineStyle = (code: string) => code.replace(/<\\/style/gi, '<\\\\/style')\n      const normalizeFileName = (assetPath: string) =>\n        assetPath.replace(/^\\//, '').replace(/^\\.\\//, '')\n      const readBuiltAsset = (assetPath: string) => {\n        const fileName = normalizeFileName(assetPath)\n        const absolutePath = path.join(resolvedOutDir, fileName)\n        if (!fs.existsSync(absolutePath)) {\n          return null\n        }\n\n        filesToDelete.add(absolutePath)\n        return fs.readFileSync(absolutePath, 'utf8')\n      }\n\n      let html = fs.readFileSync(indexHtmlPath, 'utf8')\n\n      html = html.replace(\n        /<link rel=\"modulepreload\"[^>]+href=\"([^\"]+)\"[^>]*>/g,\n        (_fullMatch, href: string) => {\n          const absolutePath = path.join(resolvedOutDir, normalizeFileName(href))\n          if (fs.existsSync(absolutePath)) {\n            filesToDelete.add(absolutePath)\n          }\n          return ''\n        },\n      )\n\n      html = html.replace(\n        /<link rel=\"stylesheet\"[^>]+href=\"([^\"]+)\"[^>]*>/g,\n        (fullMatch, href: string) => {\n          const assetSource = readBuiltAsset(href)\n          if (!assetSource) {\n            
return fullMatch\n          }\n\n          return `<style>${escapeInlineStyle(assetSource)}</style>`\n        },\n      )\n\n      html = html.replace(\n        /<script type=\"module\"[^>]+src=\"([^\"]+)\"[^>]*><\\/script>/g,\n        (fullMatch, src: string) => {\n          const chunkCode = readBuiltAsset(src)\n          if (!chunkCode) {\n            return fullMatch\n          }\n\n          return `<script type=\"module\">${escapeInlineScript(chunkCode)}</script>`\n        },\n      )\n\n      fs.writeFileSync(indexHtmlPath, html)\n\n      for (const filePath of filesToDelete) {\n        fs.rmSync(filePath, { force: true })\n      }\n\n      fs.rmSync(path.join(resolvedOutDir, 'vite.svg'), { force: true })\n      fs.rmSync(path.join(resolvedOutDir, 'assets'), { recursive: true, force: true })\n    },\n  }\n}\n\n// https://vite.dev/config/\nexport default defineConfig({\n  plugins: [react(), tailwindcss(), inlineEntryAssets()],\n  publicDir: false,\n  resolve: {\n    alias: {\n      '@': path.resolve(__dirname, './src'),\n    },\n  },\n  build: {\n    assetsInlineLimit: Number.MAX_SAFE_INTEGER,\n    cssCodeSplit: false,\n    modulePreload: false,\n    rollupOptions: {\n      output: {\n        inlineDynamicImports: true,\n      },\n    },\n  },\n  server: {\n    proxy: {\n      '/v1': 'http://localhost:8888',\n      '/v2': 'http://localhost:8888',\n      '/health': 'http://localhost:8888',\n    },\n  },\n})\n"
  },
  {
    "path": "compose.base.yml",
    "content": "services:\n  app-base:\n    build:\n      context: .\n      dockerfile: docker/Dockerfile\n      args:\n        BACKEND: ${BACKEND:-cuda}     # or cpu\n        UV_VERSION: ${UV_VERSION:-0.8.15}\n    volumes:\n      - ./checkpoints:/app/checkpoints\n      - ./references:/app/references\n    environment:\n      COMPILE: ${COMPILE:-0}\n    # GPU (remove this block if CPU-only):\n    deploy:\n      resources:\n        reservations:\n          devices:\n            - driver: nvidia\n              count: all\n              capabilities: [gpu]\n    tty: true\n    stdin_open: true\n"
  },
  {
    "path": "compose.yml",
    "content": "name: fish-speech\n\nservices:\n  webui:\n    extends:\n      file: compose.base.yml\n      service: app-base\n    build:\n      target: webui\n    environment:\n      COMPILE: ${COMPILE:-0}\n    profiles: [\"webui\"]\n    ports:\n      - \"${GRADIO_PORT:-7860}:7860\"\n\n  server:\n    extends:\n      file: compose.base.yml\n      service: app-base\n    build:\n      target: server\n    environment:\n      COMPILE: ${COMPILE:-0}\n    profiles: [\"server\"]\n    ports:\n      - \"${API_PORT:-8080}:8080\"\n"
  },
  {
    "path": "docker/Dockerfile",
    "content": "# docker/Dockerfile\n\n# IMPORTANT: The docker images do not contain the checkpoints. You need to mount the checkpoints to the container.\n\n# Build the image:\n#   docker build \\\n#       --platform linux/amd64 \\\n#       -f docker/Dockerfile \\\n#       --build-arg BACKEND=[cuda, cpu] \\\n#       --target [webui, server] \\\n#       -t fish-speech-[webui, server]:[cuda, cpu] .\n\n# e.g. for building the webui:\n#   docker build \\\n#       --platform linux/amd64 \\\n#       -f docker/Dockerfile \\\n#       --build-arg BACKEND=cuda \\\n#       --target webui \\\n#       -t fish-speech-webui:cuda .\n\n# e.g. for building the server:\n#   docker build \\\n#       --platform linux/amd64 \\\n#       -f docker/Dockerfile \\\n#       --build-arg BACKEND=cuda \\\n#       --target server \\\n#       -t fish-speech-server:cuda .\n\n\n\n# Multi-platform build:\n#   docker buildx build \\\n#       --platform linux/amd64,linux/arm64 \\\n#       -f docker/Dockerfile \\\n#       --build-arg BACKEND=cpu \\\n#       --target webui \\\n#       -t fish-speech-webui:cpu .\n\n\n# Running the image interactively:\n#   docker run \\\n#       --gpus all \\\n#       -v /path/to/fish-speech/checkpoints:/app/checkpoints \\\n#       -e COMPILE=1 \\ ... or -e COMPILE=0 \\\n#       -it fish-speech-[webui, server]:[cuda, cpu]\n\n# E.g. running the webui:\n#   docker run \\\n#       --gpus all \\\n#       -v ./checkpoints:/app/checkpoints \\\n#       -e COMPILE=1 \\\n#       -p 7860:7860 \\\n#       fish-speech-webui:cuda\n\n# E.g. 
running the server:\n#   docker run \\\n#       --gpus all \\\n#       -v ./checkpoints:/app/checkpoints \\\n#       -p 8080:8080 \\\n#       -it fish-speech-server:cuda\n\n\n# Select the specific cuda version (see https://hub.docker.com/r/nvidia/cuda/)\nARG CUDA_VER=12.6.0\n# Adapt the uv extra to fit the cuda version (one of [cu126, cu128, cu129])\nARG UV_EXTRA=cu126\nARG BACKEND=cuda\n\nARG UBUNTU_VER=24.04\nARG PY_VER=3.12\nARG UV_VERSION=0.8.15\n\n# Create non-root user early for security\nARG USERNAME=fish\nARG USER_UID=1000\nARG USER_GID=1000\n\n##############################################################\n# Base stage per backend\n##############################################################\n\n# --- CUDA (x86_64) ---\nFROM nvidia/cuda:${CUDA_VER}-cudnn-runtime-ubuntu${UBUNTU_VER} AS base-cuda\nENV DEBIAN_FRONTEND=noninteractive\n\n# Install system dependencies in a single layer with cleanup\nRUN --mount=type=cache,target=/var/cache/apt,sharing=locked \\\n    --mount=type=cache,target=/var/lib/apt,sharing=locked \\\n    set -eux \\\n    && rm -f /etc/apt/apt.conf.d/docker-clean \\\n    && echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' >/etc/apt/apt.conf.d/keep-cache \\\n    && apt-get update \\\n    && apt-get install -y --no-install-recommends \\\n        python3-pip \\\n        python3-dev \\\n        git \\\n        ca-certificates \\\n        curl \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\n# --- CPU-only (portable x86_64) ---\nFROM python:${PY_VER}-slim AS base-cpu\nENV UV_EXTRA=cpu\n\n# Install system dependencies in a single layer with cleanup\nRUN --mount=type=cache,target=/var/cache/apt,sharing=locked \\\n    --mount=type=cache,target=/var/lib/apt,sharing=locked \\\n    set -eux \\\n    && rm -f /etc/apt/apt.conf.d/docker-clean \\\n    && echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' >/etc/apt/apt.conf.d/keep-cache \\\n    && apt-get update \\\n    && apt-get install -y --no-install-recommends 
\\\n        git \\\n        ca-certificates \\\n        curl \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\n\n##############################################################\n# UV stage\n##############################################################\n\nARG UV_VERSION\nFROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv-bin\n\n##############################################################\n# Shared app base stage\n##############################################################\n\nFROM base-${BACKEND} AS app-base\n\nARG PY_VER\nARG BACKEND\nARG USERNAME\nARG USER_UID\nARG USER_GID\nARG UV_VERSION\nARG UV_EXTRA\n\nENV BACKEND=${BACKEND} \\\n    DEBIAN_FRONTEND=noninteractive \\\n    PYTHONDONTWRITEBYTECODE=1 \\\n    PYTHONUNBUFFERED=1\n\n# System dependencies for audio processing\nARG DEPENDENCIES=\" \\\n    libsox-dev \\\n    build-essential \\\n    cmake \\\n    libasound-dev \\\n    portaudio19-dev \\\n    libportaudio2 \\\n    libportaudiocpp0 \\\n    ffmpeg\"\n\n# Install system dependencies with caching and cleanup\nRUN --mount=type=cache,target=/var/cache/apt,sharing=locked \\\n    --mount=type=cache,target=/var/lib/apt,sharing=locked \\\n    set -eux \\\n    && rm -f /etc/apt/apt.conf.d/docker-clean \\\n    && echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' >/etc/apt/apt.conf.d/keep-cache \\\n    && apt-get update \\\n    && apt-get install -y --no-install-recommends ${DEPENDENCIES} \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\n# Install specific uv version\nCOPY --from=uv-bin /uv /uvx /bin/\n\n# RUN groupadd --gid ${USER_GID} ${USERNAME} \\\n#     && useradd --uid ${USER_UID} --gid ${USER_GID} -m ${USERNAME} \\\n#     && mkdir -p /app /home/${USERNAME}/.cache \\\n#     && chown -R ${USERNAME}:${USERNAME} /app /home/${USERNAME}/.cache\n\n# Create non-root user (or use existing user)\nRUN set -eux; \\\n    if getent group ${USER_GID} >/dev/null 2>&1; then \\\n        echo \"Group ${USER_GID} already exists\"; \\\n    
else \\\n        groupadd -g ${USER_GID} ${USERNAME}; \\\n    fi; \\\n    if id -u ${USER_UID} >/dev/null 2>&1; then \\\n        echo \"User ${USER_UID} already exists, using existing user\"; \\\n        EXISTING_USER=$(id -un ${USER_UID}); \\\n        mkdir -p /app /home/${EXISTING_USER}/.cache; \\\n        chown -R ${USER_UID}:${USER_GID} /app /home/${EXISTING_USER}/.cache; \\\n    else \\\n        useradd -m -u ${USER_UID} -g ${USER_GID} ${USERNAME}; \\\n        mkdir -p /app /home/${USERNAME}/.cache; \\\n        chown -R ${USERNAME}:${USERNAME} /app /home/${USERNAME}/.cache; \\\n    fi\n\n# Create references directory with proper permissions for the non-root user\nRUN mkdir -p /app/references \\\n    && chown -R ${USER_UID}:${USER_GID} /app/references \\\n    && chmod 755 /app/references\n\n# Set working directory\nWORKDIR /app\n\n# Copy dependency files first for better caching\nCOPY --chown=${USER_UID}:${USER_GID} pyproject.toml uv.lock README.md ./\n\n# Switch to non-root user for package installation\nUSER ${USER_UID}:${USER_GID}\n\n# Install Python dependencies (cacheable by lockfiles)\n# Use a generic cache path that works regardless of username\nRUN --mount=type=cache,target=/tmp/uv-cache,uid=${USER_UID},gid=${USER_GID} \\\n    uv python pin ${PY_VER} \\\n    && uv sync --extra ${UV_EXTRA} --frozen --no-install-project\n\n# Copy application code\nCOPY --chown=${USER_UID}:${USER_GID} . .\n\n# Install the local package after copying source code\nRUN uv sync --extra ${UV_EXTRA} --frozen\n\n# Create common entrypoint script\nRUN printf '%s\\n' \\\n    '#!/bin/bash' \\\n    'set -euo pipefail' \\\n    '' \\\n    '# Set user info from build args' \\\n    'USER_UID='${USER_UID} \\\n    'USER_GID='${USER_GID} \\\n    '' \\\n    '# Logging function' \\\n    'log() { echo \"[$(date +\"%Y-%m-%d %H:%M:%S\")] $*\" >&2; }' \\\n    '' \\\n    '# Validate environment' \\\n    'validate_env() {' \\\n    '    if [ ! 
-d \"/app/checkpoints\" ]; then' \\\n    '        log \"WARNING: /app/checkpoints directory not found. Please mount your checkpoints.\"' \\\n    '    fi' \\\n    '    if [ ! -d \"/app/references\" ]; then' \\\n    '        log \"WARNING: /app/references directory not found. Please mount your references.\"' \\\n    '    else' \\\n    '        # Check if we can write to references directory' \\\n    '        if [ ! -w \"/app/references\" ]; then' \\\n    '            log \"ERROR: Cannot write to /app/references directory. Please ensure the mounted directory has proper permissions for user with UID ${USER_UID}.\"' \\\n    '            log \"You can fix this by running: sudo chown -R ${USER_UID}:${USER_GID} /path/to/your/references\"' \\\n    '            exit 1' \\\n    '        fi' \\\n    '    fi' \\\n    '}' \\\n    '' \\\n    '# Build device arguments' \\\n    'build_device_args() {' \\\n    '    if [ \"${BACKEND:-}\" = \"cpu\" ]; then' \\\n    '        echo \"--device cpu\"' \\\n    '    fi' \\\n    '}' \\\n    '' \\\n    '# Build compile arguments' \\\n    'build_compile_args() {' \\\n    '    if [ \"${1:-}\" = \"compile\" ] || [ \"${COMPILE:-}\" = \"1\" ] || [ \"${COMPILE:-}\" = \"true\" ]; then' \\\n    '        echo \"--compile\"' \\\n    '        shift' \\\n    '    fi' \\\n    '    echo \"$@\"' \\\n    '}' \\\n    '' \\\n    '# Health check function' \\\n    'health_check() {' \\\n    '    local port=${1:-7860}' \\\n    '    local endpoint=${2:-/health}' \\\n    '    curl -f http://localhost:${port}${endpoint} 2>/dev/null || exit 1' \\\n    '}' \\\n    > /app/common.sh && chmod +x /app/common.sh\n\n##############################################################\n# App stages\n##############################################################\n\n# Gradio WebUI\nFROM app-base AS webui\nENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1\n\nARG GRADIO_SERVER_NAME=\"0.0.0.0\"\nARG GRADIO_SERVER_PORT=7860\nARG LLAMA_CHECKPOINT_PATH=\"checkpoints/s2-pro\"\nARG 
DECODER_CHECKPOINT_PATH=\"checkpoints/s2-pro/codec.pth\"\nARG DECODER_CONFIG_NAME=\"modded_dac_vq\"\n\n\n# Expose port\nEXPOSE ${GRADIO_SERVER_PORT}\n\n# Set environment variables\nENV GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME}\nENV GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT}\nENV LLAMA_CHECKPOINT_PATH=${LLAMA_CHECKPOINT_PATH}\nENV DECODER_CHECKPOINT_PATH=${DECODER_CHECKPOINT_PATH}\nENV DECODER_CONFIG_NAME=${DECODER_CONFIG_NAME}\n\n# Create webui entrypoint\nRUN printf '%s\\n' \\\n    '#!/bin/bash' \\\n    'source /app/common.sh' \\\n    '' \\\n    'log \"Starting Fish Speech WebUI...\"' \\\n    'validate_env' \\\n    '' \\\n    'DEVICE_ARGS=$(build_device_args)' \\\n    'COMPILE_ARGS=$(build_compile_args \"$@\")' \\\n    '' \\\n    'log \"Device args: ${DEVICE_ARGS:-none}\"' \\\n    'log \"Compile args: ${COMPILE_ARGS}\"' \\\n    'log \"Server: ${GRADIO_SERVER_NAME}:${GRADIO_SERVER_PORT}\"' \\\n    '' \\\n    'exec uv run tools/run_webui.py \\' \\\n    '  --llama-checkpoint-path \"${LLAMA_CHECKPOINT_PATH}\" \\' \\\n    '  --decoder-checkpoint-path \"${DECODER_CHECKPOINT_PATH}\" \\' \\\n    '  --decoder-config-name \"${DECODER_CONFIG_NAME}\" \\' \\\n    '  ${DEVICE_ARGS} ${COMPILE_ARGS}' \\\n    > /app/start_webui.sh && chmod +x /app/start_webui.sh\n\n# Health check\nHEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \\\n    CMD curl -f http://localhost:${GRADIO_SERVER_PORT}/health || exit 1\n\nENTRYPOINT [\"/app/start_webui.sh\"]\n\n# API Server\nFROM app-base AS server\nENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1\n\nARG API_SERVER_NAME=\"0.0.0.0\"\nARG API_SERVER_PORT=8080\nARG LLAMA_CHECKPOINT_PATH=\"checkpoints/s2-pro\"\nARG DECODER_CHECKPOINT_PATH=\"checkpoints/s2-pro/codec.pth\"\nARG DECODER_CONFIG_NAME=\"modded_dac_vq\"\n\n# Expose port\nEXPOSE ${API_SERVER_PORT}\n\n# Set environment variables\nENV API_SERVER_NAME=${API_SERVER_NAME}\nENV API_SERVER_PORT=${API_SERVER_PORT}\nENV LLAMA_CHECKPOINT_PATH=${LLAMA_CHECKPOINT_PATH}\nENV 
DECODER_CHECKPOINT_PATH=${DECODER_CHECKPOINT_PATH}\nENV DECODER_CONFIG_NAME=${DECODER_CONFIG_NAME}\n\n# Create server entrypoint\nRUN printf '%s\\n' \\\n    '#!/bin/bash' \\\n    'source /app/common.sh' \\\n    '' \\\n    'log \"Starting Fish Speech API Server...\"' \\\n    'validate_env' \\\n    '' \\\n    'DEVICE_ARGS=$(build_device_args)' \\\n    'COMPILE_ARGS=$(build_compile_args \"$@\")' \\\n    '' \\\n    'log \"Device args: ${DEVICE_ARGS:-none}\"' \\\n    'log \"Compile args: ${COMPILE_ARGS}\"' \\\n    'log \"Server: ${API_SERVER_NAME}:${API_SERVER_PORT}\"' \\\n    '' \\\n    'exec uv run tools/api_server.py \\' \\\n    '  --listen \"${API_SERVER_NAME}:${API_SERVER_PORT}\" \\' \\\n    '  --llama-checkpoint-path \"${LLAMA_CHECKPOINT_PATH}\" \\' \\\n    '  --decoder-checkpoint-path \"${DECODER_CHECKPOINT_PATH}\" \\' \\\n    '  --decoder-config-name \"${DECODER_CONFIG_NAME}\" \\' \\\n    '  ${DEVICE_ARGS} ${COMPILE_ARGS}' \\\n    > /app/start_server.sh && chmod +x /app/start_server.sh\n\n# Health check\nHEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \\\n    CMD curl -f http://localhost:${API_SERVER_PORT}/v1/health || exit 1\n\nENTRYPOINT [\"/app/start_server.sh\"]\n\n# Development stage\nFROM app-base AS dev\nUSER root\n\n# Install development tools\nRUN --mount=type=cache,target=/var/cache/apt,sharing=locked \\\n    --mount=type=cache,target=/var/lib/apt,sharing=locked \\\n    apt-get update \\\n    && apt-get install -y --no-install-recommends \\\n        vim \\\n        htop \\\n        strace \\\n        gdb \\\n    && apt-get clean \\\n    && rm -rf /var/lib/apt/lists/*\n\nUSER ${USER_UID}:${USER_GID}\n\n# Install development dependencies\nRUN uv sync --extra ${UV_EXTRA} --dev\n\n# Default to bash for development\nENTRYPOINT [\"/bin/bash\"]\n"
  },
  {
    "path": "dockerfile.dev",
    "content": "ARG VERSION=dev\nARG BASE_IMAGE=ghcr.io/fishaudio/fish-speech:${VERSION}\n\nFROM ${BASE_IMAGE}\n\nARG TOOLS=\"               \\\n        git               \\\n        curl              \\\n        build-essential   \\\n        ffmpeg            \\\n        libsm6            \\\n        libxext6          \\\n        libjpeg-dev       \\\n        zlib1g-dev        \\\n        aria2             \\\n        zsh               \\\n        openssh-server    \\\n        sudo              \\\n        protobuf-compiler \\\n        libasound-dev     \\\n        portaudio19-dev   \\\n        libportaudio2     \\\n        libportaudiocpp0  \\\n        cmake\"\n\nRUN --mount=type=cache,target=/var/cache/apt,sharing=locked \\\n    --mount=type=cache,target=/var/lib/apt,sharing=locked \\\n    set -ex \\\n    && apt-get update \\\n    && apt-get -y install --no-install-recommends ${TOOLS}\n\n# Install oh-my-zsh so your terminal looks nice\nRUN sh -c \"$(curl https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)\" \"\" --unattended\n\n# Set zsh as default shell\nRUN chsh -s /usr/bin/zsh\nENV SHELL=/usr/bin/zsh\n"
  },
  {
    "path": "docs/CNAME",
    "content": "speech.fish.audio\n"
  },
  {
    "path": "docs/README.ar.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | **العربية** <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n 
   </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **إشعار الترخيص**\n> يتم إصدار قاعدة الأكواد هذه وأوزان النماذج المرتبطة بها تحت **[FISH AUDIO RESEARCH LICENSE](../LICENSE)**. يرجى الرجوع إلى ملف [LICENSE](../LICENSE) لمزيد من التفاصيل.\n\n\n> [!WARNING]\n> **إخلاء المسؤولية القانونية**\n> نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لقاعدة الأكواد. يرجى الرجوع إلى القوانين المحلية المتعلقة بـ DMCA والقوانين الأخرى ذات الصلة.\n\n## البداية السريعة\n\n### روابط التوثيق\n\nهذا هو التوثيق الرسمي لـ Fish Audio S2، يرجى اتباع التعليمات للبدء بسهولة.\n\n- [التثبيت](https://speech.fish.audio/ar/install/)\n- [الاستدلال عبر خط الأوامر](https://speech.fish.audio/ar/inference/)\n- [الاستدلال عبر واجهة الويب](https://speech.fish.audio/ar/inference/)\n- [استدلال الخادم](https://speech.fish.audio/ar/server/)\n- [نشر Docker](https://speech.fish.audio/ar/install/)\n\n> [!IMPORTANT]\n> **إذا كنت ترغب في استخدام خادم SGLang، فيرجى الرجوع إلى [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### دليل وكيل LLM\n\n```\nيرجى قراءة https://speech.fish.audio/ar/install/ أولاً، وتثبيت وتكوين Fish Audio S2 وفقاً للوثائق.\n```\n\n## Fish Audio S2 Pro\n**نظام تحويل النص إلى كلام (TTS) متعدد اللغات الرائد في الصناعة، والذي يعيد تعريف حدود توليد الصوت.**\n\nFish Audio S2 Pro هو أحدث طراز متعدد الوسائط تم تطويره بواسطة [Fish Audio](https://fish.audio/). تم تدريبه على أكثر من **10 ملايين ساعة** من البيانات الصوتية الهائلة، التي تغطي أكثر من **80 لغة** حول العالم. 
من خلال بنية **ثنائية الانحدار الذاتي (Dual-AR)** المبتكرة وتقنية توافق التعلم التعزيزي (RL)، يمكن لـ S2 Pro توليد كلام يتمتع بإحساس طبيعي وواقعي وعمق عاطفي كبير، مما يجعله رائداً في المنافسة بين الأنظمة المفتوحة والمغلقة المصدر.\n\nتكمن القوة الضاربة لـ S2 Pro في دعمه للتحكم الدقيق للغاية في النبرة والعاطفة على مستوى **ما دون الكلمة (Sub-word Level)** من خلال وسوم اللغة الطبيعية (مثل `[whisper]` و `[excited]` و `[angry]`) ، مع دعم أصلي لتوليد متحدثين متعددين وحوارات متعددة الجولات بسياق طويل جداً.\n\nتفضل بزيارة [موقع Fish Audio الرسمي](https://fish.audio/) الآن لتجربة العرض المباشر، أو اقرأ [تقريرنا الفني](https://arxiv.org/abs/2603.08823) و[مقال المدونة](https://fish.audio/blog/fish-audio-open-sources-s2/) للتعرف على المزيد.\n\n### متغيرات النموذج\n\n| النموذج | الحجم | التوفر | الوصف |\n|------|------|-------------|-------------|\n| S2-Pro | 4 مليار معلمة | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | النموذج الرائد كامل الميزات، مع أعلى جودة واستقرار |\n\nلمزيد من التفاصيل حول النماذج، يرجى مراجعة [التقرير الفني](https://arxiv.org/abs/2411.01156).\n\n## نتائج الاختبارات المرجعية (Benchmarks)\n\n| الاختبار | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (الصينية) | **0.54%** (الأفضل إجمالاً) |\n| Seed-TTS Eval — WER (الإنجليزية) | **0.99%** (الأفضل إجمالاً) |\n| Audio Turing Test (مع التعليمات) | **0.515** متوسط خلفي (Posterior mean) |\n| EmergentTTS-Eval — معدل الفوز | **81.88%** (الأعلى إجمالاً) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — الجودة | **4.51 / 5.0** |\n| متعدد اللغات (MiniMax Testset) — أفضل WER | **11** لغة من أصل **24** |\n| متعدد اللغات (MiniMax Testset) — أفضل SIM | **17** لغة من أصل **24** |\n\nفي تقييم Seed-TTS، حقق S2 أقل معدل خطأ في الكلمات (WER) بين جميع النماذج التي تم تقييمها (بما في ذلك الأنظمة مغلقة المصدر): Qwen3-TTS (0.77/1.24)، و MiniMax Speech-02 (0.99/1.90)، و Seed-TTS (1.12/2.25). 
وفي اختبار Audio Turing Test، سجل S2 قيمة 0.515 بزيادة قدرها 24% مقارنة بـ Seed-TTS (0.417) و 33% مقارنة بـ MiniMax-Speech (0.387). وفي EmergentTTS-Eval، تميز S2 بشكل خاص في أبعاد مثل اللغويات المصاحبة (معدل فوز 91.61%)، والجمل الاستفهامية (84.41%)، والتعقيد النحوي (83.39%).\n\n## أبرز المميزات\n\n<img src=\"./assets/totalability.png\" width=200%>\n\n### تحكم دقيق للغاية عبر اللغة الطبيعية\n\nيمنح S2 Pro الصوت \"روحاً\" لا مثيل لها. من خلال صيغة `[tag]` البسيطة، يمكنك تضمين تعليمات عاطفية بدقة في أي موضع من النص.\n- **دعم أكثر من 15,000 وسم فريد**: لا يقتصر على الإعدادات المسبقة الثابتة، بل يدعم **أوصاف النص الحر**. يمكنك تجربة `[whisper in small voice]` (همس بصوت منخفض)، أو `[professional broadcast tone]` (نبرة إذاعية احترافية)، أو `[pitch up]` (رفع طبقة الصوت).\n- **مكتبة عواطف غنية**:\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### بنية مبتكرة ثنائية الانحدار الذاتي (Dual-Autoregressive)\n\nيعتمد S2 Pro بنية Dual-AR بنظام \"رئيسي-تابع\"، تتكون من Decoder-only Transformer وترميز صوتي RVQ (10 قواميس أكواد، بمعدل إطارات يبلغ حوالي 21 هرتز):\n\n- **Slow AR (4 مليار معلمة)**: يعمل على طول المحور الزمني، ويتنبأ بقاموس الأكواد الدلالي الأساسي.\n- **Fast AR (400 مليون معلمة)**: يولد الـ 9 قواميس المتبقية في كل خطوة زمنية، لاستعادة أدق التفاصيل الصوتية ببراعة.\n\nيحقق هذا التصغير غير المتماثل أقصى درجات الدقة الصوتية مع زيادة سرعة الاستدلال بشكل كبير.\n\n### توافق التعلم التعزيزي (RL Alignment)\n\nيستخدم S2 Pro تقنية **Group Relative Policy Optimization (GRPO)** للتوافق بعد التدريب. 
نستخدم نفس مجموعة النماذج المستخدمة في تنظيف البيانات وتصنيفها مباشرة كنماذج مكافأة (Reward Model)، مما يحل بشكل مثالي مشكلة عدم التطابق بين توزيع بيانات ما قبل التدريب وأهداف ما بعد التدريب.\n- **إشارات مكافأة متعددة الأبعاد**: تقييم شامل للدقة الدلالية، والقدرة على اتباع التعليمات، وتسجيل التفضيل الصوتي، وتماثل نبرة الصوت، لضمان أن كل ثانية من الكلام المولد تتوافق مع الحدس البشري.\n\n### أداء استدلال تدفقي فائق (يعتمد على SGLang)\n\nنظراً لأن بنية Dual-AR تتماثل هيكلياً مع بنية LLM القياسية، فإن S2 Pro يدعم أصلاً جميع ميزات تسريع الاستدلال في SGLang، بما في ذلك الدفعات المستمرة (Continuous Batching)، و Paged KV Cache، و CUDA Graph، والتخزين المؤقت للبادئة القائم على RadixAttention.\n\n**أداء وحدة معالجة رسومات NVIDIA H200 واحدة:**\n- **عامل الوقت الحقيقي (RTF)**: 0.195\n- **تأخر الصوت الأول (TTFA)**: حوالي 100 مللي ثانية\n- **إنتاجية فائقة السرعة**: تصل إلى 3000+ وسم صوتي/ثانية مع الحفاظ على RTF < 0.5\n\n### دعم قوي للغات المتعددة\n\nيدعم S2 Pro أكثر من 80 لغة، مما يتيح تركيباً عالياً الجودة دون الحاجة إلى وحدات صوتية (phonemes) أو معالجة محددة لكل لغة:\n\n- **المستوى الأول (Tier 1)**: اليابانية (ja)، الإنجليزية (en)، الصينية (zh)\n- **المستوى الثاني (Tier 2)**: الكورية (ko)، الإسبانية (es)، البرتغالية (pt)، العربية (ar)، الروسية (ru)، الفرنسية (fr)، الألمانية (de)\n- **تغطية عالمية**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo والمزيد.\n\n### توليد متحدثين متعددين أصلي\n\n<img src=\"./assets/chattemplate.png\" width=200%>\n\nيسمح Fish Audio S2 للمستخدمين بتحميل عينة مرجعية تحتوي على متحدثين متعددين، وسيقوم النموذج بمعالجة ميزات كل متحدث عبر وسم `<|speaker:i|>`. بعد ذلك، يمكنك التحكم في أداء النموذج عبر وسم معرف المتحدث، مما يتيح لتوليد واحد أن يتضمن متحدثين متعددين. 
لم تعد هناك حاجة لتحميل عينة مرجعية منفصلة وتوليد صوت لكل متحدث على حدة كما كان في السابق.\n\n### توليد حوارات متعددة الجولات\n\nبفضل توسيع سياق النموذج، يمكن لنموذجنا الآن الاستفادة من المعلومات السابقة لتحسين التعبير في المحتوى المولد لاحقاً، مما يعزز من طبيعية المحتوى.\n\n### استنساخ الصوت السريع\n\nيدعم Fish Audio S2 استنساخاً دقيقاً للصوت باستخدام عينات مرجعية قصيرة (عادةً 10-30 ثانية). يلتقط النموذج نبرة الصوت وأسلوب الكلام والميول العاطفية، مما يولد أصواتاً مستنسخة واقعية ومتسقة دون الحاجة إلى ضبط دقيق إضافي.\nلاستخدام خادم SGLang، يرجى الرجوع إلى [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).\n\n---\n\n## شكر وتقدير\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## التقرير الفني\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      
url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/README.ja.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | **日本語** | [한국어](README.ko.md) | [العربية](README.ar.md) <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n 
   </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **ライセンス注意事項**\n> このコードベースおよび関連するモデルウェイトは **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** の下でリリースされています。詳細については [LICENSE](../LICENSE) をご参照ください。\n\n\n> [!WARNING]\n> **法的免責事項**\n> 私たちはコードベースの不法な使用について一切の責任を負いません。DMCA 及びその他の関連法律について、現地の法律をご参照ください。\n\n## クイックスタート\n\n### ドキュメント入口\n\nFish Audio S2 の公式ドキュメントです。以下からすぐに始められます。\n\n- [インストール](https://speech.fish.audio/ja/install/)\n- [コマンドライン推論](https://speech.fish.audio/ja/inference/)\n- [WebUI 推論](https://speech.fish.audio/ja/inference/)\n- [サーバー推論](https://speech.fish.audio/ja/server/)\n- [Docker デプロイ](https://speech.fish.audio/ja/install/)\n\n> [!IMPORTANT]\n> **SGLang サーバーについては [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) を参照してください。**\n\n### LLM Agent 指南\n\n```\nhttps://speech.fish.audio/ja/install/ の手順に従って、Fish Audio S2 をインストール・設定してください。\n```\n\n## Fish Audio S2 Pro\n**業界最先端の多言語テキスト読み上げ (TTS) システム。音声生成の限界を再定義します。**\n\nFish Audio S2 Pro は [Fish Audio](https://fish.audio/) が開発した最高峰のマルチモーダルモデルです。世界 **80 言語以上**、**1,000 万時間** を超える膨大な音声データで学習されています。革新的な **二重自己回帰 (Dual-AR)** アーキテクチャと強化学習 (RL) アライメント技術を組み合わせることで、極めて自然でリアル、かつ感情豊かな音声を生成し、オープンソースおよびクローズドソースの双方でリーダーシップを発揮しています。\n\nS2 Pro の最大の特徴は、自然言語タグ（例：`[whisper]`、`[excited]`、`[angry]`）による韻律や感情の **サブワードレベル (Sub-word Level)** での極めて細やかなインライン制御が可能である点です。また、マルチスピーカー生成や長文コンテキストのマルチターン対話生成にもネイティブ対応しています。\n\n今すぐ [Fish Audio 公式サイト](https://fish.audio/) でプレイグラウンドを体験するか、[技術レポート](https://arxiv.org/abs/2603.08823) や 
[ブログ記事](https://fish.audio/blog/fish-audio-open-sources-s2/) を読んで詳細を確認してください。\n\n### モデルバリアント\n\n| モデル | サイズ | 利用可能性 | 説明 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B パラメータ | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 品質と安定性を最大化した、フル機能のフラッグシップモデル |\n\nモデルの詳細は[技術レポート](https://arxiv.org/abs/2411.01156)をご参照ください。\n\n## ベンチマーク結果\n\n| ベンチマーク | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER（中国語） | **0.54%**（全体最良） |\n| Seed-TTS Eval — WER（英語） | **0.99%**（全体最良） |\n| Audio Turing Test（指示あり） | **0.515** 事後平均値 |\n| EmergentTTS-Eval — 勝率 | **81.88%**（全体最高） |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 品質 | **4.51 / 5.0** |\n| 多言語（MiniMax Testset）— 最良 WER | **24 言語中 11 言語** |\n| 多言語（MiniMax Testset）— 最良 SIM | **24 言語中 17 言語** |\n\nSeed-TTS Eval では、S2 はクローズドソースを含む全評価モデルの中で最小 WER を達成しました：Qwen3-TTS（0.77/1.24）、MiniMax Speech-02（0.99/1.90）、Seed-TTS（1.12/2.25）。Audio Turing Test では 0.515 を記録し、Seed-TTS（0.417）比で 24%、MiniMax-Speech（0.387）比で 33% 上回りました。EmergentTTS-Eval では、副言語情報（91.61%）、疑問文（84.41%）、統語的複雑性（83.39%）で特に高い成績を示しています。\n\n## ハイライト\n\n<img src=\"./assets/totalability.png\" width=200%>\n\n### 自然言語による細粒度インライン制御\n\nS2 Pro は音声にこれまでにない「魂」を宿らせます。シンプルな `[tag]` 構文を使用して、テキスト内の任意の場所に感情の指示を正確に埋め込むことができます。\n- **1万5,000以上のユニークタグに対応**：固定のプリセットに限定されず、**自由形式のテキスト記述** をサポートします。`[whisper in small voice]` (ささやき声で), `[professional broadcast tone]` (プロのナレーション風), `[pitch up]` (ピッチを上げる) などを試してみてください。\n- **豊富な感情ライブラリ**:\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### 革新的な二重自己回帰 (Dual-Autoregressive) アーキテクチャ\n\nS2 Pro 
は、Decoder-only Transformer と RVQ オーディオコーデック（10 コードブック、約 21 Hz）で構成されるマスター・スレーブ型の Dual-AR アーキテクチャを採用しています：\n\n- **Slow AR (4B パラメータ)**: 時間軸方向に動作し、核となるセマンティックコードブックを予測。\n- **Fast AR (400M パラメータ)**: 各時間ステップで残り 9 個の残差コードブックを生成し、極めて繊細な音響ディテールを復元。\n\nこの非対称設計により、究極のオーディオ忠実度を維持しながら、推論速度を大幅に向上させています。\n\n### 強化学習 (RL) アライメント\n\nS2 Pro は、事後学習アライメントに **Group Relative Policy Optimization (GRPO)** 技術を採用しています。データのクリーニングとアノテーションに使用したモデルセットをそのまま報酬モデル (Reward Model) として使用することで、事前学習データの分布と事後学習の目標との間のミスマッチを完璧に解決しました。\n- **多次元の報酬信号**: 意味の正確性、指示追従性、音響的な好み、音色の類似性を総合的に評価し、生成される一秒一秒の音声が人間の直感に沿うようにしています。\n\n### SGLang による究極のストリーミング推論性能\n\nDual-AR アーキテクチャは標準的な LLM 構造と同型であるため、S2 Pro は SGLang のすべての推論加速機能をネイティブにサポートしています。これには、Continuous Batching、Paged KV Cache、CUDA Graph、RadixAttention ベースの Prefix Caching が含まれます。\n\n**NVIDIA H200 GPU 1枚でのパフォーマンス表現:**\n- **リアルタイム係数 (RTF)**: 0.195\n- **初回音声出力までの時間 (TTFA)**: 約 100 ms\n- **極速スループット**: RTF < 0.5 を維持しつつ 3,000+ acoustic tokens/s\n\n### 強力な多言語サポート\n\nS2 Pro は 80 以上の言語をサポートしており、音素や特定の言語に対する前処理なしで高品質な合成を実現します：\n\n- **第1層 (Tier 1)**: 日本語 (ja), 英語 (en), 中国語 (zh)\n- **第2層 (Tier 2)**: 韓国語 (ko), スペイン語 (es), ポルトガル語 (pt), アラビア語 (ar), ロシア語 (ru), フランス語 (fr), ドイツ語 (de)\n- **グローバルカバレッジ**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo など。\n\n### ネイティブなマルチスピーカー生成\n\n<img src=\"./assets/chattemplate.png\" width=200%>\n\nFish Audio S2 では、複数のスピーカーを含む参照オーディオをアップロードでき、モデルは `<|speaker:i|>` トークンを介して各スピーカーの特徴を処理します。スピーカー ID トークンを使用してモデルの出力を制御することで、1回の生成に複数のスピーカーを混在させることが可能です。個別のスピーカーごとに参照オーディオをアップロードし直す手間はもう不要です。\n\n### マルチターン対話生成\n\nコンテキストの拡張により、以前のターンの情報を利用して後続の生成内容の表現力を高めることができ、対話としての自然さが大幅に向上しました。\n\n### 高速音声クローニング\n\nFish Audio S2 は、短い参照サンプル（通常 10〜30 
秒）を使用した正確な音声クローニングをサポートしています。モデルは音色、話し方、感情を捉え、追加の微調整なしでリアルで一貫したクローン音声を生成します。\nSGLang サーバーの利用については、[SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) を参照してください。\n\n---\n\n## 謝辞\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 技術レポート\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/README.ko.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n[English](../README.md) | [简体中文](README.zh.md) | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | **한국어** | [العربية](README.ar.md) <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n 
   </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **라이선스 고지**\n> 이 코드베이스 및 관련 모델 가중치는 **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** 에 따라 배포됩니다. 자세한 내용은 [LICENSE](../LICENSE)를 참조하십시오.\n\n\n> [!WARNING]\n> **법적 면책 조항**\n> 당사는 코드베이스의 불법적인 사용에 대해 어떠한 책임도 지지 않습니다. 해당 지역의 DMCA 및 기타 관련 법률을 참조하십시오.\n\n## 빠른 시작\n\n### 문서 입구\n\nFish Audio S2의 공식 문서입니다. 지침에 따라 쉽게 시작하십시오.\n\n- [설치](https://speech.fish.audio/ko/install/)\n- [명령줄 추론](https://speech.fish.audio/ko/inference/)\n- [WebUI 추론](https://speech.fish.audio/ko/inference/)\n- [서버 추론](https://speech.fish.audio/ko/server/)\n- [Docker 배포](https://speech.fish.audio/ko/install/)\n\n> [!IMPORTANT]\n> **SGLang 서버를 사용하려면 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md)를 참조하십시오.**\n\n### LLM Agent 가이드\n\n```\n먼저 https://speech.fish.audio/ko/install/ 을 읽고 문서에 따라 Fish Audio S2를 설치 및 구성하십시오.\n```\n\n## Fish Audio S2 Pro\n**음성 생성의 경계를 재정의하는 업계 최고의 다국어 텍스트 음성 변환(TTS) 시스템.**\n\nFish Audio S2 Pro는 [Fish Audio](https://fish.audio/)에서 개발한 최첨단 멀티모달 모델입니다. 전 세계 **80개 이상의 언어**를 아우르는 **1,000만 시간** 이상의 방대한 오디오 데이터로 학습되었습니다. 혁신적인 **이중 자기회귀(Dual-AR)** 아키텍처와 강화 학습(RL) 정렬 기술을 통해 S2 Pro는 극도로 자연스럽고 사실적이며 감정이 풍부한 음성을 생성하며, 오픈 소스와 클로즈드 소스 경쟁 모두에서 선두를 달리고 있습니다.\n\nS2 Pro의 핵심 강점은 자연어 태그(예: `[whisper]`, `[excited]`, `[angry]`)를 통해 운율과 감정을 **하위 단어 수준(Sub-word Level)**에서 매우 세밀하게 인라인 제어할 수 있다는 점입니다. 
또한 다중 화자 생성 및 긴 컨텍스트의 다중 턴 대화 생성을 기본적으로 지원합니다.\n\n지금 바로 [Fish Audio 공식 웹사이트](https://fish.audio/)에서 온라인 데모를 체험하거나, [기술 보고서](https://arxiv.org/abs/2603.08823) 및 [블로그 게시물](https://fish.audio/blog/fish-audio-open-sources-s2/)을 통해 자세히 알아보십시오.\n\n### 모델 변체\n\n| 모델 | 크기 | 가용성 | 설명 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B 파라미터 | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 최고의 품질과 안정성을 갖춘 모든 기능을 갖춘 플래그십 모델 |\n\n모델에 대한 자세한 내용은 [기술 보고서](https://arxiv.org/abs/2411.01156)를 참조하십시오.\n\n## 벤치마크 결과\n\n| 벤치마크 | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER(중국어) | **0.54%** (전체 최고) |\n| Seed-TTS Eval — WER(영어) | **0.99%** (전체 최고) |\n| Audio Turing Test (지침 포함) | **0.515** 후험 평균 |\n| EmergentTTS-Eval — 승률 | **81.88%** (전체 최고) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 품질 | **4.51 / 5.0** |\n| 다국어 (MiniMax Testset) — 최고 WER | **24개 언어 중 11개** |\n| 다국어 (MiniMax Testset) — 최고 SIM | **24개 언어 중 17개** |\n\nSeed-TTS Eval에서 S2는 클로즈드 소스 시스템을 포함한 모든 평가 모델 중 가장 낮은 WER을 달성했습니다: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). Audio Turing Test에서 S2의 0.515는 Seed-TTS (0.417) 대비 24%, MiniMax-Speech (0.387) 대비 33% 향상된 수치입니다. EmergentTTS-Eval에서 S2는 부차 언어학(91.61% 승률), 의문문(84.41%), 구문 복잡성(83.39%) 등의 측면에서 특히 두드러진 성과를 보였습니다.\n\n## 하이라이트\n\n<img src=\"./assets/totalability.png\" width=200%>\n\n### 자연어를 통한 초미세 인라인 제어\n\nS2 Pro는 음성에 전례 없는 \"영혼\"을 부여합니다. 간단한 `[tag]` 구문을 사용하여 텍스트의 어느 위치에나 감정 지침을 정확하게 삽입할 수 있습니다.\n- **15,000개 이상의 고유 태그 지원**: 고정된 사전 설정에 국한되지 않고 **자유 형식의 텍스트 설명**을 지원합니다. 
`[whisper in small voice]` (작은 목소리로 속삭임), `[professional broadcast tone]` (전문 방송 톤), `[pitch up]` (음높이 높임) 등을 시도해 보십시오.\n- **풍부한 감정 라이브러리**:\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### 혁신적인 이중 자기회귀 (Dual-Autoregressive) 아키텍처\n\nS2 Pro는 Decoder-only Transformer와 RVQ 오디오 코덱(10개 코드북, 약 21Hz 프레임 속도)으로 구성된 마스터-슬레이브 방식의 Dual-AR 아키텍처를 채택했습니다.\n\n- **Slow AR (4B 파라미터)**: 시간 축을 따라 작동하며 핵심 의미 코드북을 예측합니다.\n- **Fast AR (400M 파라미터)**: 각 타임스텝에서 나머지 9개의 잔차 코드북을 생성하여 극도로 정교한 음향 세부 사항을 복원합니다.\n\n이러한 비대칭 설계는 오디오의 최고 충실도를 보장하는 동시에 추론 속도를 대폭 향상시킵니다.\n\n### 강화 학습 (RL) 정렬\n\nS2 Pro는 사후 학습 정렬을 위해 **Group Relative Policy Optimization (GRPO)** 기술을 채택했습니다. 
데이터 정제 및 주석 처리에 사용된 것과 동일한 모델 세트를 보상 모델(Reward Model)로 직접 사용함으로써 사전 학습 데이터 분포와 사후 학습 목표 간의 불일치 문제를 완벽하게 해결했습니다.\n- **다차원 보상 신호**: 의미 체계의 정확성, 지침 준수 능력, 음향 선호도 점수 및 음색 유사성을 종합적으로 평가하여 생성된 음성의 매초가 인간의 직관에 부합하도록 보장합니다.\n\n### SGLang 기반의 극한 스트리밍 추론 성능\n\nDual-AR 아키텍처는 표준 LLM 구조와 동형이므로 S2 Pro는 Continuous Batching, Paged KV Cache, CUDA Graph 및 RadixAttention 기반 Prefix Caching을 포함한 SGLang의 모든 추론 가속 기능을 기본적으로 지원합니다.\n\n**단일 NVIDIA H200 GPU 성능 지표:**\n- **실시간 계수 (RTF)**: 0.195\n- **첫 음성 지연 (TTFA)**: 약 100 ms\n- **초고속 처리량**: RTF < 0.5 유지 시 처리량 3,000+ acoustic tokens/s 달성\n\n### 강력한 다국어 지원\n\nS2 Pro는 음소나 특정 언어 처리가 필요 없는 고품질 합성을 80개 이상의 언어에서 지원합니다.\n\n- **1계층 (Tier 1)**: 일본어 (ja), 영어 (en), 중국어 (zh)\n- **2계층 (Tier 2)**: 한국어 (ko), 스페인어 (es), 포르투갈어 (pt), 아랍어 (ar), 러시아어 (ru), 프랑스어 (fr), 독일어 (de)\n- **글로벌 커버리지**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo 등.\n\n### 네이티브 다중 화자 생성\n\n<img src=\"./assets/chattemplate.png\" width=200%>\n\nFish Audio S2를 사용하면 사용자가 여러 화자가 포함된 참조 오디오를 업로드할 수 있으며, 모델은 `<|speaker:i|>` 토큰을 통해 각 화자의 특징을 처리합니다. 이후 화자 ID 토큰을 사용하여 모델의 표현을 제어함으로써 한 번의 생성에 여러 화자를 포함할 수 있습니다. 더 이상 화자마다 별도의 참조 오디오를 업로드하고 음성을 생성할 필요가 없습니다.\n\n### 다중 턴 대화 생성\n\n모델 컨텍스트 확장에 힘입어 이제 이전 정보의 도움을 받아 후속 생성 내용의 표현력을 높이고 콘텐츠의 자연스러움을 향상시킬 수 있습니다.\n\n### 고속 음성 복제\n\nFish Audio S2는 짧은 참조 샘플(보통 10-30초)을 사용한 정확한 음성 복제를 지원합니다. 
모델은 음색, 말하기 스타일 및 감정적 경향을 포착하여 추가적인 미세 조정 없이도 사실적이고 일관된 복제 음성을 생성합니다.\nSGLang 서버 사용에 대해서는 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md)를 참조하십시오.\n\n---\n\n## 감사의 말\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 기술 보고서\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/README.pt-BR.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n[English](../README.md) | [简体中文](README.zh.md) | **Portuguese** | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    
</a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **Aviso de Licença**\n> Este repositório de código e seus pesos de modelo associados são lançados sob a **[FISH AUDIO RESEARCH LICENSE](../LICENSE)**. Consulte [LICENSE](../LICENSE) para obter mais detalhes.\n\n\n> [!WARNING]\n> **Aviso Legal**\n> Não nos responsabilizamos por qualquer uso ilegal deste repositório. Consulte as leis locais sobre DMCA e outras regulamentações relevantes.\n\n## Início Rápido\n\n### Links da Documentação\n\nEsta é a documentação oficial do Fish Audio S2, siga as instruções para começar facilmente.\n\n- [Instalação](https://speech.fish.audio/install/)\n- [Inferência por Linha de Comando](https://speech.fish.audio/inference/)\n- [Inferência por WebUI](https://speech.fish.audio/inference/)\n- [Inferência por Servidor](https://speech.fish.audio/server/)\n- [Implantação Docker](https://speech.fish.audio/install/)\n\n> [!IMPORTANT]\n> **Caso deseje utilizar o SGLang Server, consulte o [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### Guia para Agentes de LLM\n\n```\nLeia primeiro https://speech.fish.audio/install/ e siga a documentação para instalar e configurar o Fish Audio S2.\n```\n\n## Fish Audio S2 Pro\n**O sistema de conversão de texto em fala (TTS) multilíngue líder do setor, redefinindo as fronteiras da geração de voz.**\n\nFish Audio S2 Pro é o modelo multimodal mais avançado desenvolvido pela [Fish Audio](https://fish.audio/). 
Treinado em mais de **10 milhões de horas** de dados de áudio massivos, cobrindo mais de **80 idiomas** globais. Através de uma arquitetura inovadora de **Dual-Autoregressive (Dual-AR)** e tecnologia de alinhamento por aprendizado por reforço (RL), o S2 Pro é capaz de gerar fala com um senso de naturalidade, realismo e riqueza emocional extremos, liderando tanto em competições de código aberto quanto proprietário.\n\nO grande diferencial do S2 Pro reside em seu suporte para controle inline de granularidade ultra-fina de prosódia e emoção ao nível de **sub-palavra (Sub-word Level)** via tags de linguagem natural (como `[whisper]`, `[excited]`, `[angry]`), além de suporte nativo para múltiplos falantes e geração de diálogos de múltiplos turnos com contexto ultra-longo.\n\nVisite agora o [site oficial da Fish Audio](https://fish.audio/) para experimentar a demonstração online, ou leia nosso [relatório técnico](https://arxiv.org/abs/2603.08823) e [artigo no blog](https://fish.audio/blog/fish-audio-open-sources-s2/) para saber mais.\n\n### Variantes de Modelo\n\n| Modelo | Tamanho | Disponibilidade | Descrição |\n|------|------|-------------|-------------|\n| S2-Pro | 4B parâmetros | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Modelo flagship completo, com máxima qualidade e estabilidade |\n\nPara mais detalhes sobre os modelos, consulte o [relatório técnico](https://arxiv.org/abs/2411.01156).\n\n## Resultados de Benchmark\n\n| Benchmark | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (Chinês) | **0.54%** (Melhor geral) |\n| Seed-TTS Eval — WER (Inglês) | **0.99%** (Melhor geral) |\n| Audio Turing Test (Com instrução) | **0.515** Média posterior |\n| EmergentTTS-Eval — Taxa de Vitória | **81.88%** (Maior geral) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — Qualidade | **4.51 / 5.0** |\n| Multilíngue (MiniMax Testset) — Melhor WER | **11 de 24** idiomas |\n| Multilíngue (MiniMax Testset) — Melhor SIM | **17 de 
24** idiomas |\n\nNo Seed-TTS Eval, o S2 alcançou o menor WER entre todos os modelos avaliados (incluindo sistemas proprietários): Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). No Audio Turing Test, o valor de 0.515 do S2 representa um aumento de 24% em relação ao Seed-TTS (0.417) e 33% em relação ao MiniMax-Speech (0.387). No EmergentTTS-Eval, o S2 destacou-se especialmente em dimensões como paralinguística (taxa de vitória de 91.61%), frases interrogativas (84.41%) e complexidade sintática (83.39%).\n\n## Destaques\n\n<img src=\"./assets/totalability.png\" width=200%>\n\n### Controle Inline de Granularidade Ultra-Fina via Linguagem Natural\n\nS2 Pro confere à voz uma \"espiritualidade\" sem precedentes. Através de uma sintaxe simples de `[tag]`, você pode inserir instruções emocionais precisamente em qualquer posição do texto.\n- **Suporte para mais de 15.000 tags únicas**: Não limitado a predefinições fixas, suporta **descrições textuais de formato livre**. 
Você pode tentar `[whisper in small voice]` (sussurrando), `[professional broadcast tone]` (tom de locução profissional) ou `[pitch up]` (aumentar o tom).\n- **Rica biblioteca de emoções**:\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### Arquitetura Inovadora Dual-Autoregressive (Dual-AR)\n\nS2 Pro adota uma arquitetura Dual-AR mestre-escravo, consistindo de um Decoder-only Transformer e um codec de áudio RVQ (10 codebooks, cerca de 21 Hz de taxa de frames):\n\n- **Slow AR (4B parâmetros)**: Atua ao longo do eixo temporal, prevendo o codebook semântico central.\n- **Fast AR (400M parâmetros)**: Gera os 9 codebooks residuais restantes em cada passo de tempo, restaurando detalhes acústicos extremos com delicadeza.\n\nEste design assimétrico garante fidelidade extrema ao áudio enquanto aumenta significativamente a velocidade de inferência.\n\n### Alinhamento por Aprendizado por Reforço (RL Alignment)\n\nS2 Pro utiliza a tecnologia **Group Relative Policy Optimization (GRPO)** para o alinhamento pós-treinamento. 
Utilizamos o mesmo conjunto de modelos para limpeza e anotação de dados diretamente como modelos de recompensa (Reward Model), resolvendo perfeitamente o problema de descasamento entre a distribuição dos dados de pré-treinamento e os objetivos de pós-treinamento.\n- **Sinais de recompensa multidimensionais**: Avalia de forma abrangente a precisão semântica, a capacidade de seguir instruções, a pontuação de preferência acústica e a similaridade de timbre, garantindo que cada segundo de fala gerada esteja alinhado com a intuição humana.\n\n### Desempenho de Inferência de Streaming Extremo (Baseado em SGLang)\n\nComo a arquitetura Dual-AR é estruturalmente isomorfa à estrutura padrão de LLMs, o S2 Pro suporta nativamente todos os recursos de aceleração de inferência do SGLang, incluindo loteamento contínuo (Continuous Batching), Paged KV Cache, CUDA Graph e cache de prefixo baseado em RadixAttention.\n\n**Desempenho em uma única GPU NVIDIA H200:**\n- **Fator em Tempo Real (RTF)**: 0.195\n- **Latência do Primeiro Áudio (TTFA)**: aprox. 
100 ms\n- **Taxa de Transferência Ultrarrápida**: Alcance de 3.000+ acoustic tokens/s mantendo RTF < 0.5\n\n### Poderoso Suporte Multilíngue\n\nS2 Pro suporta mais de 80 idiomas, possibilitando síntese de alta qualidade sem a necessidade de fonemas ou processamento específico por idioma:\n\n- **Tier 1**: Japonês (ja), Inglês (en), Chinês (zh)\n- **Tier 2**: Coreano (ko), Espanhol (es), Português (pt), Árabe (ar), Russo (ru), Francês (fr), Alemão (de)\n- **Cobertura Global**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo, etc.\n\n### Geração Nativa Multi-falante\n\n<img src=\"./assets/chattemplate.png\" width=200%>\n\nO Fish Audio S2 permite que os usuários enviem áudio de referência contendo múltiplos falantes, e o modelo processará as características de cada falante via o token `<|speaker:i|>`. Em seguida, você pode controlar o desempenho do modelo através do token de ID do falante, permitindo incluir múltiplos falantes em uma única geração. Não é mais necessário enviar áudios de referência separadamente para cada falante.\n\n### Geração de Diálogos Multiturnos\n\nGraças à expansão do contexto do modelo, nosso modelo agora pode aproveitar as informações prévias para aumentar a expressividade dos conteúdos gerados subsequentemente, elevando assim a naturalidade dos diálogos.\n\n### Clonagem de Voz Rápida\n\nO Fish Audio S2 suporta clonagem de voz precisa usando curtas amostras de referência (normalmente 10-30 segundos). 
O modelo captura o timbre, o estilo de fala e as tendências emocionais, gerando vozes clonadas realistas e consistentes sem necessidade de ajustes finos adicionais.\nCaso deseje utilizar o SGLang Server, consulte o [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).\n\n---\n\n## Agradecimentos\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## Relatório Técnico\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/README.zh.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n[English](../README.md) | **简体中文** | [Portuguese](README.pt-BR.md) | [日本語](README.ja.md) | [한국어](README.ko.md) | [العربية](README.ar.md) <br>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n<br>\n</div>\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n 
   </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n> [!IMPORTANT]\n> **许可证声明**\n> 此代码库及其相关的模型权重均在 **[FISH AUDIO RESEARCH LICENSE](../LICENSE)** 下发布。更多详情请参考 [LICENSE](../LICENSE)。\n\n\n> [!WARNING]\n> **法律免责声明**\n> 我们不对代码库的任何非法使用承担责任。请参考您当地关于 DMCA 和其他相关法律的法规。\n\n## 快速开始\n\n### 文档入口\n\n这里是 Fish Audio S2 的官方文档，请按照说明轻松入门。\n\n- [安装](https://speech.fish.audio/zh/install/)\n- [命令行推理](https://speech.fish.audio/zh/inference/)\n- [WebUI 推理](https://speech.fish.audio/zh/inference/)\n- [服务端推理](https://speech.fish.audio/zh/server/)\n- [Docker 部署](https://speech.fish.audio/zh/install/)\n\n> [!IMPORTANT]\n> **如需使用 SGLang Server，请参考 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md)。**\n\n### LLM Agent 指南\n\n```\n请先阅读 https://speech.fish.audio/zh/install/ ，并按文档安装和配置 Fish Audio S2。\n```\n\n## Fish Audio S2 Pro\n**行业顶尖的多语言文本转语音 (TTS) 系统，重新定义声音生成的边界。**\n\nFish Audio S2 Pro 是 [Fish Audio](https://fish.audio/) 开发的最先进的多模态模型。S2 Pro 训练自超过 **1000 万小时** 的海量音频数据，覆盖全球 **80 多种语言**。通过创新的 **双自回归 (Dual-AR)** 架构与强化学习 (RL) 对齐技术，S2 Pro 能生成极具自然感、真实感且情感饱满的语音，在开源与闭源竞争中均处于领先地位。\n\nS2 Pro 的杀手锏在于支持通过自然语言标签（如 `[whisper]`、`[excited]`、`[angry]`）对韵律与情绪进行 **亚词级（Sub-word Level）** 的极细粒度行内控制，同时原生支持多说话人与超长上下文的多轮对话生成。\n\n立即访问 [Fish Audio 官网](https://fish.audio/) 体验在线演示，或阅读我们的[技术报告](https://arxiv.org/abs/2603.08823)与[博客文章](https://fish.audio/blog/fish-audio-open-sources-s2/)深入了解。\n\n### 模型变体\n\n| 模型 | 大小 | 可用性 | 描述 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B 参数 | 
[HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 功能齐全的旗舰模型，具有最高质量和稳定性 |\n\n有关模型的更多详情，请参见[技术报告](https://arxiv.org/abs/2411.01156)。\n\n## 基准测试结果\n\n| 基准 | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER（中文） | **0.54%**（总体最佳） |\n| Seed-TTS Eval — WER（英文） | **0.99%**（总体最佳） |\n| Audio Turing Test（含指令） | **0.515** 后验均值 |\n| EmergentTTS-Eval — 胜率 | **81.88%**（总体最高） |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 质量 | **4.51 / 5.0** |\n| 多语言（MiniMax Testset）— 最佳 WER | **24** 种语言中的 **11** 种 |\n| 多语言（MiniMax Testset）— 最佳 SIM | **24** 种语言中的 **17** 种 |\n\n在 Seed-TTS Eval 上，S2 在所有已评估模型（包括闭源系统）中实现了最低 WER：Qwen3-TTS（0.77/1.24）、MiniMax Speech-02（0.99/1.90）、Seed-TTS（1.12/2.25）。在 Audio Turing Test 上，S2 的 0.515 相比 Seed-TTS（0.417）提升 24%，相比 MiniMax-Speech（0.387）提升 33%。在 EmergentTTS-Eval 中，S2 在副语言学（91.61% 胜率）、疑问句（84.41%）和句法复杂度（83.39%）等维度表现尤为突出。\n\n## 亮点\n\n<img src=\"./assets/totalability.png\" width=200%>\n\n### 通过自然语言进行极细粒度行内控制\n\nS2 Pro 赋予了语音前所未有的“灵性”。通过简单的 `[tag]` 语法，你可以在文本的任何位置精准嵌入情感指令。\n- **15,000+ 独特标签支持**：不局限于固定的预设，支持 **自由格式的文本描述**。你可以尝试 `[whisper in small voice]` (低声耳语), `[professional broadcast tone]` (专业播音腔), 或 `[pitch up]` (提高音调)。\n- **丰富的情绪库**：\n  `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]`\n\n### 创新的双自回归 (Dual-Autoregressive) 架构\n\nS2 Pro 采用了主从式 Dual-AR 架构，由 Decoder-only Transformer 与 RVQ 音频编解码器（10 个码本，约 21 Hz 帧率）组成：\n\n- **Slow AR (4B 参数)**：沿时间轴工作，预测核心的语义码本。\n- **Fast AR (400M 参数)**：在每个时间步生成剩余 9 个残差码本，细腻还原极致的音频细节。\n\n这种非对称设计在保证音频极致保真度的同时，大幅提升了推理速度。\n\n### 强化学习对齐 (RL Alignment)\n\nS2 Pro 采用了 **Group Relative Policy 
Optimization (GRPO)** 技术进行后训练对齐。我们将用于数据清洗与标注的同一套模型直接作为奖励模型 (Reward Model)，完美解决了预训练数据分布与后训练目标之间的不匹配问题。\n- **多维奖励信号**：综合评估语义准确性、指令遵循能力、声学偏好评分以及音色相似度，确保生成的每一秒语音都符合人类直觉。\n\n### 极致的流式推理性能 (基于 SGLang)\n\n由于 Dual-AR 架构与标准 LLM 结构同构，S2 Pro 原生支持 SGLang 的所有推理加速特性，包括连续批处理 (Continuous Batching)、分页 KV Cache、CUDA Graph 与基于 RadixAttention 的前缀缓存。\n\n**单张 NVIDIA H200 GPU 性能表现：**\n- **实时因子 (RTF)**：0.195\n- **首音延迟 (TTFA)**：约 100 ms\n- **极速吞吐**：在保持 RTF < 0.5 时，吞吐量达到 3,000+ acoustic tokens/s\n\n### 强大的多语言支持\n\nS2 Pro 支持 80 多种语言，无需音素或特定语言的处理即可实现高质量合成：\n\n- **第一梯队 (Tier 1)**：日语 (ja), 英语 (en), 中文 (zh)\n- **第二梯队 (Tier 2)**：韩语 (ko), 西班牙语 (es), 葡萄牙语 (pt), 阿拉伯语 (ar), 俄语 (ru), 法语 (fr), 德语 (de)\n- **全球覆盖**：sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo 等。\n\n### 原生多说话人生成\n\n<img src=\"./assets/chattemplate.png\" width=200%>\n\nFish Audio S2 允许用户上传包含多个说话人的参考音频，模型将通过 `<|speaker:i|>` 令牌处理每个说话人的特征。之后您可以通过说话人 ID 令牌控制模型的表现，从而实现一次生成中包含多个说话人。再也不需要像以前那样针对每个说话人都单独上传参考音频与生成语音了。\n\n### 多轮对话生成\n\n得益于模型上下文的扩展，我们的模型现在可以借助上文的信息提高后续生成内容的表现力，从而提升内容的自然度。\n\n### 快速语音克隆\n\nFish Audio S2 支持使用短参考样本（通常为 10-30 秒）进行准确的语音克隆。模型可以捕捉音色、说话风格和情感倾向，无需额外微调即可生成逼真且一致的克隆语音。\n如需使用 SGLang Server，请参考 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) 。\n\n---\n\n## 致谢\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 技术报告\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: 
Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/ar/finetune.md",
    "content": "# الضبط الدقيق (Fine-tuning)\n\nمن الواضح أنك عندما فتحت هذه الصفحة، لم تكن راضيًا عن أداء النموذج المدرب مسبقًا في وضع zero-shot. أنت ترغب في إجراء ضبط دقيق لنموذج لتحسين أدائه على مجموعة البيانات الخاصة بك.\n\nفي الإصدار الحالي، ما عليك سوى إجراء الضبط الدقيق لجزء 'LLAMA'.\n\n## الضبط الدقيق لـ LLAMA\n### 1. إعداد مجموعة البيانات\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\nتحتاج إلى تحويل مجموعة البيانات الخاصة بك إلى التنسيق أعلاه ووضعها تحت مجلد `data`. يمكن أن يكون للملف الصوتي الامتدادات `.mp3`، `.wav`، أو `.flac`، ويجب أن يكون لملف التعليقات التوضيحية الامتداد `.lab`.\n\n!!! info \"تنسيق مجموعة البيانات\"\n    يحتاج ملف التعليقات التوضيحية `.lab` فقط إلى احتواء النص المكتوب للمقطع الصوتي، دون الحاجة إلى تنسيق خاص. على سبيل المثال، إذا كان محتوى `hi.mp3` هو \"مرحبًا، وداعًا\"، فسيحتوي ملف `hi.lab` على سطر واحد من النص: \"مرحبًا، وداعًا\".\n\n!!! warning \"تحذير\"\n    يوصى بتطبيق تسوية جهارة الصوت (loudness normalization) على مجموعة البيانات. يمكنك استخدام [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) للقيام بذلك.\n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n### 2. الاستخراج الدفعي للرموز الدلالية (semantic tokens)\n\nتأكد من أنك قمت بتنزيل أوزان VQGAN. إذا لم تكن قد فعلت، قم بتشغيل الأمر التالي:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nيمكنك بعد ذلك تشغيل الأمر التالي لاستخراج الرموز الدلالية:\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n```\n\n!!! 
note \"ملاحظة\"\n    يمكنك ضبط `--num-workers` و `--batch-size` لزيادة سرعة الاستخراج، ولكن يرجى التأكد من عدم تجاوز حد ذاكرة وحدة معالجة الرسومات (GPU) الخاصة بك.\n\nسيقوم هذا الأمر بإنشاء ملفات `.npy` في مجلد `data`، كما هو موضح أدناه:\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy\n```\n\n### 3. حزم مجموعة البيانات في protobuf\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\nبعد انتهاء تنفيذ الأمر، يجب أن ترى ملف `protos` في مجلد `data`.\n\n### 4. أخيرًا، الضبط الدقيق باستخدام LoRA\n\nبالمثل، تأكد من أنك قمت بتنزيل أوزان `LLAMA`. إذا لم تكن قد فعلت، قم بتشغيل الأمر التالي:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nأخيرًا، يمكنك بدء الضبط الدقيق عن طريق تشغيل الأمر التالي:\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note \"ملاحظة\"\n    يمكنك تعديل معلمات التدريب مثل `batch_size`، `gradient_accumulation_steps`، وما إلى ذلك لتناسب ذاكرة وحدة معالجة الرسومات الخاصة بك عن طريق تعديل `fish_speech/configs/text2semantic_finetune.yaml`.\n\n!!! note \"ملاحظة\"\n    لمستخدمي Windows، يمكنك استخدام `trainer.strategy.process_group_backend=gloo` لتجنب مشكلات `nccl`.\n\nبعد اكتمال التدريب، يمكنك الرجوع إلى قسم [الاستدلال (inference)](inference.md) لاختبار نموذجك.\n\n!!! info \"معلومات\"\n    بشكل افتراضي، سيتعلم النموذج فقط أنماط كلام المتحدث وليس جرس الصوت (timbre). 
لا تزال بحاجة إلى استخدام التلقينات (prompts) لضمان استقرار جرس الصوت.\n    إذا كنت ترغب في تعلم جرس الصوت، يمكنك زيادة عدد خطوات التدريب، ولكن هذا قد يؤدي إلى الإفراط في التخصيص (overfitting).\n\nبعد التدريب، تحتاج إلى تحويل أوزان LoRA إلى أوزان عادية قبل إجراء الاستدلال.\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/openaudio-s1-mini \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/openaudio-s1-mini-yth-lora/\n```\n!!! note \"ملاحظة\"\n    يمكنك أيضًا تجربة نقاط تحقق (checkpoints) أخرى. نقترح استخدام أقدم نقطة تحقق تلبي متطلباتك، حيث إنها غالبًا ما تؤدي أداءً أفضل على البيانات خارج التوزيع (OOD).\n"
  },
  {
    "path": "docs/ar/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><a href=\"../en/\">English</a> | <a href=\"../zh/\">简体中文</a> | <a href=\"../pt/\">Portuguese</a> | <a href=\"../ja/\">日本語</a> | <a href=\"../ko/\">한국어</a> | <strong>العربية</strong></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"تنبيه الترخيص\"\n    يتم إصدار قاعدة الأكواد هذه وأوزان النماذج المرتبطة بها بموجب رخصة **FISH AUDIO RESEARCH LICENSE**. يرجى الرجوع إلى [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) لمزيد من التفاصيل.\n\n!!! warning \"إخلاء المسؤولية القانونية\"\n    نحن لا نتحمل أي مسؤولية عن أي استخدام غير قانوني لقاعدة الأكواد. يرجى مراجعة القوانين المحلية المتعلقة بـ DMCA والقوانين الأخرى ذات الصلة.\n\n## البدء السريع\n\n### ابدأ من الوثائق\n\nهذه هي الوثائق الرسمية لـ Fish Audio S2، ويمكنك البدء مباشرة عبر الروابط التالية:\n\n- [التثبيت](https://speech.fish.audio/ar/install/)\n- [الاستدلال عبر سطر الأوامر](https://speech.fish.audio/ar/inference/)\n- [استدلال WebUI](https://speech.fish.audio/ar/inference/)\n- [الاستدلال عبر الخادم](https://speech.fish.audio/ar/server/)\n- [إعداد Docker](https://speech.fish.audio/ar/install/)\n\n> [!IMPORTANT]\n> **بالنسبة لخادم SGLang، راجع [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### دليل وكلاء LLM\n\n```\nقم بتثبيت وإعداد Fish Audio S2 باتباع التعليمات في https://speech.fish.audio/ar/install/ .\n```\n\n## Fish Audio S2\n**أفضل نظام لتحويل النص إلى كلام بين الأنظمة مفتوحة المصدر ومغلقة المصدر**\n\nFish Audio S2 هو أحدث نموذج من [Fish Audio](https://fish.audio/). 
تم تدريبه على أكثر من 10 ملايين ساعة صوتية عبر نحو 50 لغة، ويجمع بين المواءمة بالتعلم المعزز وبنية Dual-Autoregressive لإنتاج كلام طبيعي وواقعي وغني بالتعبير العاطفي.\n\nيدعم S2 التحكم الدقيق في النبرة والعاطفة داخل النص نفسه باستخدام وسوم باللغة الطبيعية مثل `[laugh]` و`[whispers]` و`[super happy]`، كما يدعم بشكل أصيل توليد متحدثين متعددين وحوارات متعددة الأدوار.\n\nيمكنك تجربة النموذج مباشرة عبر [موقع Fish Audio](https://fish.audio/)، وقراءة المزيد في [منشور المدونة](https://fish.audio/blog/fish-audio-open-sources-s2/) و[التقرير التقني](https://arxiv.org/abs/2603.08823).\n\n### إصدارات النموذج\n\n| النموذج | الحجم | التوفر | الوصف |\n|------|------|-------------|-------------|\n| S2-Pro | 4B معلمة | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | نموذج رائد كامل الميزات بأعلى مستوى من الجودة والاستقرار |\n\nيمكن العثور على مزيد من التفاصيل في [التقرير التقني](https://arxiv.org/abs/2411.01156).\n\n## نتائج القياس المعياري\n\n| المعيار | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (الصينية) | **0.54%** (الأفضل إجمالاً) |\n| Seed-TTS Eval — WER (الإنجليزية) | **0.99%** (الأفضل إجمالاً) |\n| Audio Turing Test (مع التعليمات) | **0.515** المتوسط البعدي |\n| EmergentTTS-Eval — معدل الفوز | **81.88%** (الأعلى إجمالاً) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — الجودة | **4.51 / 5.0** |\n| متعدد اللغات (MiniMax Testset) — أفضل WER | **11 من 24** لغة |\n| متعدد اللغات (MiniMax Testset) — أفضل SIM | **17 من 24** لغة |\n\nفي Seed-TTS Eval، حقق S2 أقل WER بين جميع النماذج التي تم تقييمها، بما في ذلك الأنظمة المغلقة: Qwen3-TTS ‏(0.77/1.24)، وMiniMax Speech-02 ‏(0.99/1.90)، وSeed-TTS ‏(1.12/2.25). وفي Audio Turing Test، تفوقت قيمة 0.515 على Seed-TTS ‏(0.417) بنسبة 24% وعلى MiniMax-Speech ‏(0.387) بنسبة 33%. 
وفي EmergentTTS-Eval، حقق S2 نتائج قوية بشكل خاص في الخصائص شبه اللغوية (91.61%)، والأسئلة (84.41%)، والتعقيد النحوي (83.39%).\n\n## أبرز المميزات\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### تحكم مضمّن دقيق عبر اللغة الطبيعية\n\nيتيح Fish Audio S2 تحكمًا موضعيًا في توليد الكلام من خلال تضمين تعليمات باللغة الطبيعية مباشرة عند مواقع كلمات أو عبارات محددة داخل النص. وبدلًا من الاعتماد على مجموعة ثابتة من الوسوم المُعرّفة مسبقًا، يقبل S2 أوصافًا نصية حرة مثل [whisper in small voice] أو [professional broadcast tone] أو [pitch up]، مما يتيح تحكمًا مفتوحًا في التعبير على مستوى الكلمة.\n\n### بنية Dual-Autoregressive\n\nيعتمد S2 على Transformer أحادي الاتجاه (Decoder-only) مع مُرمّز صوتي قائم على RVQ (عدد 10 codebooks وبمعدل إطارات يقارب 21 هرتز). وتُقسّم بنية Dual-AR عملية التوليد إلى مرحلتين:\n\n- **Slow AR** يعمل على المحور الزمني ويتنبأ بالـ semantic codebook الأساسي.\n- **Fast AR** يولّد الـ 9 residual codebooks المتبقية في كل خطوة زمنية لإعادة بناء التفاصيل الصوتية الدقيقة.\n\nهذا التصميم غير المتماثل (4B معلمة على المحور الزمني و400M على محور العمق) يرفع كفاءة الاستدلال مع الحفاظ على جودة الصوت.\n\n### المواءمة بالتعلم المعزز\n\nيستخدم S2 خوارزمية Group Relative Policy Optimization (GRPO) للمواءمة بعد التدريب. ويتم إعادة استخدام نفس النماذج التي استُخدمت لتصفية بيانات التدريب وتعليقها كنماذج مكافأة في التعلم المعزز مباشرة، مما يلغي عدم تطابق التوزيع بين بيانات ما قبل التدريب وأهداف ما بعد التدريب. 
وتجمع إشارة المكافأة بين الدقة الدلالية، والالتزام بالتعليمات، وتقييم التفضيل الصوتي، وتشابه النبرة.\n\n### البث الإنتاجي عبر SGLang\n\nلأن بنية Dual-AR متماثلة بنيويًا مع نماذج LLM autoregressive القياسية، فإن S2 يرث مباشرة تحسينات الخدمة الأصلية في SGLang، بما في ذلك: continuous batching، وpaged KV cache، وCUDA graph replay، وprefix caching المعتمد على RadixAttention.\n\nعلى بطاقة NVIDIA H200 واحدة:\n\n- **عامل الزمن الحقيقي (RTF):** 0.195\n- **الزمن حتى أول مقطع صوتي:** حوالي 100 مللي ثانية\n- **معدل المعالجة:** أكثر من 3,000 acoustic tokens/s مع الحفاظ على RTF أقل من 0.5\n\n### دعم لغات متعددة\n\nيدعم Fish Audio S2 تحويل النص إلى كلام بجودة عالية ولغات متعددة دون الحاجة إلى رموز صوتية أو معالجة مسبقة خاصة بكل لغة. بما في ذلك:\n\n**الإنجليزية، الصينية، اليابانية، الكورية، العربية، الألمانية، الفرنسية...**\n\n**وأكثر من ذلك بكثير!**\n\nالقائمة في توسع مستمر، تحقق من [Fish Audio](https://fish.audio/) لمعرفة أحدث الإصدارات.\n\n### توليد أصلي لمتحدثين متعددين\n\n<img src=\"../assets/chattemplate.png\" width=200%>\n\nيسمح Fish Audio S2 للمستخدمين برفع صوت مرجعي يحتوي على متحدثين متعددين، وسيتعامل النموذج مع ميزات كل متحدث عبر رمز `<|speaker:i|>`. يمكنك بعد ذلك التحكم في أداء النموذج باستخدام رمز معرف المتحدث، مما يسمح بتوليد واحد يتضمن متحدثين متعددين. لم تعد بحاجة لرفع ملفات مرجعية منفصلة لكل متحدث.\n\n### توليد حوارات متعددة الأدوار\n\nبفضل توسيع سياق النموذج، يمكن لنموذجنا الآن استخدام المعلومات السابقة لتحسين التعبير في المحتوى المولد لاحقاً، مما يزيد من طبيعية المحتوى.\n\n### استنساخ صوت سريع\n\nيدعم Fish Audio S2 استنساخ الصوت بدقة باستخدام عينة مرجعية قصيرة (عادةً 10-30 ثانية). 
يلتقط النموذج نبرة الصوت، وأسلوب التحدث، والميول العاطفية، مما ينتج أصواتاً مستنسخة واقعية ومتسقة دون الحاجة إلى ضبط دقيق إضافي.\nلاستخدام خادم SGLang، راجع [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) .\n\n---\n\n## شكر وتقدير\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## التقرير التقني\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/ar/inference.md",
    "content": "# الاستنتاج\n\nيتطلب نموذج Fish Audio S2 ذاكرة فيديو (VRAM) كبيرة. نوصي باستخدام وحدة معالجة رسومات (GPU) بسعة 24 جيجابايت على الأقل للاستنتاج.\n\n## تحميل الأوزان\n\nأولاً ، تحتاج إلى تحميل أوزان النموذج:\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## الاستنتاج عبر خط الأوامر\n\n!!! note\n    إذا كنت تخطط لترك النموذج يختار نغمة الصوت عشوائيًا ، فيمكنك تخطي هذه الخطوة.\n\n### 1. الحصول على رموز VQ من الصوت المرجعي\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\nيجب أن تحصل على `fake.npy` و `fake.wav`.\n\n### 2. توليد الرموز الدلالية (Semantic tokens) من النص:\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"النص الذي تريد تحويله\" \\\n    --prompt-text \"النص المرجعي الخاص بك\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\nسيقوم هذا الأمر بإنشاء ملف `codes_N` في دليل العمل ، حيث N هو عدد صحيح يبدأ من 0.\n\n!!! note\n    قد ترغب في استخدام `--compile` لدمج نوى CUDA لاستنتاج أسرع. ومع ذلك ، نوصي باستخدام تحسين تسريع الاستنتاج sglang الخاص بنا.\n    بالمقابل ، إذا كنت لا تخطط لاستخدام التسريع ، يمكنك التعليق على معلمة `--compile`.\n\n!!! info\n    بالنسبة لوحدات معالجة الرسومات التي لا تدعم bf16 ، قد تحتاج إلى استخدام معلمة `--half`.\n\n### 3. توليد الصوت من الرموز الدلالية:\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\" \\\n```\n\nبعد ذلك ستحصل على ملف `fake.wav`.\n\n## استنتاج WebUI\n\n### 1. Gradio WebUI\n\nللحفاظ على التوافق، ما زلنا نحتفظ بواجهة Gradio WebUI السابقة.\n\n```bash\npython tools/run_webui.py # --compile إذا كنت بحاجة إلى تسريع\n```\n\n### 2. Awesome WebUI\n\nتعد Awesome WebUI واجهة ويب حديثة تعتمد على TypeScript، وتوفر ميزات أغنى وتجربة مستخدم أفضل.\n\n**بناء WebUI:**\n\nيجب أن يكون لديك Node.js و npm مثبتين على جهازك المحلي أو الخادم.\n\n1. ادخل إلى دليل `awesome_webui`:\n   ```bash\n   cd awesome_webui\n   ```\n2. 
تثبيت التبعيات:\n   ```bash\n   npm install\n   ```\n3. بناء WebUI:\n   ```bash\n   npm run build\n   ```\n\n**بدء تشغيل خادم الخلفية:**\n\nبعد بناء WebUI، عد إلى دليل جذر المشروع وقم بتشغيل خادم API:\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**الوصول:**\n\nبمجرد تشغيل الخادم، يمكنك الوصول إليه عبر المتصفح على العنوان التالي:\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/ar/install.md",
    "content": "## المتطلبات\n\n- ذاكرة وحدة معالجة الرسومات (GPU): 24 جيجابايت (للاستدلال)\n- النظام: Linux, WSL\n\n## إعداد النظام\n\nيدعم Fish Audio S2 طرق تثبيت متعددة. اختر الطريقة التي تناسب بيئة التطوير الخاصة بك.\n\n**المتطلبات الأساسية**: قم بتثبيت تبعيات النظام لمعالجة الصوت:\n``` bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# تثبيت نسخة GPU (اختر إصدار CUDA الخاص بك: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# تثبيت نسخة CPU فقط\npip install -e .[cpu]\n\n# التثبيت الافتراضي (يستخدم فهرس PyTorch الافتراضي)\npip install -e .\n\n# إذا واجهت خطأ أثناء التثبيت بسبب pyaudio، ففكر في استخدام الأمر التالي:\n# conda install pyaudio\n# ثم قم بتشغيل pip install -e . مرة أخرى\n```\n\n### UV\n\nيوفر UV حلاً أسرع لتثبيت التبعيات:\n\n```bash\n# تثبيت نسخة GPU (اختر إصدار CUDA الخاص بك: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# تثبيت نسخة CPU فقط\nuv sync --python 3.12 --extra cpu\n```\n### دعم Intel Arc XPU\n\nلمستخدمي وحدات معالجة الرسومات Intel Arc، قم بالتثبيت مع دعم XPU على النحو التالي:\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# تثبيت مكتبة C++ القياسية المطلوبة\nconda install libstdcxx -c conda-forge\n\n# تثبيت PyTorch مع دعم Intel XPU\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# تثبيت Fish Speech\npip install -e .\n```\n\n!!! warning\n    خيار `compile` غير مدعوم على أنظمة Windows و macOS. إذا كنت ترغب في التشغيل مع التجميع، ستحتاج إلى تثبيت Triton بنفسك.\n\n\n## إعداد Docker\n\nيوفر نموذج سلسلة Fish Audio S2 خيارات نشر متعددة مع Docker لتلبية الاحتياجات المختلفة. 
يمكنك استخدام الصور المعدة مسبقًا من Docker Hub، أو البناء محليًا باستخدام Docker Compose، أو بناء صور مخصصة يدويًا.\n\nلقد قدمنا صور Docker لكل من واجهة المستخدم الرسومية (WebUI) وخادم API، لكل من وحدات معالجة الرسومات (GPU) (CUDA 12.6 افتراضيًا) ووحدات المعالجة المركزية (CPU). يمكنك استخدام الصور المعدة مسبقًا من Docker Hub، أو البناء محليًا باستخدام Docker Compose، أو بناء صور مخصصة يدويًا. إذا كنت ترغب في البناء محليًا، فاتبع الإرشادات أدناه. إذا كنت ترغب فقط في استخدام الصور المعدة مسبقًا، فاتبع مباشرةً [دليل الاستدلال](inference.md).\n\n### المتطلبات الأساسية\n\n- تثبيت Docker و Docker Compose\n- تثبيت NVIDIA Docker runtime (لدعم GPU)\n- ذاكرة GPU لا تقل عن 24 جيجابايت للاستدلال باستخدام CUDA\n\n### استخدام Docker Compose\n\nللتطوير أو التخصيص، يمكنك استخدام Docker Compose للبناء والتشغيل محليًا:\n\n```bash\n# أولاً، استنسخ المستودع\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# بدء واجهة المستخدم الرسومية (WebUI) مع CUDA\ndocker compose --profile webui up\n\n# بدء واجهة المستخدم الرسومية (WebUI) مع تحسين التجميع\nCOMPILE=1 docker compose --profile webui up\n\n# بدء خادم API\ndocker compose --profile server up\n\n# بدء خادم API مع تحسين التجميع\nCOMPILE=1 docker compose --profile server up\n\n# النشر باستخدام CPU فقط\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### متغيرات البيئة لـ Docker Compose\n\nيمكنك تخصيص النشر باستخدام متغيرات البيئة:\n\n```bash\n# مثال على ملف .env\nBACKEND=cuda              # أو cpu\nCOMPILE=1                 # تمكين تحسين التجميع\nGRADIO_PORT=7860         # منفذ واجهة المستخدم الرسومية (WebUI)\nAPI_PORT=8080            # منفذ خادم API\nUV_VERSION=0.8.15        # إصدار مدير الحزم UV\n```\n\nسيقوم الأمر ببناء الصورة وتشغيل الحاوية. 
يمكنك الوصول إلى واجهة المستخدم الرسومية (WebUI) على `http://localhost:7860` وخادم API على `http://localhost:8080`.\n\n### البناء اليدوي باستخدام Docker\n\nللمستخدمين المتقدمين الذين يرغبون في تخصيص عملية البناء:\n\n```bash\n# بناء صورة واجهة المستخدم الرسومية (WebUI) مع دعم CUDA\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# بناء صورة خادم API مع دعم CUDA\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# بناء صورة CPU فقط (تدعم منصات متعددة)\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# بناء صورة التطوير\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### وسيطات البناء\n\n- `BACKEND`: `cuda` أو `cpu` (الافتراضي: `cuda`)\n- `CUDA_VER`: إصدار CUDA (الافتراضي: `12.6.0`)\n- `UV_EXTRA`: حزمة UV إضافية لـ CUDA (الافتراضي: `cu126`)\n- `UBUNTU_VER`: إصدار Ubuntu (الافتراضي: `24.04`)\n- `PY_VER`: إصدار Python (الافتراضي: `3.12`)\n\n### تحميل المجلدات\n\nتتطلب كلتا الطريقتين تحميل المجلدات التالية:\n\n- `./checkpoints:/app/checkpoints` - مجلد أوزان النموذج\n- `./references:/app/references` - مجلد ملفات الصوت المرجعية\n\n### متغيرات البيئة\n\n- `COMPILE=1` - تمكين `torch.compile` لتسريع الاستدلال (حوالي 10 أضعاف)\n- `GRADIO_SERVER_NAME=0.0.0.0` - مضيف خادم واجهة المستخدم الرسومية (WebUI)\n- `GRADIO_SERVER_PORT=7860` - منفذ خادم واجهة المستخدم الرسومية (WebUI)\n- `API_SERVER_NAME=0.0.0.0` - مضيف خادم API\n- `API_SERVER_PORT=8080` - منفذ خادم 
API\n\n!!! note\n    تتوقع حاويات Docker أن يتم تحميل أوزان النموذج في `/app/checkpoints`. تأكد من تنزيل أوزان النموذج المطلوبة قبل بدء الحاويات.\n\n!!! warning\n    يتطلب دعم GPU وجود NVIDIA Docker runtime. للنشر باستخدام CPU فقط، قم بإزالة علامة `--gpus all` واستخدم صور CPU.\n"
  },
  {
    "path": "docs/en/finetune.md",
    "content": "# Fine-tuning\n\n!!! warning \n    We highly do note recoomand users to do fine-tuning on an RL trained model. Fine-tuning a model after RL can shift the model distribution, which may lead to degraded performance.\n\nIn the current version, you only need to finetune the 'LLAMA' part.\n\n## Fine-tuning LLAMA\n### 1. Prepare the dataset\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\nYou need to convert your dataset into the above format and place it under `data`. The audio file can have the extensions `.mp3`, `.wav`, or `.flac`, and the annotation file should have the extension `.lab`.\n\n!!! info\n    The `.lab` annotation file only needs to contain the transcription of the audio, with no special formatting required. For example, if `hi.mp3` says \"Hello, goodbye,\" then the `hi.lab` file would contain a single line of text: \"Hello, goodbye.\"\n\n!!! warning\n    It's recommended to apply loudness normalization to the dataset. You can use [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) to do this.\n\n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n\n### 2. Batch extraction of semantic tokens\n\nMake sure you have downloaded the VQGAN weights. If not, run the following command:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nYou can then run the following command to extract semantic tokens:\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n```\n\n!!! 
note\n    You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.\n\nThis command will create `.npy` files in the `data` directory, as shown below:\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy\n```\n\n### 3. Pack the dataset into protobuf\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\nAfter the command finishes executing, you should see the `protos` file in the `data` directory.\n\n### 4. Finally, fine-tuning with LoRA\n\nSimilarly, make sure you have downloaded the `LLAMA` weights. If not, run the following command:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nFinally, you can start the fine-tuning by running the following command:\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note\n    You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech/configs/text2semantic_finetune.yaml`.\n\n!!! note\n    For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues.\n\nAfter training is complete, you can refer to the [inference](inference.md) section to test your model.\n\n!!! info\n    By default, the model will only learn the speaker's speech patterns and not the timbre. 
You still need to use prompts to ensure timbre stability.\n    If you want to learn the timbre, you can increase the number of training steps, but this may lead to overfitting.\n\nAfter training, you need to convert the LoRA weights to regular weights before performing inference.\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/openaudio-s1-mini \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/openaudio-s1-mini-yth-lora/\n```\n\n!!! note\n    You may also try other checkpoints. We suggest using the earliest checkpoint that meets your requirements, as earlier checkpoints often perform better on out-of-distribution (OOD) data.\n"
  },
  {
    "path": "docs/en/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><strong>English</strong> | <a href=\"../zh/\">简体中文</a> | <a href=\"../pt/\">Portuguese</a> | <a href=\"../ja/\">日本語</a> | <a href=\"../ko/\">한국어</a> | <a href=\"../ar/\">العربية</a></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"License Notice\"\n    This codebase and its associated model weights are released under **FISH AUDIO RESEARCH LICENSE**. Please refer to [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) for more details. We will take action against any violation of the license.\n\n!!! warning \"Legal Disclaimer\"\n    We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws.\n\n## Quick Start\n\n### For Human\n\nHere are the official documents for Fish Audio S2, follow the instructions to get started easily.\n\n- [Installation](https://speech.fish.audio/install/)\n- [Command Line Inference](https://speech.fish.audio/inference/#command-line-inference)\n- [WebUI Inference](https://speech.fish.audio/inference/#webui-inference)\n- [Server Inference](https://speech.fish.audio/server/)\n- [Docker Setup](https://speech.fish.audio/install/#docker-setup)\n\n> [!IMPORTANT]\n> **For SGLang server, please read [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### For LLM Agent\n\n```\nInstall and configure Fish-Audio S2 by following the instructions here: https://speech.fish.audio/install/\n```\n\n## Fish Audio S2  \n**Best text-to-speech system among both open source and closed source**\n\nFish Audio S2 is the latest model developed by [Fish Audio](https://fish.audio/). 
Trained on over 10 million hours of audio across approximately 50 languages, S2 combines reinforcement learning alignment with a Dual-Autoregressive architecture to generate speech that sounds natural, realistic, and emotionally rich.\n\nS2 supports fine-grained inline control of prosody and emotion using natural-language tags like `[laugh]`, `[whispers]`, and `[super happy]`, as well as native multi-speaker and multi-turn generation.\n\nVisit the [Fish Audio website](https://fish.audio/) for live playground. Read the [blog post](https://fish.audio/blog/fish-audio-open-sources-s2/) and [technical report](https://arxiv.org/abs/2603.08823) for more details.\n\n### Model Variants\n\n| Model | Size | Availability | Description |\n|------|------|-------------|-------------|\n| S2-Pro | 4B parameters | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Full-featured flagship model with maximum quality and stability |\n\nMore details of the model can be found in the [technical report](https://arxiv.org/abs/2411.01156).\n\n## Benchmark Results\n\n| Benchmark | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (Chinese) | **0.54%** (best overall) |\n| Seed-TTS Eval — WER (English) | **0.99%** (best overall) |\n| Audio Turing Test (with instruction) | **0.515** posterior mean |\n| EmergentTTS-Eval — Win Rate | **81.88%** (highest overall) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — Quality | **4.51 / 5.0** |\n| Multilingual (MiniMax Testset) — Best WER | **11 of 24** languages |\n| Multilingual (MiniMax Testset) — Best SIM | **17 of 24** languages |\n\nOn Seed-TTS Eval, S2 achieves the lowest WER among all evaluated models including closed-source systems: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). On the Audio Turing Test, 0.515 surpasses Seed-TTS (0.417) by 24% and MiniMax-Speech (0.387) by 33%. 
On EmergentTTS-Eval, S2 achieves particularly strong results in paralinguistics (91.61% win rate), questions (84.41%), and syntactic complexity (83.39%).\n\n## Highlights\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### Fine-Grained Inline Control via Natural Language\n\nS2 enables localized control over speech generation by embedding natural-language instructions directly at specific word or phrase positions within the text. Rather than relying on a fixed set of predefined tags, S2 accepts free-form textual descriptions — such as `[whisper in small voice]`, `[professional broadcast tone]`, or `[pitch up]` — allowing open-ended expression control at the word level.\n\n### Dual-Autoregressive Architecture\n\nS2 builds on a decoder-only transformer combined with an RVQ-based audio codec (10 codebooks, ~21 Hz frame rate). The Dual-AR architecture splits generation into two stages:\n\n- **Slow AR** operates along the time axis and predicts the primary semantic codebook.\n- **Fast AR** generates the remaining 9 residual codebooks at each time step, reconstructing fine-grained acoustic detail.\n\nThis asymmetric design — 4B parameters along the time axis, 400M parameters along the depth axis — keeps inference efficient while preserving audio fidelity.\n\n### Reinforcement Learning Alignment\n\nS2 uses Group Relative Policy Optimization (GRPO) for post-training alignment. The same models used to filter and annotate training data are directly reused as reward models during RL — eliminating distribution mismatch between pre-training data and post-training objectives. 
The reward signal combines semantic accuracy, instruction adherence, acoustic preference scoring, and timbre similarity.\n\n### Production Streaming via SGLang\n\nBecause the Dual-AR architecture is structurally isomorphic to standard autoregressive LLMs, S2 directly inherits all LLM-native serving optimizations from SGLang — including continuous batching, paged KV cache, CUDA graph replay, and RadixAttention-based prefix caching.\n\nOn a single NVIDIA H200 GPU:\n\n- **Real-Time Factor (RTF):** 0.195\n- **Time-to-first-audio:** ~100 ms\n- **Throughput:** 3,000+ acoustic tokens/s while maintaining RTF below 0.5\n\n### Multilingual Support\n\nS2 supports high-quality multilingual text-to-speech without requiring phonemes or language-specific preprocessing, including:\n\n**English, Chinese, Japanese, Korean, Arabic, German, French...**\n\n**AND MORE!**\n\nThe list is constantly expanding; check [Fish Audio](https://fish.audio/) for the latest releases.\n\n### Native Multi-Speaker Generation\n\n<img src=\"../assets/chattemplate.png\" width=200%>\n\nFish Audio S2 allows users to upload reference audio that contains multiple speakers; the model handles each speaker's features via the `<|speaker:i|>` token. You can then control the model's output with the speaker ID token, allowing a single generation to include multiple speakers. You no longer need to upload reference audio separately for each speaker.\n\n### Multi-Turn Generation\n\nThanks to the expansion of the model context, our model can now use previous information to improve the expressiveness of subsequent generated content, thereby increasing the naturalness of the content.\n\n### Rapid Voice Cloning\n\nFish Audio S2 supports accurate voice cloning using a short reference sample (typically 10–30 seconds). 
The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning.\n\nPlease refer to the [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) to use the SGLang server.\n\n---\n\n## Credits\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## Tech Report\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/en/inference.md",
    "content": "# Inference\n\nThe Fish Audio S2 model requires a large amount of VRAM. We recommend using a GPU with at least 24GB for inference.\n\n## Download Weights\n\nFirst, you need to download the model weights:\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## Command Line Inference\n\n!!! note\n    If you plan to let the model randomly choose a voice timbre, you can skip this step.\n\n### 1. Get VQ tokens from reference audio\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\nYou should get a `fake.npy` and a `fake.wav`.\n\n### 2. Generate Semantic tokens from text:\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"The text you want to convert\" \\\n    --prompt-text \"Your reference text\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\nThis command will create a `codes_N` file in the working directory, where N is an integer starting from 0.\n\n!!! note\n    You may want to use `--compile` to fuse CUDA kernels for faster inference. However, we recommend using our sglang inference acceleration optimization.\n    Correspondingly, if you do not plan to use acceleration, you can comment out the `--compile` parameter.\n\n!!! info\n    For GPUs that do not support bf16, you may need to use the `--half` parameter.\n\n### 3. Generate vocals from semantic tokens:\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\" \\\n```\n\nAfter that, you will get a `fake.wav` file.\n\n## WebUI Inference\n\n### 1. Gradio WebUI\n\nFor compatibility, we still maintain the Gradio WebUI.\n\n```bash\npython tools/run_webui.py # --compile if you need acceleration\n```\n\n### 2. 
Awesome WebUI\n\nAwesome WebUI is a modernized Web interface built with TypeScript, offering richer features and a better user experience.\n\n**Build WebUI:**\n\nYou need to have Node.js and npm installed on your local machine or server.\n\n1. Enter the `awesome_webui` directory:\n   ```bash\n   cd awesome_webui\n   ```\n2. Install dependencies:\n   ```bash\n   npm install\n   ```\n3. Build the WebUI:\n   ```bash\n   npm run build\n   ```\n\n**Start Backend Server:**\n\nAfter building the WebUI, return to the project root and start the API server:\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**Access:**\n\nOnce the server is running, you can access it via your browser:\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/en/install.md",
    "content": "## Requirements\n\n- GPU Memory: 24GB (Inference)\n- System: Linux, WSL\n\n## System Setup\n\nFish Audio S2 supports multiple installation methods. Choose the one that best fits your development environment.\n\n**Prerequisites**: Install system dependencies for audio processing:\n``` bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU installation (choose your CUDA version: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU-only installation\npip install -e .[cpu]\n\n# Default installation (uses PyTorch default index)\npip install -e .\n\n# If you encounter an error during installation due to pyaudio, consider using the following command:\n# conda install pyaudio\n# Then run pip install -e . again\n```\n\n### UV\n\nUV provides faster dependency resolution and installation:\n\n```bash\n# GPU installation (choose your CUDA version: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU-only installation\nuv sync --python 3.12 --extra cpu\n```\n### Intel Arc XPU support\n\nFor Intel Arc GPU users, install with XPU support:\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Install required C++ standard library\nconda install libstdcxx -c conda-forge\n\n# Install PyTorch with Intel XPU support\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Install Fish Speech\npip install -e .\n```\n\n!!! warning\n    The `compile` option is not supported on Windows and macOS. If you want to run with compile, you need to install Triton manually.\n\n\n## Docker Setup\n\nFish Audio S2 series model provides multiple Docker deployment options to suit different needs. 
You can use pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images.\n\nWe provide Docker images for both WebUI and API server on both GPU (CUDA 12.6 by default) and CPU. You can use the pre-built images from Docker Hub, build locally with Docker Compose, or manually build custom images. If you want to build locally, follow the instructions below. If you only want to use pre-built images, follow the [inference guide](inference.md).\n\n### Prerequisites\n\n- Docker and Docker Compose installed\n- NVIDIA Docker runtime (for GPU support)\n- At least 24GB GPU memory for CUDA inference\n\n### Use Docker Compose\n\nFor development or customization, you can use Docker Compose to build and run locally:\n\n```bash\n# Clone the repository first\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# Start WebUI with CUDA\ndocker compose --profile webui up\n\n# Start WebUI with compile optimization\nCOMPILE=1 docker compose --profile webui up\n\n# Start API server\ndocker compose --profile server up\n\n# Start API server with compile optimization  \nCOMPILE=1 docker compose --profile server up\n\n# For CPU-only deployment\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### Environment Variables for Docker Compose\n\nYou can customize the deployment using environment variables:\n\n```bash\n# .env file example\nBACKEND=cuda              # or cpu\nCOMPILE=1                 # Enable compile optimization\nGRADIO_PORT=7860         # WebUI port\nAPI_PORT=8080            # API server port\nUV_VERSION=0.8.15        # UV package manager version\n```\n\nThe command will build the image and run the container. 
You can access the WebUI at `http://localhost:7860` and the API server at `http://localhost:8080`.\n\n### Manual Docker Build\n\nFor advanced users who want to customize the build process:\n\n```bash\n# Build WebUI image with CUDA support\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# Build API server image with CUDA support\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# Build CPU-only images (supports multi-platform)\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# Build development image\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### Build Arguments\n\n- `BACKEND`: `cuda` or `cpu` (default: `cuda`)\n- `CUDA_VER`: CUDA version (default: `12.6.0`)\n- `UV_EXTRA`: UV extra for CUDA (default: `cu126`)\n- `UBUNTU_VER`: Ubuntu version (default: `24.04`)\n- `PY_VER`: Python version (default: `3.12`)\n\n### Volume Mounts\n\nBoth methods require mounting these directories:\n\n- `./checkpoints:/app/checkpoints` - Model weights directory\n- `./references:/app/references` - Reference audio files directory\n\n### Environment Variables\n\n- `COMPILE=1` - Enable torch.compile for faster inference (~10x speedup)\n- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI server host\n- `GRADIO_SERVER_PORT=7860` - WebUI server port\n- `API_SERVER_NAME=0.0.0.0` - API server host  \n- `API_SERVER_PORT=8080` - API server port\n\n!!! 
note\n    The Docker containers expect model weights to be mounted at `/app/checkpoints`. Make sure to download the required model weights before starting the containers.\n\n!!! warning\n    GPU support requires NVIDIA Docker runtime. For CPU-only deployment, remove the `--gpus all` flag and use CPU images.\n"
  },
  {
    "path": "docs/en/server.md",
    "content": "# Server\n\nThis page covers server-side inference for Fish Audio S2, plus quick links for WebUI inference and Docker deployment.\n\n## API Server Inference\n\nFish Speech provides an HTTP API server entrypoint at `tools/api_server.py`.\n\n### Start the server locally\n\n```bash\npython tools/api_server.py \\\n  --llama-checkpoint-path checkpoints/s2-pro \\\n  --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \\\n  --listen 0.0.0.0:8080\n```\n\nCommon options:\n\n- `--compile`: enable `torch.compile` optimization\n- `--half`: use fp16 mode\n- `--api-key`: require bearer token authentication\n- `--workers`: set worker process count\n\n### Health check\n\n```bash\ncurl -X GET http://127.0.0.1:8080/v1/health\n```\n\nExpected response:\n\n```json\n{\"status\":\"ok\"}\n```\n\n### Main API endpoint\n\n- `POST /v1/tts` for text-to-speech generation\n- `POST /v1/vqgan/encode` for VQ encode\n- `POST /v1/vqgan/decode` for VQ decode\n\n## WebUI Inference\n\nFor WebUI usage, see:\n\n- [WebUI Inference](https://speech.fish.audio/inference/#webui-inference)\n\n## Docker\n\nFor Docker-based server or WebUI deployment, see:\n\n- [Docker Setup](https://speech.fish.audio/install/#docker-setup)\n\nYou can also start the server profile directly with Docker Compose:\n\n```bash\ndocker compose --profile server up\n```\n"
  },
  {
    "path": "docs/ja/finetune.md",
    "content": "# ファインチューニング\n\nこのページを開いたということは、明らかに、事前学習済みモデルのゼロショット性能に満足していないということでしょう。データセットでより良い性能を発揮するようにモデルをファインチューニングしたいとお考えのはずです。\n\n現在のバージョンでは、「LLAMA」部分のみをファインチューニングする必要があります。\n\n## LLAMA のファインチューニング\n### 1. データセットの準備\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\nデータセットを上記の形式に変換し、`data` ディレクトリに配置する必要があります。音声ファイルの拡張子は `.mp3`、`.wav`、または `.flac` が使用でき、注釈ファイルの拡張子は `.lab` にすることを推奨します。\n\n!!! info\n    `.lab` 注釈ファイルには、音声の書き起こしテキストのみを含める必要があり、特別なフォーマット要件はありません。たとえば、`hi.mp3` の内容が「こんにちは、さようなら。」である場合、`hi.lab` ファイルには「こんにちは、さようなら。」という一行のテキストのみが含まれます。\n\n!!! warning\n    データセットにラウドネス正規化を適用することをお勧めします。これには [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) を使用できます。\n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n### 2. セマンティックトークンの一括抽出\n\nVQGANの重みをダウンロードしていることを確認してください。まだの場合は、次のコマンドを実行してください。\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nその後、次のコマンドを実行してセマンティックトークンを抽出できます。\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n```\n\n!!! note\n    `--num-workers` と `--batch-size` を調整して抽出速度を向上させることができますが、GPUメモリの制限を超えないように注意してください。\n\nこのコマンドは `data` ディレクトリに `.npy` ファイルを作成します。以下のようになります。\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy```\n\n### 3. 
データセットを protobuf にパックする\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\nコマンドの実行が完了すると、`data` ディレクトリに `protos` ファイルが表示されるはずです。\n\n### 4. 最後に LoRA でファインチューニング\n\n同様に、`LLAMA` の重みをダウンロードしていることを確認してください。まだの場合は、次のコマンドを実行してください。\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\n最後に、次のコマンドを実行してファインチューニングを開始できます。\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note\n    `fish_speech/configs/text2semantic_finetune.yaml` を変更することで、`batch_size` や `gradient_accumulation_steps` などのトレーニングパラメータをGPUメモリに合わせて変更できます。\n\n!!! note\n    Windows ユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。\n\nトレーニングが完了したら、[推論](inference.md) のセクションを参照してモデルをテストできます。\n\n!!! info\n    デフォルト設定では、モデルは話者の発音方法のみを学習し、音色は学習しません。音色の安定性を確保するためには、依然としてプロンプトを使用する必要があります。\n    音色を学習させたい場合は、トレーニングステップ数を増やしてください。ただし、これにより過学習が発生する可能性があります。\n\nトレーニング後、推論を行う前に LoRA の重みを通常の重みに変換する必要があります。\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/openaudio-s1-mini \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/openaudio-s1-mini-yth-lora/\n```\n\n!!! note\n    他のチェックポイントを試すこともできます。要件を満たす最も早いチェックポイントを使用することをお勧めします。これらは通常、OOD（分布外）データに対してより良いパフォーマンスを発揮します。\n"
  },
  {
    "path": "docs/ja/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><a href=\"../en/\">English</a> | <a href=\"../zh/\">简体中文</a> | <a href=\"../pt/\">Portuguese</a> | <strong>日本語</strong> | <a href=\"../ko/\">한국어</a> | <a href=\"../ar/\">العربية</a></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"ライセンス通知\"\n    このコードベースおよび関連するモデルの重みは **FISH AUDIO RESEARCH LICENSE** の下でリリースされています。詳細は [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) を参照してください。\n\n!!! warning \"法的免責事項\"\n    私たちは、コードベースのいかなる違法な使用に対しても責任を負いません。DMCA およびその他の関連法に関する現地の規制を参照してください。\n\n## クイックスタート\n\n### まずはドキュメントから\n\nFish Audio S2 の公式ドキュメントです。以下からすぐに始められます。\n\n- [インストール](https://speech.fish.audio/ja/install/)\n- [コマンドライン推論](https://speech.fish.audio/ja/inference/)\n- [WebUI 推論](https://speech.fish.audio/ja/inference/)\n- [サーバー推論](https://speech.fish.audio/ja/server/)\n- [Docker セットアップ](https://speech.fish.audio/ja/install/)\n\n> [!IMPORTANT]\n> **SGLang サーバーについては [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) を参照してください。**\n\n### LLM Agent 向け\n\n```\nhttps://speech.fish.audio/ja/install/ の手順に従って、Fish Audio S2 をインストール・設定してください。\n```\n\n## Fish Audio S2\n**オープンソースおよびクローズドソースの中で最も優れたテキスト読み上げシステム**\n\nFish Audio S2 は [Fish Audio](https://fish.audio/) が開発した最新モデルです。約 50 言語・1,000 万時間超の音声データで学習され、強化学習アラインメントと Dual-Autoregressive アーキテクチャを組み合わせることで、自然でリアルかつ感情表現豊かな音声を生成します。\n\nS2 は `[laugh]`、`[whispers]`、`[super happy]` といった自然言語タグで、韻律や感情を文中の任意位置で細かく制御できます。さらに、マルチスピーカー生成とマルチターン生成にもネイティブ対応しています。\n\nライブデモは [Fish Audio ウェブサイト](https://fish.audio/) から、詳細は [ブログ記事](https://fish.audio/blog/fish-audio-open-sources-s2/) と [技術レポート](https://arxiv.org/abs/2603.08823) をご覧ください。\n\n### 
モデルバリアント\n\n| モデル | サイズ | 利用可能性 | 説明 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B パラメータ | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 品質と安定性を最大化したフル機能のフラッグシップモデル |\n\nモデルの詳細は[技術レポート](https://arxiv.org/abs/2411.01156)をご参照ください。\n\n## ベンチマーク結果\n\n| ベンチマーク | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER（中国語） | **0.54%**（全体最良） |\n| Seed-TTS Eval — WER（英語） | **0.99%**（全体最良） |\n| Audio Turing Test（指示あり） | **0.515** 事後平均値 |\n| EmergentTTS-Eval — 勝率 | **81.88%**（全体最高） |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 品質 | **4.51 / 5.0** |\n| 多言語（MiniMax Testset）— 最良 WER | **24 言語中 11 言語** |\n| 多言語（MiniMax Testset）— 最良 SIM | **24 言語中 17 言語** |\n\nSeed-TTS Eval では、S2 はクローズドソースを含む全評価モデルの中で最小 WER を達成しました：Qwen3-TTS（0.77/1.24）、MiniMax Speech-02（0.99/1.90）、Seed-TTS（1.12/2.25）。Audio Turing Test では 0.515 を記録し、Seed-TTS（0.417）比で 24%、MiniMax-Speech（0.387）比で 33% 上回りました。EmergentTTS-Eval では、副言語情報（91.61%）、疑問文（84.41%）、統語的複雑性（83.39%）で特に高い成績を示しています。\n\n## ハイライト\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### 自然言語による細粒度インライン制御\n\nFish Audio S2 では、テキスト内の特定の単語やフレーズ位置に自然言語の指示を直接埋め込むことで、音声生成を局所的に制御できます。固定の事前定義タグに依存するのではなく、S2 は [whisper in small voice]、[professional broadcast tone]、[pitch up] のような自由形式のテキスト記述を受け付け、単語レベルで表現をオープンエンドに制御できます。\n\n### 二重自己回帰（Dual-Autoregressive）アーキテクチャ\n\nS2 はデコーダー専用 Transformer と RVQ ベースの音声コーデック（10 codebooks、約 21 Hz）を組み合わせています。Dual-AR は生成を 2 段階に分割します。\n\n- **Slow AR** は時間軸方向に動作し、主となる semantic codebook を予測。\n- **Fast AR** は各時刻で残り 9 個の residual codebook を生成し、細かな音響ディテールを復元。\n\nこの非対称設計（時間軸 4B パラメータ、深さ軸 400M パラメータ）により、音質を保ちながら推論効率を高めています。\n\n### 強化学習アラインメント\n\nS2 は後学習アラインメントに Group Relative Policy Optimization（GRPO）を採用しています。学習データのフィルタリングとアノテーションに使った同一モデル群を、そのまま RL の報酬モデルとして再利用することで、事前学習データ分布と事後学習目的のミスマッチを抑制しています。報酬信号には、意味的正確性、指示追従性、音響的選好スコア、音色類似度が含まれます。\n\n### SGLang による本番向けストリーミング\n\nDual-AR は構造的に標準的な自己回帰 LLM と同型のため、S2 は SGLang の LLM 向け最適化をそのまま活用できます。たとえば continuous batching、paged KV 
cache、CUDA graph replay、RadixAttention ベースの prefix caching です。\n\n単一の NVIDIA H200 GPU での実測:\n\n- **RTF（Real-Time Factor）:** 0.195\n- **初回音声出力までの時間:** 約 100 ms\n- **スループット:** RTF 0.5 未満を維持しつつ 3,000+ acoustic tokens/s\n\n### 多言語サポート\n\nFish Audio S2 は、音素や言語固有の前処理を必要とせずに、高品質な多言語テキスト読み上げをサポートします。以下を含みます：\n\n**英語、中国語、日本語、韓国語、アラビア語、ドイツ語、フランス語...**\n\n**さらに多く！**\n\nリストは常に拡大しています。最新のリリースについては [Fish Audio](https://fish.audio/) を確認してください。\n\n### ネイティブなマルチスピーカー生成\n\n<img src=\"../assets/chattemplate.png\" width=200%>\n\nFish Audio S2 では、ユーザーが複数のスピーカーを含む参照オーディオをアップロードでき、モデルは `<|speaker:i|>` トークンを介して各スピーカーの特徴を処理します。その後、スピーカーIDトークンを使用してモデルのパフォーマンスを制御し、1回の生成で複数のスピーカーを含めることができます。以前のように各スピーカーに対して個別に参照オーディオをアップロードして音声を生成する必要はもうありません。\n\n### マルチターン対話生成\n\nモデルのコンテキストの拡張により、以前の情報を使用して後続の生成されたコンテンツの表現力を向上させ、コンテンツの自然さを高めることができるようになりました。\n\n### 高速音声クローニング\n\nFish Audio S2 は、短い参照サンプル（通常10〜30秒）を使用した正確な音声クローニングをサポートしています。モデルは音色、話し方、感情的な傾向を捉え、追加の微調整なしでリアルで一貫したクローン音声を生成します。\nSGLang サーバーの利用については [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) を参照してください。\n\n---\n\n## クレジット\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 技術レポート\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n     
 title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/ja/inference.md",
    "content": "# 推論\n\nFish Audio S2 モデルは大きなビデオメモリを必要とします。推論には少なくとも 24GB の GPU を使用することをお勧めします。\n\n## 重みのダウンロード\n\nまず、モデルの重みをダウンロードする必要があります：\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## コマンドライン推論\n\n!!! note\n    モデルに音声をランダムに選択させる場合は、このステップをスキップできます。\n\n### 1. リファレンスオーディオから VQ トークンを取得する\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\n`fake.npy` と `fake.wav` が生成されるはずです。\n\n### 2. テキストから Semantic トークンを生成する：\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"変換したいテキスト\" \\\n    --prompt-text \"リファレンステキスト\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\nこのコマンドは、作業ディレクトリに `codes_N` ファイルを作成します。ここで N は 0 から始まる整数です。\n\n!!! note\n    より高速な推論のために CUDA カーネルを融合する `--compile` を使用したい場合がありますが、私たちの sglang 推論加速最適化を使用することをお勧めします。\n    同様に、加速を使用する予定がない場合は、`--compile` パラメータをコメントアウトしてください。\n\n!!! info\n    bf16 をサポートしていない GPU の場合、`--half` パラメータを使用する必要があるかもしれません。\n\n### 3. セマンティックトークンから音声を生成する：\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\" \\\n```\n\nその後、`fake.wav` ファイルが取得できます。\n\n## WebUI 推論\n\n### 1. Gradio WebUI\n\n互換性を維持するため、以前の Gradio WebUI も引き続き利用可能です。\n\n```bash\npython tools/run_webui.py # 加速が必要な場合は --compile\n```\n\n### 2. Awesome WebUI\n\nAwesome WebUI は TypeScript で開発された、より豊富な機能と優れたユーザー体験を提供する最新の Web インターフェースです。\n\n**WebUI のビルド：**\n\nローカルまたはサーバーに Node.js と npm がインストールされている必要があります。\n\n1. `awesome_webui` ディレクトリに移動します：\n   ```bash\n   cd awesome_webui\n   ```\n2. 依存関係をインストールします：\n   ```bash\n   npm install\n   ```\n3. WebUI をビルドします：\n   ```bash\n   npm run build\n   ```\n\n**バックエンドサーバーの起動：**\n\nWebUI のビルドが完了したら、プロジェクトのルートに戻り、API サーバーを起動します：\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**アクセス：**\n\nサーバーが起動したら、ブラウザから以下のアドレスにアクセスして体験できます：\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/ja/install.md",
    "content": "## 必要条件\n\n- GPUメモリ: 24GB (推論時)\n- システム: Linux, WSL\n\n## システムセットアップ\n\nFish Audio S2は複数のインストール方法をサポートしています。ご自身の開発環境に最も適した方法をお選びください。\n\n**前提条件**: 音声処理のためのシステム依存関係をインストールします:\n``` bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU版のインストール (CUDAバージョンを選択: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU版のみのインストール\npip install -e .[cpu]\n\n# デフォルトインストール (PyTorchのデフォルトインデックスを使用)\npip install -e .\n\n# pyaudioのインストールでエラーが発生する場合は、以下のコマンドを試してください：\n# conda install pyaudio\n# その後、再度 pip install -e . を実行してください\n```\n\n### UV\n\nUVはより高速な依存関係の解決とインストールを実現します:\n\n```bash\n# GPU版のインストール (CUDAバージョンを選択: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU版のみのインストール\nuv sync --python 3.12 --extra cpu\n```\n### Intel Arc XPU サポート\n\nIntel Arc GPUユーザーは、以下の手順でXPUサポートをインストールしてください:\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# 必要なC++標準ライブラリをインストール\nconda install libstdcxx -c conda-forge\n\n# Intel XPU対応のPyTorchをインストール\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Fish Speechのインストール\npip install -e .\n```\n\n!!! 
warning\n    `compile`オプションはWindowsとmacOSではサポートされていません。コンパイルを有効にして実行したい場合は、ご自身でTritonをインストールする必要があります。\n\n\n## Dockerセットアップ\n\nFish Audio S2シリーズモデルは、さまざまなニーズに応えるため複数のDockerデプロイメントオプションを提供しています。Docker Hubのビルド済みイメージを使用するか、Docker Composeでローカルビルドするか、手動でカスタムイメージをビルドすることができます。\n\nWebUIとAPIサーバーの両方について、GPU（デフォルトはCUDA 12.6）版とCPU版のDockerイメージを提供しています。Docker Hubのビルド済みイメージを使用するか、Docker Composeでローカルビルドするか、手動でカスタムイメージをビルドするかを選択できます。ローカルでビルドする場合は、以下の手順に従ってください。ビルド済みイメージを使用するだけの場合は、[推論ガイド](inference.md)を直接参照してください。\n\n### 前提条件\n\n- DockerとDocker Composeがインストール済みであること\n- NVIDIA Dockerランタイムがインストール済みであること（GPUサポート用）\n- CUDAによる推論のために、少なくとも24GBのGPUメモリがあること\n\n### Docker Composeの使用\n\n開発やカスタマイズのために、Docker Composeを使用してローカルでビルド・実行できます:\n\n```bash\n# まず、リポジトリをクローンします\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# CUDAでWebUIを起動\ndocker compose --profile webui up\n\n# コンパイル最適化を有効にしてWebUIを起動\nCOMPILE=1 docker compose --profile webui up\n\n# APIサーバーを起動\ndocker compose --profile server up\n\n# コンパイル最適化を有効にしてAPIサーバーを起動\nCOMPILE=1 docker compose --profile server up\n\n# CPUのみでのデプロイ\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### Docker Compose 環境変数\n\n環境変数を使用してデプロイメントをカスタマイズできます:\n\n```bash\n# .env ファイルの例\nBACKEND=cuda              # または cpu\nCOMPILE=1                 # コンパイル最適化を有効化\nGRADIO_PORT=7860         # WebUIのポート\nAPI_PORT=8080            # APIサーバーのポート\nUV_VERSION=0.8.15        # UVパッケージマネージャーのバージョン\n```\n\nこのコマンドはイメージをビルドし、コンテナを実行します。WebUIには`http://localhost:7860`で、APIサーバーには`http://localhost:8080`でアクセスできます。\n\n### 手動でのDockerビルド\n\nビルドプロセスをカスタマイズしたい上級者向け:\n\n```bash\n# CUDAサポート付きのWebUIイメージをビルド\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# CUDAサポート付きのAPIサーバーイメージをビルド\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg 
BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# CPUのみのイメージをビルド（マルチプラットフォーム対応）\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# 開発用イメージをビルド\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### ビルド引数\n\n- `BACKEND`: `cuda` または `cpu` (デフォルト: `cuda`)\n- `CUDA_VER`: CUDAバージョン (デフォルト: `12.6.0`)\n- `UV_EXTRA`: CUDA用のUV追加パッケージ (デフォルト: `cu126`)\n- `UBUNTU_VER`: Ubuntuバージョン (デフォルト: `24.04`)\n- `PY_VER`: Pythonバージョン (デフォルト: `3.12`)\n\n### ボリュームマウント\n\nどちらの方法でも、以下のディレクトリをマウントする必要があります:\n\n- `./checkpoints:/app/checkpoints` - モデルの重みファイル用ディレクトリ\n- `./references:/app/references` - 参照音声ファイル用ディレクトリ\n\n### 環境変数\n\n- `COMPILE=1` - `torch.compile`を有効にして推論を高速化（約10倍）\n- `GRADIO_SERVER_NAME=0.0.0.0` - WebUIサーバーのホスト\n- `GRADIO_SERVER_PORT=7860` - WebUIサーバーのポート\n- `API_SERVER_NAME=0.0.0.0` - APIサーバーのホスト\n- `API_SERVER_PORT=8080` - APIサーバーのポート\n\n!!! note\n    Dockerコンテナは、モデルの重みが`/app/checkpoints`にマウントされることを想定しています。コンテナを起動する前に、必要なモデルの重みをダウンロードしてください。\n\n!!! warning\n    GPUサポートにはNVIDIA Dockerランタイムが必要です。CPUのみでデプロイする場合は、`--gpus all`フラグを削除し、CPU用のイメージを使用してください。\n"
  },
  {
    "path": "docs/ko/finetune.md",
    "content": "# 미세 조정 (Fine-tuning)\n\n이 페이지를 열었다는 것은, 사전 훈련된 모델의 제로샷(zero-shot) 성능에 만족하지 못했다는 의미일 것입니다. 여러분의 데이터셋에서 더 나은 성능을 내도록 모델을 미세 조정하고 싶으실 겁니다.\n\n현재 버전에서는 'LLAMA' 부분만 미세 조정하면 됩니다.\n\n## LLAMA 미세 조정\n### 1. 데이터셋 준비\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\n데이터셋을 위 형식으로 변환하여 `data` 폴더 아래에 배치해야 합니다. 오디오 파일 확장자는 `.mp3`, `.wav` 또는 `.flac`일 수 있으며, 주석 파일 확장자는 `.lab`을 권장합니다.\n\n!!! info\n    `.lab` 주석 파일에는 오디오의 전사 텍스트만 포함하면 되며, 특별한 형식 요구사항은 없습니다. 예를 들어 `hi.mp3`의 내용이 \"안녕하세요, 안녕히 가세요.\"라면, `hi.lab` 파일에는 \"안녕하세요, 안녕히 가세요.\"라는 한 줄의 텍스트만 포함하면 됩니다.\n\n!!! warning\n    데이터셋에 음량 정규화를 적용하는 것이 좋습니다. 이를 위해 [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess)를 사용할 수 있습니다.\n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n### 2. 시맨틱 토큰 일괄 추출\n\nVQGAN 가중치를 다운로드했는지 확인하세요. 그렇지 않은 경우 다음 명령을 실행하세요.\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\n그런 다음 다음 명령을 실행하여 시맨틱 토큰을 추출할 수 있습니다.\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n```\n\n!!! note\n    `--num-workers`와 `--batch-size`를 조정하여 추출 속도를 높일 수 있지만, GPU 메모리 한도를 초과하지 않도록 주의하세요.\n\n이 명령은 `data` 디렉토리에 `.npy` 파일을 생성합니다. 결과는 다음과 같습니다.\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy\n```\n\n### 3. 
데이터셋을 protobuf로 패킹하기\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\n명령 실행이 완료되면 `data` 디렉토리에서 `protos` 파일을 볼 수 있어야 합니다.\n\n### 4. 마지막으로, LoRA로 미세 조정하기\n\n마찬가지로, `LLAMA` 가중치를 다운로드했는지 확인하세요. 그렇지 않은 경우 다음 명령을 실행하세요.\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\n마지막으로, 다음 명령을 실행하여 미세 조정을 시작할 수 있습니다.\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note\n    `fish_speech/configs/text2semantic_finetune.yaml` 파일을 수정하여 `batch_size`, `gradient_accumulation_steps` 등 훈련 매개변수를 GPU 메모리에 맞게 조정할 수 있습니다.\n\n!!! note\n    Windows 사용자의 경우, `trainer.strategy.process_group_backend=gloo`를 사용하여 `nccl` 관련 문제를 피할 수 있습니다.\n\n훈련이 완료되면 [추론](inference.md) 섹션을 참조하여 모델을 테스트할 수 있습니다.\n\n!!! info\n    기본 설정에서는 모델이 화자의 발음 방식만 학습하고 음색은 학습하지 않습니다. 음색 안정성을 보장하려면 여전히 프롬프트를 사용해야 합니다.\n    음색을 학습시키고 싶다면 훈련 스텝 수를 늘리되, 이는 과적합(overfitting)으로 이어질 수 있습니다.\n\n훈련 후, 추론을 수행하기 전에 LoRA 가중치를 일반 가중치로 변환해야 합니다.\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/openaudio-s1-mini \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/openaudio-s1-mini-yth-lora/\n```\n\n!!! note\n    다른 체크포인트를 시도해 볼 수도 있습니다. 요구 사항을 충족하는 가장 이른 체크포인트를 사용하는 것이 좋습니다. 이러한 체크포인트는 보통 OOD(분포 외) 데이터에서 더 나은 성능을 보입니다.\n"
  },
  {
    "path": "docs/ko/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><a href=\"../en/\">English</a> | <a href=\"../zh/\">简体中文</a> | <a href=\"../pt/\">Portuguese</a> | <a href=\"../ja/\">日本語</a> | <strong>한국어</strong> | <a href=\"../ar/\">العربية</a></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"라이선스 공지\"\n    이 코드베이스 및 관련 모델 가중치는 **FISH AUDIO RESEARCH LICENSE** 하에 릴리스되었습니다. 자세한 내용은 [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE)를 참조하십시오.\n\n!!! warning \"법적 면책 조항\"\n    코드베이스의 불법적인 사용에 대해 당사는 어떠한 책임도 지지 않습니다. DMCA 및 기타 관련 법률에 관한 현지 규정을 참조하십시오.\n\n## 빠른 시작\n\n### 문서로 바로 시작하기\n\nFish Audio S2 공식 문서입니다. 아래 링크에서 바로 시작할 수 있습니다.\n\n- [설치](https://speech.fish.audio/ko/install/)\n- [커맨드라인 추론](https://speech.fish.audio/ko/inference/)\n- [WebUI 추론](https://speech.fish.audio/ko/inference/)\n- [서버 추론](https://speech.fish.audio/ko/server/)\n- [Docker 설정](https://speech.fish.audio/ko/install/)\n\n> [!IMPORTANT]\n> **SGLang 서버는 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md)를 참고하세요.**\n\n### LLM Agent 가이드\n\n```\nhttps://speech.fish.audio/ko/install/ 문서를 따라 Fish Audio S2를 설치하고 구성하세요.\n```\n\n## Fish Audio S2\n**오픈 소스와 클로즈드 소스 모두에서 가장 뛰어난 텍스트 음성 변환 시스템**\n\nFish Audio S2는 [Fish Audio](https://fish.audio/)가 개발한 최신 모델입니다. 
약 50개 언어, 1,000만 시간 이상의 오디오 데이터로 학습되었고, 강화학습 정렬과 Dual-Autoregressive 아키텍처를 결합해 자연스럽고 사실적이며 감정 표현이 풍부한 음성을 생성합니다.\n\nS2는 `[laugh]`, `[whispers]`, `[super happy]` 같은 자연어 태그를 사용해 운율과 감정을 문장 내부에서 세밀하게 제어할 수 있으며, 멀티 화자/멀티 턴 생성도 네이티브로 지원합니다.\n\n실시간 데모는 [Fish Audio 웹사이트](https://fish.audio/)에서, 자세한 내용은 [블로그 글](https://fish.audio/blog/fish-audio-open-sources-s2/)과 [기술 보고서](https://arxiv.org/abs/2603.08823)에서 확인할 수 있습니다.\n\n### 모델 변형\n\n| 모델 | 크기 | 가용성 | 설명 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B 매개변수 | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 최고 수준의 품질과 안정성을 제공하는 풀기능 플래그십 모델 |\n\n모델 상세는 [기술 보고서](https://arxiv.org/abs/2411.01156)를 참고하세요.\n\n## 벤치마크 결과\n\n| 벤치마크 | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (중국어) | **0.54%** (전체 최고) |\n| Seed-TTS Eval — WER (영어) | **0.99%** (전체 최고) |\n| Audio Turing Test (지시 포함) | **0.515** 사후 평균 |\n| EmergentTTS-Eval — 승률 | **81.88%** (전체 최고) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 품질 | **4.51 / 5.0** |\n| 다국어 (MiniMax Testset) — 최고 WER | **24개 언어 중 11개** |\n| 다국어 (MiniMax Testset) — 최고 SIM | **24개 언어 중 17개** |\n\nSeed-TTS Eval에서 S2는 클로즈드 소스 시스템을 포함한 전체 비교 모델 중 가장 낮은 WER를 기록했습니다: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). Audio Turing Test에서는 0.515를 기록해 Seed-TTS (0.417) 대비 24%, MiniMax-Speech (0.387) 대비 33% 높았습니다. EmergentTTS-Eval에서는 파라언어 표현(91.61%), 의문문(84.41%), 구문 복잡도(83.39%)에서 특히 강한 성능을 보였습니다.\n\n## 주요 특징\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### 자연어 기반 세밀한 인라인 제어\n\nFish Audio S2는 텍스트의 특정 단어 또는 구문 위치에 자연어 지시를 직접 삽입해 음성 생성을 국소적으로 제어할 수 있습니다. 고정된 사전 정의 태그에 의존하는 대신, S2는 [whisper in small voice], [professional broadcast tone], [pitch up] 같은 자유 형식 텍스트 설명을 받아 단어 수준의 개방형 표현 제어를 지원합니다.\n\n### Dual-Autoregressive 아키텍처\n\nS2는 decoder-only Transformer와 RVQ 기반 오디오 코덱(10 codebooks, 약 21 Hz 프레임레이트)을 결합합니다. 
Dual-AR은 생성 과정을 두 단계로 나눕니다.\n\n- **Slow AR**: 시간축을 따라 동작하며 주 semantic codebook을 예측\n- **Fast AR**: 각 시점에서 나머지 9개 residual codebook을 생성해 세밀한 음향 디테일을 복원\n\n이 비대칭 설계(시간축 4B 파라미터, 깊이축 400M 파라미터)는 음질을 유지하면서 추론 효율을 높입니다.\n\n### 강화학습 정렬\n\nS2는 후학습 정렬을 위해 Group Relative Policy Optimization(GRPO)을 사용합니다. 학습 데이터 필터링/라벨링에 쓰인 동일한 모델을 RL 보상 모델로 재사용해, 사전학습 데이터 분포와 후학습 목표 간의 분포 불일치를 줄였습니다. 보상 신호는 의미 정확도, 지시 준수도, 음향 선호 점수, 음색 유사도를 함께 반영합니다.\n\n### SGLang 기반 프로덕션 스트리밍\n\nDual-AR 구조는 표준 자기회귀 LLM과 구조적으로 동형이기 때문에, S2는 SGLang의 LLM 서빙 최적화를 그대로 활용합니다. 예: continuous batching, paged KV cache, CUDA graph replay, RadixAttention 기반 prefix caching.\n\nNVIDIA H200 단일 GPU 기준:\n\n- **실시간 계수(RTF):** 0.195\n- **첫 오디오 출력까지 시간:** 약 100 ms\n- **처리량:** RTF 0.5 미만 유지 시 3,000+ acoustic tokens/s\n\n### 다국어 지원\n\nFish Audio S2는 음소나 언어별 전처리 없이 고품질 다국어 텍스트 음성 변환을 지원합니다. 포함 사항:\n\n**영어, 중국어, 일본어, 한국어, 아랍어, 독일어, 프랑스어...**\n\n**그리고 더 많이!**\n\n목록은 계속 확장되고 있습니다. 최신 릴리스는 [Fish Audio](https://fish.audio/)를 확인하세요.\n\n### 네이티브 멀티 화자 생성\n\n<img src=\"../assets/chattemplate.png\" width=200%>\n\nFish Audio S2는 사용자가 여러 화자가 포함된 참조 오디오를 업로드할 수 있도록 하며, 모델은 `<|speaker:i|>` 토큰을 통해 각 화자의 특징을 처리합니다. 그런 다음 화자 ID 토큰으로 모델의 성능을 제어하여 한 번의 생성으로 여러 화자를 포함할 수 있습니다. 이전처럼 각 화자마다 별도로 참조 오디오를 업로드하고 음성을 생성할 필요가 없습니다.\n\n### 멀티 턴 대화 생성\n\n모델 컨텍스트의 확장 덕분에 이제 이전 정보를 활용하여 후속 생성 콘텐츠의 표현력을 높이고 콘텐츠의 자연스러움을 향상시킬 수 있습니다.\n\n### 빠른 음성 복제\n\nFish Audio S2는 짧은 참조 샘플(일반적으로 10-30초)을 사용하여 정확한 음성 복제를 지원합니다. 
모델은 음색, 말하기 스타일 및 감정적 경향을 캡처하여 추가 미세 조정 없이 사실적이고 일관된 복제 음성을 생성합니다.\nSGLang 서버 사용은 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) 를 참고하세요.\n\n---\n\n## 크레딧\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 기술 보고서\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/ko/inference.md",
    "content": "# 추론\n\nFish Audio S2 모델은 큰 비디오 메모리(VRAM)가 필요합니다. 추론을 위해 최소 24GB 이상의 GPU를 사용하는 것을 권장합니다.\n\n## 가중치 다운로드\n\n먼저 모델 가중치를 다운로드해야 합니다:\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## 명령줄 추론\n\n!!! note\n    모델이 음색을 무작위로 선택하게 하려면 이 단계를 건너뛸 수 있습니다.\n\n### 1. 참조 오디오에서 VQ 토큰 가져오기\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\n`fake.npy`와 `fake.wav` 파일이 생성됩니다.\n\n### 2. 텍스트에서 Semantic 토큰 생성:\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"변환하려는 텍스트\" \\\n    --prompt-text \"참조 텍스트\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\n이 명령은 작업 디렉토리에 `codes_N` 파일을 생성합니다. 여기서 N은 0부터 시작하는 정수입니다.\n\n!!! note\n    더 빠른 추론을 위해 CUDA 커널을 병합하는 `--compile`을 사용하고 싶을 수 있지만, 당사의 sglang 추론 가속 최적화를 사용하는 것을 더 권장합니다.\n    마찬가지로 가속을 사용할 계획이 없다면 `--compile` 매개변수를 주석 처리할 수 있습니다.\n\n!!! info\n    bf16을 지원하지 않는 GPU의 경우 `--half` 매개변수를 사용해야 할 수 있습니다.\n\n### 3. 시맨틱 토큰에서 음성 생성:\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\"\n```\n\n이후 `fake.wav` 파일을 얻게 됩니다.\n\n## WebUI 추론\n\n### 1. Gradio WebUI\n\n호환성을 유지하기 위해 기존의 Gradio WebUI를 보존하고 있습니다.\n\n```bash\npython tools/run_webui.py # 가속이 필요한 경우 --compile\n```\n\n### 2. Awesome WebUI\n\nAwesome WebUI는 TypeScript 기반으로 개발된 현대적인 웹 인터페이스로, 더 풍부한 기능과 향상된 사용자 경험을 제공합니다.\n\n**WebUI 빌드:**\n\n로컬 또는 서버에 Node.js와 npm이 설치되어 있어야 합니다.\n\n1. `awesome_webui` 디렉토리로 이동합니다:\n   ```bash\n   cd awesome_webui\n   ```\n2. 의존성 설치:\n   ```bash\n   npm install\n   ```\n3. WebUI 빌드:\n   ```bash\n   npm run build\n   ```\n\n**백엔드 서버 실행:**\n\nWebUI 빌드가 완료되면 프로젝트 루트로 돌아가 API 서버를 실행합니다:\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**접속:**\n\n서버가 실행된 후 브라우저를 통해 다음 주소로 접속하면 체험할 수 있습니다:\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/ko/install.md",
    "content": "## 요구 사양\n\n- GPU 메모리: 24GB (추론 시)\n- 시스템: Linux, WSL\n\n## 시스템 설정\n\nFish Audio S2는 다양한 설치 방법을 지원합니다. 자신의 개발 환경에 가장 적합한 방법을 선택하세요.\n\n**사전 요구사항**: 오디오 처리를 위한 시스템 의존성을 설치합니다:\n``` bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU 버전 설치 (CUDA 버전 선택: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# CPU 버전만 설치\npip install -e .[cpu]\n\n# 기본 설치 (PyTorch 기본 인덱스 사용)\npip install -e .\n\n# pyaudio 설치 중 오류가 발생하면 다음 명령을 사용해 보세요:\n# conda install pyaudio\n# 그런 다음 pip install -e . 를 다시 실행하세요\n```\n\n### UV\n\nUV는 더 빠른 의존성 해결 및 설치를 제공합니다:\n\n```bash\n# GPU 버전 설치 (CUDA 버전 선택: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# CPU 버전만 설치\nuv sync --python 3.12 --extra cpu\n```\n### Intel Arc XPU 지원\n\nIntel Arc GPU 사용자는 다음을 통해 XPU 지원을 설치하세요:\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# 필요한 C++ 표준 라이브러리 설치\nconda install libstdcxx -c conda-forge\n\n# Intel XPU를 지원하는 PyTorch 설치\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Fish Speech 설치\npip install -e .\n```\n\n!!! warning\n    `compile` 옵션은 Windows와 macOS에서 지원되지 않습니다. 컴파일을 활성화하여 실행하려면 Triton을 직접 설치해야 합니다.\n\n\n## Docker 설정\n\nFish Audio S2 시리즈 모델은 다양한 요구에 부응하기 위해 여러 Docker 배포 옵션을 제공합니다. Docker Hub의 사전 빌드된 이미지를 사용하거나, Docker Compose로 로컬에서 빌드하거나, 수동으로 사용자 정의 이미지를 빌드할 수 있습니다.\n\nWebUI와 API 서버 모두에 대해 GPU(기본값 CUDA 12.6) 및 CPU 버전의 Docker 이미지를 제공합니다. Docker Hub의 사전 빌드된 이미지를 사용하거나, Docker Compose로 로컬에서 빌드하거나, 수동으로 사용자 정의 이미지를 빌드할 수 있습니다. 로컬에서 빌드하려면 아래 지침을 따르세요. 
사전 빌드된 이미지를 사용하려면 [추론 가이드](inference.md)를 직접 참조하세요.\n\n### 사전 요구사항\n\n- Docker 및 Docker Compose 설치\n- NVIDIA Docker 런타임 설치 (GPU 지원용)\n- CUDA 추론을 위한 최소 24GB의 GPU 메모리\n\n### Docker Compose 사용\n\n개발 또는 사용자 정의를 위해 Docker Compose를 사용하여 로컬에서 빌드하고 실행할 수 있습니다:\n\n```bash\n# 먼저 리포지토리를 클론합니다\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# CUDA로 WebUI 시작\ndocker compose --profile webui up\n\n# 컴파일 최적화로 WebUI 시작\nCOMPILE=1 docker compose --profile webui up\n\n# API 서버 시작\ndocker compose --profile server up\n\n# 컴파일 최적화로 API 서버 시작\nCOMPILE=1 docker compose --profile server up\n\n# CPU 전용 배포\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### Docker Compose 환경 변수\n\n환경 변수를 사용하여 배포를 사용자 정의할 수 있습니다:\n\n```bash\n# .env 파일 예시\nBACKEND=cuda              # 또는 cpu\nCOMPILE=1                 # 컴파일 최적화 활성화\nGRADIO_PORT=7860         # WebUI 포트\nAPI_PORT=8080            # API 서버 포트\nUV_VERSION=0.8.15        # UV 패키지 관리자 버전\n```\n\n이 명령은 이미지를 빌드하고 컨테이너를 실행합니다. WebUI는 `http://localhost:7860`에서, API 서버는 `http://localhost:8080`에서 접근할 수 있습니다.\n\n### 수동 Docker 빌드\n\n빌드 프로세스를 사용자 정의하려는 고급 사용자를 위해:\n\n```bash\n# CUDA를 지원하는 WebUI 이미지 빌드\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# CUDA를 지원하는 API 서버 이미지 빌드\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# CPU 전용 이미지 빌드 (멀티 플랫폼 지원)\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# 개발용 이미지 빌드\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n  
  --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### 빌드 인자\n\n- `BACKEND`: `cuda` 또는 `cpu` (기본값: `cuda`)\n- `CUDA_VER`: CUDA 버전 (기본값: `12.6.0`)\n- `UV_EXTRA`: CUDA용 UV 추가 패키지 (기본값: `cu126`)\n- `UBUNTU_VER`: Ubuntu 버전 (기본값: `24.04`)\n- `PY_VER`: Python 버전 (기본값: `3.12`)\n\n### 볼륨 마운트\n\n두 방법 모두 다음 디렉토리를 마운트해야 합니다:\n\n- `./checkpoints:/app/checkpoints` - 모델 가중치 디렉토리\n- `./references:/app/references` - 참조 오디오 파일 디렉토리\n\n### 환경 변수\n\n- `COMPILE=1` - `torch.compile`을 활성화하여 추론 속도 향상 (약 10배)\n- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI 서버 호스트\n- `GRADIO_SERVER_PORT=7860` - WebUI 서버 포트\n- `API_SERVER_NAME=0.0.0.0` - API 서버 호스트\n- `API_SERVER_PORT=8080` - API 서버 포트\n\n!!! note\n    Docker 컨테이너는 모델 가중치가 `/app/checkpoints`에 마운트될 것으로 예상합니다. 컨테이너를 시작하기 전에 필요한 모델 가중치를 다운로드했는지 확인하세요.\n\n!!! warning\n    GPU 지원에는 NVIDIA Docker 런타임이 필요합니다. CPU 전용 배포의 경우 `--gpus all` 플래그를 제거하고 CPU 이미지를 사용하세요.\n"
  },
  {
    "path": "docs/pt/finetune.md",
    "content": "# Ajuste Fino (Fine-tuning)\n\nObviamente, ao abrir esta página, você não estava satisfeito com o desempenho do modelo pré-treinado em modo zero-shot. Você deseja fazer um ajuste fino em um modelo para melhorar seu desempenho em seu conjunto de dados.\n\nNa versão atual, você só precisa fazer o ajuste fino da parte 'LLAMA'.\n\n## Ajuste Fino do LLAMA\n### 1. Prepare o conjunto de dados\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\nVocê precisa converter seu conjunto de dados para o formato acima e colocá-lo no diretório `data`. O arquivo de áudio pode ter as extensões `.mp3`, `.wav` ou `.flac`, e o arquivo de anotação deve ter a extensão `.lab`.\n\n!!! info\n    O arquivo de anotação `.lab` precisa conter apenas a transcrição do áudio, sem necessidade de formatação especial. Por exemplo, se `hi.mp3` contiver \"Olá, adeus.\", então o arquivo `hi.lab` conterá uma única linha de texto: \"Olá, adeus.\".\n\n!!! warning\n    Recomenda-se aplicar a normalização de volume (loudness) ao conjunto de dados. Você pode usar o [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) para fazer isso.\n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n### 2. Extração em lote de tokens semânticos\n\nCertifique-se de que você baixou os pesos do VQGAN. Se não, execute o seguinte comando:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nEm seguida, você pode executar o seguinte comando para extrair os tokens semânticos:\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/openaudio-s1-mini/codec.pth\"\n```\n\n!!! 
note\n    Você pode ajustar `--num-workers` e `--batch-size` para aumentar a velocidade de extração, mas certifique-se de não exceder o limite de memória da sua GPU.\n\nEste comando criará arquivos `.npy` no diretório `data`, como mostrado abaixo:\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy\n```\n\n### 3. Empacote o conjunto de dados em protobuf\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\nApós a conclusão da execução do comando, você deverá ver o arquivo `protos` no diretório `data`.\n\n### 4. Finalmente, ajuste fino com LoRA\n\nDa mesma forma, certifique-se de que você baixou os pesos do `LLAMA`. Se não, execute o seguinte comando:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\nFinalmente, você pode iniciar o ajuste fino executando o seguinte comando:\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note\n    Você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se adequar à memória da sua GPU, modificando `fish_speech/configs/text2semantic_finetune.yaml`.\n\n!!! note\n    Para usuários do Windows, você pode usar `trainer.strategy.process_group_backend=gloo` para evitar problemas com `nccl`.\n\nApós o treinamento ser concluído, você pode consultar a seção de [inferência](inference.md) para testar seu modelo.\n\n!!! info\n    Por padrão, o modelo aprenderá apenas os padrões de fala do locutor e não o timbre. 
Você ainda precisará usar prompts para garantir a estabilidade do timbre.\n    Se você quiser aprender o timbre, pode aumentar o número de passos de treinamento, mas isso pode levar a um sobreajuste (overfitting).\n\nApós o treinamento, você precisa converter os pesos do LoRA para pesos regulares antes de realizar a inferência.\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/openaudio-s1-mini \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/openaudio-s1-mini-yth-lora/\n```\n\n!!! note\n    Você também pode tentar outros checkpoints. Sugerimos usar o checkpoint mais antigo que atenda aos seus requisitos, pois eles geralmente têm um desempenho melhor em dados fora de distribuição (OOD).\n"
  },
  {
    "path": "docs/pt/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><a href=\"../en/\">English</a> | <a href=\"../zh/\">简体中文</a> | <strong>Portuguese</strong> | <a href=\"../ja/\">日本語</a> | <a href=\"../ko/\">한국어</a> | <a href=\"../ar/\">العربية</a></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"Aviso de Licença\"\n    Este repositório e todos os pesos de modelo associados são lançados sob a **FISH AUDIO RESEARCH LICENSE**. Consulte [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE) para mais detalhes.\n\n!!! warning \"Isenção de Responsabilidade Legal\"\n    Não nos responsabilizamos por qualquer uso ilegal da base de códigos. Consulte as regulamentações locais sobre DMCA e outras leis relacionadas.\n\n## Início Rápido\n\n### Comece pela documentação\n\nEsta é a documentação oficial do Fish Audio S2. Você pode começar por aqui:\n\n- [Instalação](https://speech.fish.audio/pt/install/)\n- [Inferência por Linha de Comando](https://speech.fish.audio/pt/inference/)\n- [Inferência WebUI](https://speech.fish.audio/pt/inference/)\n- [Inferência via Servidor](https://speech.fish.audio/pt/server/)\n- [Configuração Docker](https://speech.fish.audio/pt/install/)\n\n> [!IMPORTANT]\n> **Para servidor com SGLang, consulte o [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).**\n\n### Guia para agentes LLM\n\n```\nInstale e configure o Fish Audio S2 seguindo as instruções em https://speech.fish.audio/pt/install/ .\n```\n\n## Fish Audio S2\n**O melhor sistema de conversão de texto em fala entre código aberto e código fechado**\n\nO Fish Audio S2 é o modelo mais recente da [Fish Audio](https://fish.audio/). 
Treinado com mais de 10 milhões de horas de áudio em cerca de 50 idiomas, o S2 combina alinhamento por reforço com uma arquitetura Dual-Autoregressive para gerar fala natural, realista e emocionalmente expressiva.\n\nO S2 permite controle fino de prosódia e emoção dentro da própria frase com tags em linguagem natural, como `[laugh]`, `[whispers]` e `[super happy]`, além de oferecer suporte nativo a múltiplos falantes e múltiplos turnos.\n\nVisite o [site da Fish Audio](https://fish.audio/) para demonstrações ao vivo. Leia a [postagem no blog](https://fish.audio/blog/fish-audio-open-sources-s2/) e o [relatório técnico](https://arxiv.org/abs/2603.08823) para mais detalhes.\n\n### Variantes do Modelo\n\n| Modelo | Tamanho | Disponibilidade | Descrição |\n|------|------|-------------|-------------|\n| S2-Pro | 4B parâmetros | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Modelo carro-chefe completo com máxima qualidade e estabilidade |\n\nMais detalhes podem ser encontrados no [relatório técnico](https://arxiv.org/abs/2411.01156).\n\n## Resultados de Benchmark\n\n| Benchmark | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER (Chinês) | **0.54%** (melhor geral) |\n| Seed-TTS Eval — WER (Inglês) | **0.99%** (melhor geral) |\n| Audio Turing Test (com instrução) | **0.515** média a posteriori |\n| EmergentTTS-Eval — Taxa de vitória | **81.88%** (maior geral) |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — Qualidade | **4.51 / 5.0** |\n| Multilíngue (MiniMax Testset) — Melhor WER | **11 de 24** idiomas |\n| Multilíngue (MiniMax Testset) — Melhor SIM | **17 de 24** idiomas |\n\nNo Seed-TTS Eval, o S2 obteve o menor WER entre todos os modelos avaliados, incluindo sistemas fechados: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90) e Seed-TTS (1.12/2.25). No Audio Turing Test, o valor 0.515 supera o Seed-TTS (0.417) em 24% e o MiniMax-Speech (0.387) em 33%. 
No EmergentTTS-Eval, o S2 se destacou especialmente em paralinguística (91.61%), perguntas (84.41%) e complexidade sintática (83.39%).\n\n## Destaques\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### Controle Inline Refinado via Linguagem Natural\n\nO Fish Audio S2 permite controle localizado da geração de fala ao incorporar instruções em linguagem natural diretamente em posições específicas de palavras ou frases no texto. Em vez de depender de um conjunto fixo de tags predefinidas, o S2 aceita descrições textuais livres, como [whisper in small voice], [professional broadcast tone] ou [pitch up], permitindo controle de expressão aberto no nível da palavra.\n\n### Arquitetura Dual-Autoregressive\n\nO S2 é baseado em um transformer apenas decodificador, combinado com um codec de áudio RVQ (10 codebooks, ~21 Hz de taxa de quadros). A arquitetura Dual-AR divide a geração em duas etapas:\n\n- **Slow AR** opera no eixo temporal e prevê o codebook semântico principal.\n- **Fast AR** gera os 9 codebooks residuais restantes em cada passo de tempo, reconstruindo detalhes acústicos finos.\n\nEsse desenho assimétrico (4B parâmetros no eixo temporal e 400M no eixo de profundidade) mantém a inferência eficiente sem sacrificar fidelidade de áudio.\n\n### Alinhamento por Reforço\n\nO S2 usa Group Relative Policy Optimization (GRPO) no pós-treinamento. Os mesmos modelos usados para filtrar e anotar dados de treino são reutilizados diretamente como modelos de recompensa no RL, eliminando o desalinhamento de distribuição entre os dados de pré-treinamento e os objetivos de pós-treinamento. 
O sinal de recompensa combina precisão semântica, aderência à instrução, preferência acústica e similaridade de timbre.\n\n### Streaming em Produção com SGLang\n\nComo a arquitetura Dual-AR é estruturalmente isomórfica a LLMs autoregressivos padrão, o S2 herda diretamente as otimizações nativas de serving do SGLang, incluindo continuous batching, paged KV cache, CUDA graph replay e prefix caching com RadixAttention.\n\nEm uma única NVIDIA H200:\n\n- **RTF (Real-Time Factor):** 0.195\n- **Tempo até o primeiro áudio:** ~100 ms\n- **Throughput:** mais de 3.000 acoustic tokens/s mantendo RTF abaixo de 0.5\n\n### Suporte Multilíngue\n\nO Fish Audio S2 oferece suporte a conversão de texto em fala multilíngue de alta qualidade sem a necessidade de fonemas ou processamento específico de idioma. Incluindo:\n\n**Inglês, Chinês, Japonês, Coreano, Árabe, Alemão, Francês...**\n\n**E MUITO MAIS!**\n\nA lista está em constante expansão, verifique o [Fish Audio](https://fish.audio/) para os lançamentos mais recentes.\n\n### Geração Nativa de Múltiplos Falantes\n\n<img src=\"../assets/chattemplate.png\" width=200%>\n\nO Fish Audio S2 permite enviar um áudio de referência com vários falantes; o modelo processa as características de cada voz por meio do token `<|speaker:i|>`. Depois, você controla o comportamento do modelo com o token de ID do falante, permitindo incluir várias vozes em uma única geração. Assim, não é mais necessário subir um áudio de referência separado para cada falante.\n\n### Geração de Múltiplos Turnos\n\nGraças à extensão do contexto do modelo, nosso modelo agora pode usar informações anteriores para melhorar a expressividade e a naturalidade dos conteúdos gerados subsequentemente.\n\n### Clonagem de Voz Rápida\n\nO Fish Audio S2 suporta clonagem de voz precisa usando uma pequena amostra de referência (tipicamente de 10 a 30 segundos). 
O modelo captura o timbre, o estilo de fala e as tendências emocionais, produzindo vozes clonadas realistas e consistentes sem ajuste fino adicional.\nPara usar o servidor SGLang, consulte [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) .\n\n---\n\n## Créditos\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## Relatório Técnico\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/pt/inference.md",
    "content": "# Inferência\n\nO modelo Fish Audio S2 requer uma grande quantidade de VRAM. Recomendamos o uso de uma GPU com pelo menos 24GB para inferência.\n\n## Baixar Pesos\n\nPrimeiro, você precisa baixar os pesos do modelo:\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## Inferência por Linha de Comando\n\n!!! note\n    Se você planeja deixar o modelo escolher aleatoriamente um timbre de voz, pode pular esta etapa.\n\n### 1. Obter tokens VQ do áudio de referência\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\nVocê deve obter um `fake.npy` e um `fake.wav`.\n\n### 2. Gerar tokens Semânticos a partir do texto:\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"O texto que você deseja converter\" \\\n    --prompt-text \"Seu texto de referência\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\nEste comando criará um arquivo `codes_N` no diretório de trabalho, onde N é um número inteiro começando em 0.\n\n!!! note\n    Você pode querer usar `--compile` para fundir kernels CUDA para uma inferência mais rápida. No entanto, recomendamos usar nossa otimização de aceleração de inferência sglang.\n    Da mesma forma, se você não planeja usar aceleração, pode comentar o parâmetro `--compile`.\n\n!!! info\n    Para GPUs que não suportam bf16, você pode precisar usar o parâmetro `--half`.\n\n### 3. Gerar vocais a partir de tokens semânticos:\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\"\n```\n\nDepois disso, você obterá um arquivo `fake.wav`.\n\n## Inferência WebUI\n\n### 1. Gradio WebUI\n\nPara manter a compatibilidade, mantemos a interface Gradio WebUI anterior.\n\n```bash\npython tools/run_webui.py # --compile se você precisar de aceleração\n```\n\n### 2. 
Awesome WebUI\n\nA Awesome WebUI é uma interface web moderna baseada em TypeScript, oferecendo funcionalidades mais ricas e uma melhor experiência do usuário.\n\n**Construir a WebUI:**\n\nVocê precisa ter o Node.js e o npm instalados em seu computador local ou servidor.\n\n1. Entre no diretório `awesome_webui`:\n   ```bash\n   cd awesome_webui\n   ```\n2. Instale as dependências:\n   ```bash\n   npm install\n   ```\n3. Construa a WebUI:\n   ```bash\n   npm run build\n   ```\n\n**Iniciar o Servidor Backend:**\n\nApós a construção da WebUI, retorne ao diretório raiz do projeto e inicie o servidor API:\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**Acesso:**\n\nApós o servidor ser iniciado, você pode acessá-lo através do navegador no seguinte endereço:\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/pt/install.md",
    "content": "## Requisitos\n\n- Memória da GPU: 24GB (Inferência)\n- Sistema: Linux, WSL\n\n## Configuração do Sistema\n\nO Fish Audio S2 suporta múltiplos métodos de instalação. Escolha o que melhor se adapta ao seu ambiente de desenvolvimento.\n\n**Pré-requisitos**: Instale as dependências de sistema para processamento de áudio:\n``` bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Instalação com GPU (escolha a sua versão do CUDA: cu126, cu128, cu129)\npip install -e .[cu129]\n\n# Instalação apenas para CPU\npip install -e .[cpu]\n\n# Instalação padrão (usa o índice padrão do PyTorch)\npip install -e .\n\n# Se encontrar um erro durante a instalação devido ao pyaudio, considere usar o seguinte comando:\n# conda install pyaudio\n# De seguida, execute pip install -e . novamente\n```\n\n### UV\n\nO UV oferece uma resolução e instalação de dependências mais rápidas:\n\n```bash\n# Instalação com GPU (escolha a sua versão do CUDA: cu126, cu128, cu129)\nuv sync --python 3.12 --extra cu129\n\n# Instalação apenas para CPU\nuv sync --python 3.12 --extra cpu\n```\n### Suporte para Intel Arc XPU\n\nPara utilizadores de GPUs Intel Arc, instale o suporte XPU da seguinte forma:\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# Instalar a biblioteca padrão C++ necessária\nconda install libstdcxx -c conda-forge\n\n# Instalar o PyTorch com suporte para Intel XPU\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# Instalar o Fish Speech\npip install -e .\n```\n\n!!! warning\n    A opção `compile` não é suportada no Windows e macOS. Se desejar executar com compilação, terá de instalar o Triton manualmente.\n\n\n## Configuração do Docker\n\nO modelo da série Fish Audio S2 oferece múltiplas opções de implementação com Docker para satisfazer diferentes necessidades. 
Pode usar imagens pré-construídas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas.\n\nFornecemos imagens Docker para a WebUI e o servidor API, tanto para GPU (CUDA 12.6 por defeito) como para CPU. Pode usar as imagens pré-construídas do Docker Hub, construir localmente com o Docker Compose, ou construir manualmente imagens personalizadas. Se quiser construir localmente, siga as instruções abaixo. Se apenas quiser usar as imagens pré-construídas, siga diretamente o [guia de inferência](inference.md).\n\n### Pré-requisitos\n\n- Docker e Docker Compose instalados\n- NVIDIA Docker runtime instalado (para suporte de GPU)\n- Pelo menos 24GB de memória de GPU para inferência com CUDA\n\n### Usar o Docker Compose\n\nPara desenvolvimento ou personalização, pode usar o Docker Compose para construir e executar localmente:\n\n```bash\n# Primeiro, clone o repositório\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# Iniciar a WebUI com CUDA\ndocker compose --profile webui up\n\n# Iniciar a WebUI com otimização de compilação\nCOMPILE=1 docker compose --profile webui up\n\n# Iniciar o servidor API\ndocker compose --profile server up\n\n# Iniciar o servidor API com otimização de compilação\nCOMPILE=1 docker compose --profile server up\n\n# Implementação apenas com CPU\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### Variáveis de Ambiente para o Docker Compose\n\nPode personalizar a implementação usando variáveis de ambiente:\n\n```bash\n# Exemplo de ficheiro .env\nBACKEND=cuda              # ou cpu\nCOMPILE=1                 # Ativar otimização de compilação\nGRADIO_PORT=7860         # Porta da WebUI\nAPI_PORT=8080            # Porta do servidor API\nUV_VERSION=0.8.15        # Versão do gestor de pacotes UV\n```\n\nO comando irá construir a imagem e executar o contentor. 
Pode aceder à WebUI em `http://localhost:7860` e ao servidor API em `http://localhost:8080`.\n\n### Construção Manual com Docker\n\nPara utilizadores avançados que desejam personalizar o processo de construção:\n\n```bash\n# Construir imagem da WebUI com suporte CUDA\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# Construir imagem do servidor API com suporte CUDA\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# Construir imagem apenas para CPU (suporta multiplataforma)\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# Construir imagem de desenvolvimento\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### Argumentos de Construção\n\n- `BACKEND`: `cuda` ou `cpu` (padrão: `cuda`)\n- `CUDA_VER`: Versão do CUDA (padrão: `12.6.0`)\n- `UV_EXTRA`: Pacote extra do UV para CUDA (padrão: `cu126`)\n- `UBUNTU_VER`: Versão do Ubuntu (padrão: `24.04`)\n- `PY_VER`: Versão do Python (padrão: `3.12`)\n\n### Montagem de Volumes\n\nAmbos os métodos requerem a montagem dos seguintes diretórios:\n\n- `./checkpoints:/app/checkpoints` - Diretório dos pesos do modelo\n- `./references:/app/references` - Diretório dos ficheiros de áudio de referência\n\n### Variáveis de Ambiente\n\n- `COMPILE=1` - Ativa o `torch.compile` para uma inferência mais rápida (cerca de 10x)\n- `GRADIO_SERVER_NAME=0.0.0.0` - Anfitrião do servidor WebUI\n- `GRADIO_SERVER_PORT=7860` - 
Porta do servidor WebUI\n- `API_SERVER_NAME=0.0.0.0` - Anfitrião do servidor API\n- `API_SERVER_PORT=8080` - Porta do servidor API\n\n!!! note\n    Os contentores Docker esperam que os pesos do modelo sejam montados em `/app/checkpoints`. Certifique-se de que descarregou os pesos do modelo necessários antes de iniciar os contentores.\n\n!!! warning\n    O suporte para GPU requer o NVIDIA Docker runtime. Para implementações apenas com CPU, remova a flag `--gpus all` e use as imagens de CPU.\n"
  },
  {
    "path": "docs/requirements.txt",
    "content": "mkdocs-material\nmkdocs-static-i18n[material]\nmkdocs[i18n]\n"
  },
  {
    "path": "docs/stylesheets/extra.css",
    "content": ".md-grid {\n  max-width: 1440px; \n}\n"
  },
  {
    "path": "docs/zh/finetune.md",
    "content": "# 微调\n\n显然, 当你打开这个页面的时候, 你已经对预训练模型 zero-shot 的效果不算满意. 你想要微调一个模型, 使得它在你的数据集上表现更好.  \n\n在目前版本，你只需要微调'LLAMA'部分即可.\n\n## LLAMA 微调\n### 1. 准备数据集\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 30.1-32.71.lab\n│   └── 30.1-32.71.mp3\n└── SPK2\n    ├── 38.79-40.85.lab\n    └── 38.79-40.85.mp3\n```\n\n你需要将数据集转为以上格式, 并放到 `data` 下, 音频后缀可以为 `.mp3`, `.wav` 或 `.flac`, 标注文件后缀建议为 `.lab`.\n\n!!! info\n    标注文件 `.lab` 仅需包含音频的转写文本，无需遵循特殊格式要求。例如，如果 `hi.mp3` 中的内容是“你好，再见。”，那么 `hi.lab` 文件中只需包含一行文本：“你好，再见”。    \n\n!!! warning\n    建议先对数据集进行响度匹配, 你可以使用 [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) 来完成这一步骤. \n    ```bash\n    fap loudness-norm data-raw data --clean\n    ```\n\n### 2. 批量提取语义 token\n\n确保你已经下载了 vqgan 权重, 如果没有, 请运行以下命令:\n\n```bash\nhuggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini\n```\n\n随后可运行以下命令来提取语义 token:\n\n```bash\npython tools/vqgan/extract_vq.py data \\\n    --num-workers 1 --batch-size 16 \\\n    --config-name \"modded_dac_vq\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\n!!! note\n    你可以调整 `--num-workers` 和 `--batch-size` 来提高提取速度, 但是请注意不要超过你的显存限制.  \n\n该命令会在 `data` 目录下创建 `.npy` 文件, 如下所示:\n\n```\n.\n├── SPK1\n│   ├── 21.15-26.44.lab\n│   ├── 21.15-26.44.mp3\n│   ├── 21.15-26.44.npy\n│   ├── 27.51-29.98.lab\n│   ├── 27.51-29.98.mp3\n│   ├── 27.51-29.98.npy\n│   ├── 30.1-32.71.lab\n│   ├── 30.1-32.71.mp3\n│   └── 30.1-32.71.npy\n└── SPK2\n    ├── 38.79-40.85.lab\n    ├── 38.79-40.85.mp3\n    └── 38.79-40.85.npy\n```\n\n### 3. 打包数据集为 protobuf\n\n```bash\npython tools/llama/build_dataset.py \\\n    --input \"data\" \\\n    --output \"data/protos\" \\\n    --text-extension .lab \\\n    --num-workers 16\n```\n\n命令执行完毕后, 你应该能在 `data` 目录下看到 `protos` 文件.\n\n\n### 4. 
最后, 使用 LoRA 进行微调\n\n同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令:\n\n```bash\nhuggingface-cli download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n最后, 你可以运行以下命令来启动微调:\n\n```bash\npython fish_speech/train.py --config-name text2semantic_finetune \\\n    project=$project \\\n    +lora@model.model.lora_config=r_8_alpha_16\n```\n\n!!! note\n    你可以通过修改 `fish_speech/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存.\n\n!!! note\n    对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题.\n\n训练结束后, 你可以参考 [推理](inference.md) 部分来测试你的模型.\n\n!!! info\n    默认配置下, 基本只会学到说话人的发音方式, 而不包含音色, 你依然需要使用 prompt 来保证音色的稳定性.  \n    如果你想要学到音色, 请将训练步数调大, 但这有可能会导致过拟合. \n\n训练完成后, 你需要先将 LoRA 的权重转为普通权重, 然后再进行推理.\n\n```bash\npython tools/llama/merge_lora.py \\\n\t--lora-config r_8_alpha_16 \\\n\t--base-weight checkpoints/s2-pro \\\n\t--lora-weight results/$project/checkpoints/step_000000010.ckpt \\\n\t--output checkpoints/s2-pro-yth-lora/\n```\n\n!!! note\n    你也可以尝试其他的 checkpoint, 我们建议你使用最早的满足你要求的 checkpoint, 它们通常在 OOD 上表现更好.\n"
  },
  {
    "path": "docs/zh/index.md",
    "content": "<div align=\"center\">\n<h1>Fish Speech</h1>\n\n<p><a href=\"../en/\">English</a> | <strong>简体中文</strong> | <a href=\"../pt/\">Portuguese</a> | <a href=\"../ja/\">日本語</a> | <a href=\"../ko/\">한국어</a> | <a href=\"../ar/\">العربية</a></p>\n\n<a href=\"https://www.producthunt.com/products/fish-speech?embed=true&utm_source=badge-top-post-badge&utm_medium=badge&utm_source=badge-fish&#0045;audio&#0045;s1\" target=\"_blank\"><img src=\"https://api.producthunt.com/widgets/embed-image/v1/top-post-badge.svg?post_id=1023740&theme=light&period=daily&t=1761164814710\" alt=\"Fish&#0032;Audio&#0032;S1 - Expressive&#0032;Voice&#0032;Cloning&#0032;and&#0032;Text&#0045;to&#0045;Speech | Product Hunt\" style=\"width: 250px; height: 54px;\" width=\"250\" height=\"54\" /></a>\n<a href=\"https://trendshift.io/repositories/7014\" target=\"_blank\">\n    <img src=\"https://trendshift.io/api/badge/repositories/7014\" alt=\"fishaudio%2Ffish-speech | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/>\n</a>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <img src=\"https://count.getloli.com/get/@fish-speech?theme=asoul\" /><br>\n</div>\n\n<br>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://discord.gg/Es5qTB9BcN\">\n        <img alt=\"Discord\" src=\"https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://hub.docker.com/r/fishaudio/fish-speech\">\n        <img alt=\"Docker\" src=\"https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://pd.qq.com/s/bwxia254o\">\n      <img alt=\"QQ Channel\" src=\"https://img.shields.io/badge/QQ-blue?logo=tencentqq\">\n    </a>\n</div>\n\n<div align=\"center\">\n    <a target=\"_blank\" href=\"https://huggingface.co/fishaudio/s2\">\n        <img alt=\"HuggingFace Model\" 
src=\"https://img.shields.io/badge/🤗%20-models-orange\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://fish.audio/blog/fish-audio-open-sources-s2/\">\n        <img alt=\"Fish Audio Blog\" src=\"https://img.shields.io/badge/Blog-Fish_Audio_S2-1f7a8c?style=flat-square&logo=readme&logoColor=white\"/>\n    </a>\n    <a target=\"_blank\" href=\"https://arxiv.org/abs/2603.08823\">\n        <img alt=\"Paper | Technical Report\" src=\"https://img.shields.io/badge/Paper-Technical_Report-b31b1b?style=flat-square\"/>\n    </a>\n</div>\n\n!!! info \"许可声明\"\n    此代码库及其相关的模型权重均在 **FISH AUDIO RESEARCH LICENSE** 下发布。更多详情请参考 [LICENSE](https://github.com/fishaudio/fish-speech/blob/main/LICENSE)。\n\n!!! warning \"法律免责声明\"\n    我们不对代码库的任何非法使用承担责任。请参考您当地关于 DMCA 和其他相关法律的法规。\n\n## 快速开始\n\n### 文档入口\n\n这里是 Fish Audio S2 的官方文档，请按照说明轻松入门。\n\n- [安装](https://speech.fish.audio/zh/install/)\n- [命令行推理](https://speech.fish.audio/zh/inference/)\n- [WebUI 推理](https://speech.fish.audio/zh/inference/)\n- [服务端推理](https://speech.fish.audio/zh/server/)\n- [Docker 部署](https://speech.fish.audio/zh/install/)\n\n> [!IMPORTANT]\n> **如需使用 SGLang Server，请参考 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md)。**\n\n### LLM Agent 指南\n\n```\n请先阅读 https://speech.fish.audio/zh/install/ ，并按文档安装和配置 Fish Audio S2。\n```\n\n## Fish Audio S2\n**在开源与闭源方案中都处于领先水平的文本转语音系统**\n\nFish Audio S2 是由 [Fish Audio](https://fish.audio/) 开发的最新模型。S2 在约 50 种语言、超过 1000 万小时音频数据上完成训练，并结合强化学习对齐与双自回归架构，能够生成自然、真实且情感丰富的语音。\n\nS2 支持通过自然语言标签（如 `[laugh]`、`[whispers]`、`[super happy]`）对韵律和情绪进行细粒度行内控制，同时原生支持多说话人和多轮生成。\n\n请访问 [Fish Audio 网站](https://fish.audio/) 体验在线演示，并阅读[博客文章](https://fish.audio/blog/fish-audio-open-sources-s2/)和[技术报告](https://arxiv.org/abs/2603.08823)了解更多细节。\n\n### 模型变体\n\n| 模型 | 大小 | 可用性 | 描述 |\n|------|------|-------------|-------------|\n| S2-Pro | 4B 参数 | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | 功能齐全的旗舰模型，具有最高质量和稳定性 
|\n\n有关模型的更多详情，请参见[技术报告](https://arxiv.org/abs/2411.01156)。\n\n## 基准测试结果\n\n| 基准 | Fish Audio S2 |\n|------|------|\n| Seed-TTS Eval — WER（中文） | **0.54%**（总体最佳） |\n| Seed-TTS Eval — WER（英文） | **0.99%**（总体最佳） |\n| Audio Turing Test（含指令） | **0.515** 后验均值 |\n| EmergentTTS-Eval — 胜率 | **81.88%**（总体最高） |\n| Fish Instruction Benchmark — TAR | **93.3%** |\n| Fish Instruction Benchmark — 质量 | **4.51 / 5.0** |\n| 多语言（MiniMax Testset）— 最佳 WER | **24** 种语言中的 **11** 种 |\n| 多语言（MiniMax Testset）— 最佳 SIM | **24** 种语言中的 **17** 种 |\n\n在 Seed-TTS Eval 上，S2 在所有已评估模型（包括闭源系统）中实现了最低 WER：Qwen3-TTS（0.77/1.24）、MiniMax Speech-02（0.99/1.90）、Seed-TTS（1.12/2.25）。在 Audio Turing Test 上，S2 的 0.515 相比 Seed-TTS（0.417）提升 24%，相比 MiniMax-Speech（0.387）提升 33%。在 EmergentTTS-Eval 中，S2 在副语言学（91.61% 胜率）、疑问句（84.41%）和句法复杂度（83.39%）等维度表现尤为突出。\n\n## 亮点\n\n<img src=\"../assets/totalability.png\" width=200%>\n\n### 通过自然语言进行细粒度行内控制\n\nFish Audio S2 支持在文本中的特定词或短语位置直接嵌入自然语言指令，从而对语音生成进行局部控制。与依赖固定预设标签不同，S2 接受自由形式的文本描述，例如 [whisper in small voice]、[professional broadcast tone] 或 [pitch up]，实现词级别的开放式表达控制。\n\n### 双自回归架构（Dual-Autoregressive）\n\nS2 基于仅解码器 Transformer，并结合 RVQ 音频编解码器（10 个码本，约 21 Hz 帧率）。Dual-AR 架构将生成拆分为两个阶段：\n\n- **Slow AR** 沿时间轴运行，预测主语义码本。\n- **Fast AR** 在每个时间步生成剩余 9 个残差码本，用于重建细粒度声学细节。\n\n这种非对称设计（时间轴 4B 参数、深度轴 400M 参数）在保持音频保真度的同时，提高了推理效率。\n\n### 强化学习对齐\n\nS2 使用 Group Relative Policy Optimization（GRPO）进行后训练对齐。用于过滤和标注训练数据的同一批模型被直接复用为 RL 的奖励模型，从而避免了预训练数据分布与后训练目标之间的不匹配。奖励信号综合了语义准确性、指令遵循、声学偏好评分与音色相似度。\n\n### 基于 SGLang 的生产级流式推理\n\n由于 Dual-AR 架构在结构上与标准自回归 LLM 同构，S2 可以直接继承 SGLang 提供的 LLM 原生服务优化能力，包括连续批处理、分页 KV Cache、CUDA Graph Replay 与基于 RadixAttention 的前缀缓存。\n\n在单张 NVIDIA H200 GPU 上：\n\n- **实时因子（RTF）：** 0.195\n- **首音频延迟：** 约 100 ms\n- **吞吐：** 在 RTF 低于 0.5 的情况下达到 3,000+ acoustic tokens/s\n\n### 多语言支持\n\nFish Audio S2 支持高质量的多语言文本转语音，无需音素或特定语言的预处理。包括：\n\n**英语、中文、日语、韩语、阿拉伯语、德语、法语...**\n\n**以及更多！**\n\n列表正在不断扩大，请查看 [Fish Audio](https://fish.audio/) 获取最新发布。\n\n### 原生多说话人生成\n\n<img src=\"../assets/chattemplate.png\" 
width=200%>\n\nFish Audio S2 允许用户上传包含多个说话人的参考音频，模型将通过 `<|speaker:i|>` 令牌处理每个说话人的特征。之后您可以通过说话人 ID 令牌控制模型的表现，从而实现一次生成中包含多个说话人。再也不需要像以前那样针对每个说话人都单独上传参考音频与生成语音了。\n\n### 多轮对话生成\n\n得益于模型上下文的扩展，我们的模型现在可以借助上文的信息提高后续生成内容的表现力，从而提升内容的自然度。\n\n### 快速语音克隆\n\nFish Audio S2 支持使用短参考样本（通常为 10-30 秒）进行准确的语音克隆。模型可以捕捉音色、说话风格和情感倾向，无需额外微调即可生成逼真且一致的克隆语音。\n如需使用 SGLang Server，请参考 [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md) 。\n\n---\n\n## 致谢\n\n- [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)\n- [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)\n- [GPT VITS](https://github.com/innnky/gpt-vits)\n- [MQTTS](https://github.com/b04901014/MQTTS)\n- [GPT Fast](https://github.com/pytorch-labs/gpt-fast)\n- [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)\n- [Qwen3](https://github.com/QwenLM/Qwen3)\n\n## 技术报告\n\n```bibtex\n@misc{fish-speech-v1.4,\n      title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis},\n      author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing},\n      year={2024},\n      eprint={2411.01156},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2411.01156},\n}\n\n@misc{liao2026fishaudios2technical,\n      title={Fish Audio S2 Technical Report}, \n      author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han},\n      year={2026},\n      eprint={2603.08823},\n      archivePrefix={arXiv},\n      primaryClass={cs.SD},\n      url={https://arxiv.org/abs/2603.08823}, \n}\n```\n"
  },
  {
    "path": "docs/zh/inference.md",
    "content": "# 推理\n\nFish Audio S2 模型需要较大的显存，我们推荐您使用至少24GB的GPU进行推理。\n\n## 下载权重\n\n首先您需要下载模型权重：\n\n```bash\nhf download fishaudio/s2-pro --local-dir checkpoints/s2-pro\n```\n\n## 命令行推理\n\n!!! note\n    如果您计划让模型随机选择音色，可以跳过此步骤。\n\n### 1. 从参考音频获取 VQ tokens\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"test.wav\" \\\n    --checkpoint-path \"checkpoints/s2-pro/codec.pth\"\n```\n\n您应该会得到一个 `fake.npy` 和一个 `fake.wav`。\n\n### 2. 从文本生成 Semantic tokens：\n\n```bash\npython fish_speech/models/text2semantic/inference.py \\\n    --text \"您想要转换的文本\" \\\n    --prompt-text \"您的参考文本\" \\\n    --prompt-tokens \"fake.npy\" \\\n    # --compile\n```\n\n此命令将在工作目录中创建一个 `codes_N.npy` 文件，其中 N 是从 0 开始的整数。\n\n!!! note\n    您可能希望使用 `--compile` 来融合 CUDA 内核以实现更快的推理，但是我们更推荐您使用我们sglang的推理加速优化。\n    相应地，如果您不计划使用加速，可以注释掉 `--compile` 参数。\n\n!!! info\n    对于不支持 bf16 的 GPU，您可能需要使用 `--half` 参数。\n\n### 3. 从语义令牌生成声音：\n\n```bash\npython fish_speech/models/dac/inference.py \\\n    -i \"codes_0.npy\"\n```\n\n之后你会得到一个fake.wav文件。\n\n## WebUI 推理\n\n### 1. Gradio WebUI\n\n为了保持兼容，我们保留了以往的Gradio WebUI。\n\n```bash\npython tools/run_webui.py # --compile 如果你需要加速的话\n```\n\n### 2. Awesome WebUI\n\nAwesome WebUI 是一个基于 TypeScript 开发的现代化 Web 界面，提供更丰富的功能和更好的交互体验。\n\n**构建 WebUI：**\n\n您需要先在本地或者服务器上安装 Node.js 和 npm。\n\n1. 进入 `awesome_webui` 目录：\n   ```bash\n   cd awesome_webui\n   ```\n2. 安装依赖：\n   ```bash\n   npm install\n   ```\n3. 构建 WebUI：\n   ```bash\n   npm run build\n   ```\n\n**启动后端服务器：**\n\nWebUI 构建完成后，返回项目根目录，启动 API 服务器：\n\n```bash\npython tools/api_server.py --listen 0.0.0.0:8888 --compile\n```\n\n**访问：**\n\n在服务器启动后，您可以通过浏览器访问以下地址体验：\n`http://localhost:8888/ui`\n"
  },
  {
    "path": "docs/zh/install.md",
    "content": "## 系统要求\n\n- GPU 显存：24GB（用于推理）\n- 系统：Linux、WSL\n\n## 系统设置\n\nFish Audio S2 支持多种安装方式。请选择最适合你当前开发环境的方案。\n\n**前置依赖**：先安装音频处理所需的系统依赖：\n```bash\napt install portaudio19-dev libsox-dev ffmpeg\n```\n\n### Conda\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# GPU 安装（选择 CUDA 版本：cu126、cu128、cu129）\npip install -e .[cu129]\n\n# 仅 CPU 安装\npip install -e .[cpu]\n\n# 默认安装（使用 PyTorch 默认索引）\npip install -e .\n\n# 如果因 pyaudio 导致安装报错，可以先执行：\n# conda install pyaudio\n# 然后重新执行 pip install -e .\n```\n\n### UV\n\nUV 可以更快地完成依赖解析与安装：\n\n```bash\n# GPU 安装（选择 CUDA 版本：cu126、cu128、cu129）\nuv sync --python 3.12 --extra cu129\n\n# 仅 CPU 安装\nuv sync --python 3.12 --extra cpu\n```\n\n### Intel Arc XPU 支持\n\n如果你使用 Intel Arc GPU，可按以下方式安装 XPU 支持：\n\n```bash\nconda create -n fish-speech python=3.12\nconda activate fish-speech\n\n# 安装必需的 C++ 标准库\nconda install libstdcxx -c conda-forge\n\n# 安装支持 Intel XPU 的 PyTorch\npip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu\n\n# 安装 Fish Speech\npip install -e .\n```\n\n!!! 
warning\n    `compile` 选项暂不支持 Windows 和 macOS。若你希望启用 compile，请手动安装 Triton。\n\n## Docker 设置\n\nFish Audio S2 系列模型提供多种 Docker 部署方式，适配不同场景。你可以直接使用 Docker Hub 预构建镜像，也可以用 Docker Compose 本地构建，或手动构建自定义镜像。\n\n我们提供 WebUI 与 API Server 的 GPU（默认 CUDA 12.6）和 CPU 镜像。你可以直接用 Docker Hub 镜像，也可以在本地构建。如果你只想使用预构建镜像，请参考[推理指南](inference.md)。\n\n### 前置条件\n\n- 已安装 Docker 和 Docker Compose\n- （GPU 场景）已安装 NVIDIA Docker runtime\n- CUDA 推理建议至少 24GB 显存\n\n### 使用 Docker Compose\n\n如果你需要开发或自定义，推荐使用 Docker Compose 在本地构建并运行：\n\n```bash\n# 先克隆仓库\ngit clone https://github.com/fishaudio/fish-speech.git\ncd fish-speech\n\n# 使用 CUDA 启动 WebUI\ndocker compose --profile webui up\n\n# 启用 compile 优化启动 WebUI\nCOMPILE=1 docker compose --profile webui up\n\n# 启动 API Server\ndocker compose --profile server up\n\n# 启用 compile 优化启动 API Server\nCOMPILE=1 docker compose --profile server up\n\n# 仅 CPU 部署\nBACKEND=cpu docker compose --profile webui up\n```\n\n#### Docker Compose 环境变量\n\n你可以通过环境变量定制部署参数：\n\n```bash\n# .env 文件示例\nBACKEND=cuda              # 或 cpu\nCOMPILE=1                 # 启用 compile 优化\nGRADIO_PORT=7860          # WebUI 端口\nAPI_PORT=8080             # API Server 端口\nUV_VERSION=0.8.15         # UV 包管理器版本\n```\n\n命令执行后会自动构建镜像并启动容器。你可以通过 `http://localhost:7860` 访问 WebUI，通过 `http://localhost:8080` 访问 API Server。\n\n### 手动 Docker 构建\n\n如果你需要更细粒度的构建控制，可以手动构建：\n\n```bash\n# 构建支持 CUDA 的 WebUI 镜像\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target webui \\\n    -t fish-speech-webui:cuda .\n\n# 构建支持 CUDA 的 API Server 镜像\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --build-arg CUDA_VER=12.6.0 \\\n    --build-arg UV_EXTRA=cu126 \\\n    --target server \\\n    -t fish-speech-server:cuda .\n\n# 构建仅 CPU 镜像（支持多平台）\ndocker build \\\n    --platform linux/amd64,linux/arm64 \\\n    -f docker/Dockerfile 
\\\n    --build-arg BACKEND=cpu \\\n    --target webui \\\n    -t fish-speech-webui:cpu .\n\n# 构建开发镜像\ndocker build \\\n    --platform linux/amd64 \\\n    -f docker/Dockerfile \\\n    --build-arg BACKEND=cuda \\\n    --target dev \\\n    -t fish-speech-dev:cuda .\n```\n\n#### 构建参数\n\n- `BACKEND`：`cuda` 或 `cpu`（默认：`cuda`）\n- `CUDA_VER`：CUDA 版本（默认：`12.6.0`）\n- `UV_EXTRA`：UV 的 CUDA 扩展（默认：`cu126`）\n- `UBUNTU_VER`：Ubuntu 版本（默认：`24.04`）\n- `PY_VER`：Python 版本（默认：`3.12`）\n\n### 卷挂载\n\n两种方法都需要挂载以下目录：\n\n- `./checkpoints:/app/checkpoints` - 模型权重目录\n- `./references:/app/references` - 参考音频目录\n\n### 环境变量\n\n- `COMPILE=1` - 启用 `torch.compile`，可提升推理速度（约 10 倍）\n- `GRADIO_SERVER_NAME=0.0.0.0` - WebUI 服务地址\n- `GRADIO_SERVER_PORT=7860` - WebUI 服务端口\n- `API_SERVER_NAME=0.0.0.0` - API 服务地址\n- `API_SERVER_PORT=8080` - API 服务端口\n\n!!! note\n    Docker 容器默认从 `/app/checkpoints` 读取模型权重。启动容器前请先下载好所需权重。\n\n!!! warning\n    GPU 支持需要 NVIDIA Docker runtime。若仅使用 CPU，请移除 `--gpus all` 并使用 CPU 镜像。\n"
  },
  {
    "path": "entrypoint.sh",
    "content": "#!/bin/bash\n\nCUDA_ENABLED=${CUDA_ENABLED:-true}\nDEVICE=\"\"\n\nif [ \"${CUDA_ENABLED}\" != \"true\" ]; then\n    DEVICE=\"--device cpu\"\nfi\n\nexec python tools/run_webui.py ${DEVICE}\n"
  },
  {
    "path": "fish_speech/callbacks/__init__.py",
    "content": "from .grad_norm import GradNormMonitor\n\n__all__ = [\"GradNormMonitor\"]\n"
  },
  {
    "path": "fish_speech/callbacks/grad_norm.py",
    "content": "from typing import Optional, Union\n\nimport lightning.pytorch as pl\nimport torch\nfrom lightning import LightningModule, Trainer\nfrom lightning.pytorch.callbacks import Callback\nfrom torch import Tensor, nn\nfrom torch.utils._foreach_utils import (\n    _group_tensors_by_device_and_dtype,\n    _has_foreach_support,\n)\n\n\n@torch.no_grad()\ndef grad_norm(\n    parameters: Union[Tensor, list[Tensor]],\n    norm_type: float = 2.0,\n) -> float:\n    \"\"\"\n    Returns the norm of the gradients of the given parameters.\n\n    Args:\n        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a\n            single Tensor that will have gradients normalized\n        norm_type (float): type of the used p-norm.\n\n    Returns:\n        Total norm of the parameter gradients (viewed as a single vector).\n    \"\"\"  # noqa: E501\n\n    if isinstance(parameters, Tensor):\n        parameters = [parameters]\n\n    grads = [p.grad for p in parameters if p.grad is not None]\n    if len(grads) == 0:\n        return None\n\n    first_device = grads[0].device\n    grouped_grads: dict[\n        tuple[torch.device, torch.dtype], list[list[Tensor]]\n    ] = _group_tensors_by_device_and_dtype(\n        [[g.detach() for g in grads]]\n    )  # type: ignore[assignment]\n\n    norms = []\n    for (device, _), ([grads], _) in grouped_grads.items():\n        if _has_foreach_support(grads, device=device):\n            norms.extend(torch._foreach_norm(grads, norm_type))\n        else:\n            norms.extend([torch.norm(g, norm_type) for g in grads])\n\n    return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)\n\n\nclass GradNormMonitor(Callback):\n    \"\"\"\n    Callback that computes the gradient norm of the model parameters.\n    \"\"\"\n\n    def __init__(\n        self,\n        norm_type: float = 2.0,\n        logging_interval: str = \"step\",\n        sub_module: Optional[Union[str, list[str]]] = None,\n    ) -> 
None:\n        \"\"\"\n        Args:\n            norm_type (float): type of the used p-norm.\n            logging_interval (str): \"step\" or \"epoch\".\n        \"\"\"\n        super().__init__()\n\n        self.norm_type = norm_type\n        self.logging_interval = logging_interval\n        self.sub_module = sub_module\n\n    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:\n        \"\"\"\n        Computes the gradient norm of the model parameters and logs it to the logger.\n\n        Args:\n            trainer (Trainer): The trainer object\n            model (LightningModule): The current lightningModule\n        \"\"\"\n\n        lightning_model = model\n\n        if self.sub_module is None:\n            return self.log_sub_module_grad_norm(lightning_model, model, \"\")\n\n        sub_modules = self.sub_module\n        if isinstance(sub_modules, str):\n            sub_modules = [sub_modules]\n\n        for sub_module in sub_modules:\n            self.log_sub_module_grad_norm(\n                lightning_model, getattr(model, sub_module), f\"/{sub_module}\"\n            )\n\n    def log_sub_module_grad_norm(\n        self, lightning_model: LightningModule, model: nn.Module, path: str\n    ) -> None:\n        grad_norm_val = grad_norm(model.parameters(), self.norm_type)\n        if grad_norm_val is None:\n            return\n\n        on_step = self.logging_interval == \"step\"\n        lightning_model.log(\n            f\"train{path}/grad_norm\",\n            grad_norm_val,\n            on_step=on_step,\n            on_epoch=not on_step,\n        )\n"
  },
  {
    "path": "fish_speech/configs/base.yaml",
    "content": "# Base configuration for training a model\npaths:\n  run_dir: results/${project}\n  ckpt_dir: ${paths.run_dir}/checkpoints\n\nhydra:\n  run:\n    dir: ${paths.run_dir}\n\n# Lightning Trainer\ntrainer:\n  _target_: lightning.pytorch.trainer.Trainer\n\n  default_root_dir: ${paths.run_dir}\n  accelerator: gpu\n  num_nodes: 1\n  devices: auto\n  strategy:\n    _target_: lightning.pytorch.strategies.DDPStrategy\n    process_group_backend: nccl  # This should be override when training on windows\n\n  precision: bf16-mixed\n\n  # disable validation by epoch end\n  check_val_every_n_epoch: null\n  val_check_interval: 5000\n  max_steps: 100_000\n\n  # Use torch.backends.cudnn.benchmark to speed up training\n  benchmark: true\n\n# Callbacks\ncallbacks:\n  model_checkpoint:\n    _target_: lightning.pytorch.callbacks.ModelCheckpoint\n    dirpath: ${paths.ckpt_dir}\n    filename: \"step_{step:09d}\"\n    save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt\n    save_top_k: 5 # save 5 latest checkpoints\n    monitor: step # use step to monitor checkpoints\n    mode: max # save the latest checkpoint with the highest global_step\n    every_n_epochs: null # don't save checkpoints by epoch end\n    every_n_train_steps: 5000 # save checkpoints every 5000 steps\n    auto_insert_metric_name: false\n\n  model_summary:\n    _target_: lightning.pytorch.callbacks.ModelSummary\n    max_depth: 2 # the maximum depth of layer nesting that the summary will include\n\n  learning_rate_monitor:\n    _target_: lightning.pytorch.callbacks.LearningRateMonitor\n    logging_interval: step\n    log_momentum: false\n\n  grad_norm_monitor:\n    _target_: fish_speech.callbacks.GradNormMonitor\n    norm_type: 2\n    logging_interval: step\n\n# Logger\nlogger:\n  tensorboard:\n    _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger\n    save_dir: \"${paths.run_dir}/tensorboard/\"\n    name: null\n    log_graph: false\n    
default_hp_metric: true\n    prefix: \"\"\n\n  # wandb:\n  #   _target_: lightning.pytorch.loggers.wandb.WandbLogger\n  #   # name: \"\" # name of the run (normally generated by wandb)\n  #   save_dir: \"${paths.run_dir}\"\n  #   offline: False\n  #   id: null # pass correct id to resume experiment!\n  #   anonymous: null # enable anonymous logging\n  #   project: \"fish-speech\"\n  #   log_model: False # upload lightning ckpts\n  #   prefix: \"\" # a string to put at the beginning of metric keys\n  #   # entity: \"\" # set to name of your wandb team\n  #   group: \"\"\n  #   tags: [\"vq\", \"hq\", \"finetune\"]\n  #   job_type: \"\"\n    \n# Loop\ntrain: true\ntest: false\n"
  },
  {
    "path": "fish_speech/configs/lora/r_8_alpha_16.yaml",
    "content": "_target_: fish_speech.models.text2semantic.lora.LoraConfig\nr: 8\nlora_alpha: 16\nlora_dropout: 0.01\n"
  },
  {
    "path": "fish_speech/configs/modded_dac_vq.yaml",
    "content": "_target_: fish_speech.models.dac.modded_dac.DAC\n# Model setup\nsample_rate: 44100\nencoder_dim: 64\nencoder_rates: [2, 4, 8, 8]\ndecoder_dim: 1536\ndecoder_rates: [8, 8, 4, 2]\nencoder_transformer_layers: [0, 0, 0, 4]\ndecoder_transformer_layers: [4, 0, 0, 0]\ntransformer_general_config:\n  _target_: fish_speech.models.dac.modded_dac.ModelArgs\n  _partial_: true\n  block_size: 8192\n  n_local_heads: -1\n  head_dim: 64\n  rope_base: 10000\n  norm_eps: 1e-5\n  dropout_rate: 0.1\n  attn_dropout_rate: 0.1\n  channels_first: true\n# Quantization\nquantizer:\n  _target_: fish_speech.models.dac.rvq.DownsampleResidualVectorQuantize\n  input_dim: 1024\n  n_codebooks: 9\n  codebook_size: 1024\n  codebook_dim: 8\n  quantizer_dropout: 0.5\n  downsample_factor: [2, 2]\n  post_module: &transformer_module\n    _target_: fish_speech.models.dac.modded_dac.WindowLimitedTransformer\n    causal: true\n    window_size: 128  # empirically this does not seem to matter\n    input_dim: 1024\n    config: &transformer_config\n      _target_: fish_speech.models.dac.modded_dac.ModelArgs\n      block_size: 2048\n      n_layer: 8\n      n_head: 16\n      dim: 1024\n      intermediate_size: 3072\n      n_local_heads: -1\n      head_dim: 64\n      rope_base: 10000\n      norm_eps: 1e-5\n      dropout_rate: 0.1\n      attn_dropout_rate: 0.1\n      channels_first: true\n  pre_module: *transformer_module\n  semantic_codebook_size: 4096\n"
  },
  {
    "path": "fish_speech/configs/text2semantic_finetune.yaml",
    "content": "defaults:\n  - base\n  - _self_\n\nproject: text2semantic_finetune_dual_ar\nmax_length: 4096\npretrained_ckpt_path: checkpoints/openaudio-s1-mini\n\n# Lightning Trainer\ntrainer:\n  accumulate_grad_batches: 1\n  gradient_clip_val: 1.0\n  gradient_clip_algorithm: \"norm\"\n  max_steps: 10000\n  precision: bf16-true\n  limit_val_batches: 10\n  val_check_interval: 100\n  # strategy:\n  #   find_unused_parameters: true\n  #   static_graph: true \n\n# Dataset Configuration\ntokenizer:\n  _target_: fish_speech.tokenizer.FishTokenizer\n  model_path: ${pretrained_ckpt_path}/tokenizer.tiktoken\n\n# Dataset Configuration\ntrain_dataset:\n  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset\n  proto_files:\n    - data/protos\n  tokenizer: ${tokenizer}\n  causal: true\n  max_length: ${max_length}\n  use_speaker: false\n  interactive_prob: 0.7\n\nval_dataset:\n  _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset\n  proto_files:\n    - data/protos\n  tokenizer: ${tokenizer}\n  causal: true\n  max_length: ${max_length}\n  use_speaker: false\n  interactive_prob: 0.7\n\ndata:\n  _target_: fish_speech.datasets.semantic.SemanticDataModule\n  train_dataset: ${train_dataset}\n  val_dataset: ${val_dataset}\n  num_workers: 4\n  batch_size: 4\n  tokenizer: ${tokenizer}\n  max_length: ${max_length}\n\n# Model Configuration\nmodel:\n  _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic\n  model: \n    _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained\n    path: ${pretrained_ckpt_path}\n    load_weights: true\n    max_length: ${max_length}\n    lora_config: null\n\n  optimizer:\n    _target_: torch.optim.AdamW\n    _partial_: true\n    lr: 1e-4\n    weight_decay: 0\n    betas: [0.9, 0.95]\n    eps: 1e-5\n\n  lr_scheduler:\n    _target_: torch.optim.lr_scheduler.LambdaLR\n    _partial_: true\n    lr_lambda:\n      _target_: 
fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda\n      _partial_: true\n      num_warmup_steps: 10\n\n# Callbacks\ncallbacks:\n  model_checkpoint:\n    every_n_train_steps: ${trainer.val_check_interval}\n"
  },
  {
    "path": "fish_speech/content_sequence.py",
    "content": "from dataclasses import dataclass, field\nfrom typing import List, Literal, Union\n\nimport numpy as np\nimport torch\n\nfrom fish_speech.tokenizer import (\n    IM_END_TOKEN,\n    MODALITY_TOKENS,\n    FishTokenizer,\n)\n\n\ndef restore_ndarray(obj, to_tensor: bool = False):\n    if isinstance(obj, dict) and \"__ndarray__\" in obj:\n        obj = np.frombuffer(obj[\"data\"], dtype=obj[\"dtype\"]).reshape(obj[\"shape\"])\n\n    if to_tensor and isinstance(obj, np.ndarray):\n        obj = torch.from_numpy(obj.copy())\n\n    return obj\n\n\n@dataclass\nclass BasePart:\n    type: Literal[\"text\", \"vq\", \"audio\"] | None = None\n    cal_loss: bool = False\n\n\n@dataclass(kw_only=True)\nclass VQPart(BasePart):\n    type = \"vq\"\n    codes: torch.Tensor\n\n    def __post_init__(self: \"VQPart\"):\n        self.type = \"vq\"\n        self.codes = restore_ndarray(self.codes, to_tensor=True)\n\n\n@dataclass(kw_only=True)\nclass TextPart(BasePart):\n    type = \"text\"\n    text: str | None = None\n    tokens: list[int] | None = None\n\n    def __post_init__(self: \"TextPart\"):\n        self.type = \"text\"\n        if self.text is None and self.tokens is None:\n            raise ValueError(\"Either text or tokens must be provided\")\n\n\n@dataclass(kw_only=True)\nclass AudioPart(BasePart):\n    type = \"audio\"\n    features: torch.Tensor\n\n    def __post_init__(self: \"AudioPart\"):\n        self.type = \"audio\"\n        self.features = restore_ndarray(self.features, to_tensor=True)\n\n\n@dataclass(kw_only=True)\nclass EncodedMessage:\n    tokens: torch.Tensor\n    labels: torch.Tensor\n    vq_mask_tokens: torch.Tensor | None = None\n    vq_mask_labels: torch.Tensor | None = None\n    vq_parts: list[torch.Tensor]\n    vq_require_losses: torch.Tensor | None = None\n    audio_parts: list[torch.Tensor]\n    audio_masks: torch.Tensor | None = None\n    metadata: dict | None = None\n\n\n@dataclass\nclass ContentSequence:\n    \"\"\"\n    Flexible sequence 
of content parts that supports interleaved multimodal format.\n    Example format: <|interleave|><|speaker:1|> TEXT AUDIO <|im_end|><|speaker:2|> TEXT AUDIO <|im_end|>\n    \"\"\"\n\n    parts: list[BasePart] = field(default_factory=list)\n    modality: Literal[\"text\", \"voice\", \"interleave\"] | None = None\n    metadata: dict | None = None\n\n    def __init__(\n        self: \"ContentSequence\",\n        parts: list[BasePart | dict] | None = None,\n        modality: Literal[\"text\", \"voice\", \"interleave\"] | None = None,\n        metadata: dict | None = None,\n    ):\n        self.modality = modality\n        self.metadata = metadata or {}\n\n        fixed_parts = []\n        for part in parts or []:\n            if isinstance(part, dict):\n                if part[\"type\"] == \"vq\":\n                    part = VQPart(**part)\n                elif part[\"type\"] == \"audio\":\n                    part = AudioPart(**part)\n                elif part[\"type\"] == \"text\":\n                    part = TextPart(**part)\n                else:\n                    raise ValueError(f\"Unsupported part type: {part['type']}\")\n            fixed_parts.append(part)\n\n        self.parts = fixed_parts\n\n        # If modality is specified, add it at the beginning if it's not already there\n        if self.modality and not (\n            len(self.parts) > 0\n            and isinstance(self.parts[0], dict) is False\n            and isinstance(self.parts[0], TextPart)\n            and self.parts[0].text is not None\n            and self.parts[0].text.startswith(MODALITY_TOKENS[self.modality])\n        ):\n            modality_token = MODALITY_TOKENS[self.modality]\n            self.parts.insert(0, TextPart(text=modality_token))\n\n    def append(\n        self: \"ContentSequence\",\n        part_or_parts: Union[BasePart, List[BasePart]],\n        add_end: bool = False,\n        speaker: Union[str, int] | None = None,\n    ):\n        \"\"\"\n        Append a part or 
list of parts to the sequence.\n\n        Args:\n            part_or_parts: A single part or list of parts to add\n            add_end: Whether to add the IM_END_TOKEN after these parts\n            speaker: Optional speaker identifier (name or ID) to add before the parts\n        \"\"\"\n        # Convert single part to list\n        parts_to_add = (\n            [part_or_parts] if not isinstance(part_or_parts, list) else part_or_parts\n        )\n\n        # Add speaker token if specified\n        if speaker is not None:\n            speaker_token = f\"<|speaker:{speaker}|>\"\n            self.parts.append(TextPart(text=speaker_token))\n\n        # Add all the parts\n        self.parts.extend(parts_to_add)\n\n        # Add end token if requested\n        if add_end:\n            self.parts.append(\n                TextPart(text=IM_END_TOKEN, cal_loss=self.parts[-1].cal_loss)\n            )\n\n    def encode(\n        self: \"ContentSequence\",\n        tokenizer: FishTokenizer,\n        add_shift: bool = True,\n        ignore_loss_tokens: list[str] = [],\n    ) -> EncodedMessage:\n        \"\"\"\n        Encode the sequence parts into tokens for the model.\n\n        Args:\n            tokenizer: The tokenizer to use\n            add_shift: Whether to shift tokens for next-token prediction\n            ignore_loss_tokens: List of token strings to ignore when calculating loss\n\n        Returns:\n            EncodedMessage with tensors ready for the model\n        \"\"\"\n        all_tokens = []\n        all_labels = []\n\n        # Multi-modal elements\n        vq_parts = []\n        vq_masks = []\n        vq_require_losses = []\n\n        audio_parts = []\n        audio_masks = []\n\n        # Optimization: Batch conversion for ignore tokens\n        ignore_loss_token_ids = []\n        if ignore_loss_tokens:\n            # Use the wrapper method which uses convert_tokens_to_ids\n            ignore_loss_token_ids = [\n                tokenizer.get_token_id(i) for 
i in ignore_loss_tokens\n            ]\n\n        for part in self.parts:\n            if isinstance(part, TextPart):\n                if part.tokens is None:\n                    assert part.text is not None\n                    # Optimization: Explicitly disable special tokens (BOS/EOS)\n                    # because we are constructing the sequence manually\n                    tokens = tokenizer.encode(part.text, add_special_tokens=False)\n                else:\n                    tokens = part.tokens\n\n                tokens = torch.tensor(tokens, dtype=torch.long)\n            elif isinstance(part, VQPart):\n                # Critical Optimization: Vectorized mapping\n                # Instead of loop lookup: [tokenizer.semantic_id_to_token_id[i] for i in codes]\n                # We use arithmetic offset: code + semantic_begin_id\n                # This assumes semantic tokens are contiguous in the vocab (DualAR requirement)\n                curr_codes = part.codes.clone().to(torch.int)\n\n                # Use int64 (long) for token IDs to avoid overflow or type mismatch in embedding\n                tokens = (curr_codes[0] + tokenizer.semantic_begin_id).to(torch.long)\n\n                vq_parts.append(curr_codes)\n                vq_require_losses.append(part.cal_loss)\n            else:\n                raise ValueError(f\"Unsupported part type: {type(part)}\")\n\n            all_tokens.append(tokens)\n\n            # Set masks for different part types\n            if isinstance(part, VQPart):\n                vq_masks.append(torch.ones_like(tokens, dtype=torch.bool))\n                audio_masks.append(torch.zeros_like(tokens, dtype=torch.bool))\n            elif isinstance(part, AudioPart):\n                vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))\n                audio_mask = torch.ones_like(tokens, dtype=torch.bool)\n                audio_mask[0] = False  # Skip start token\n                audio_mask[-1] = False  # Skip end 
token\n                audio_masks.append(audio_mask)\n            else:\n                vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))\n                audio_masks.append(torch.zeros_like(tokens, dtype=torch.bool))\n\n            # Set labels based on whether we want to calculate loss for this part\n            if part.cal_loss and not isinstance(part, AudioPart):\n                all_labels.append(tokens.clone())\n            else:\n                all_labels.append(torch.full_like(tokens, -100))\n\n        # Concatenate all tensors\n        if not all_tokens:\n            # Handle empty case safely\n            tokens = torch.empty(0, dtype=torch.long)\n            labels = torch.empty(0, dtype=torch.long)\n            vq_masks = torch.empty(0, dtype=torch.bool)\n            audio_masks = torch.empty(0, dtype=torch.bool)\n        else:\n            tokens = torch.cat(all_tokens, dim=0)\n            labels = torch.cat(all_labels, dim=0)\n            vq_masks = torch.cat(vq_masks, dim=0)\n            audio_masks = torch.cat(audio_masks, dim=0)\n\n        vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)\n\n        # Apply shift if needed for next-token prediction\n        vq_mask_tokens = vq_masks\n        vq_mask_labels = vq_masks\n\n        if add_shift and len(tokens) > 0:\n            tokens = tokens[:-1]\n            labels = labels[1:]\n            vq_masks = vq_masks[:-1]\n            vq_mask_tokens = vq_mask_tokens[:-1]\n            vq_mask_labels = vq_mask_labels[1:]\n            audio_masks = audio_masks[:-1]\n\n        # Ignore specified tokens\n        for i in ignore_loss_token_ids:\n            if i is not None:\n                labels[labels == i] = -100\n\n        return EncodedMessage(\n            tokens=tokens,\n            labels=labels,\n            vq_parts=vq_parts,\n            vq_mask_tokens=vq_mask_tokens,\n            vq_mask_labels=vq_mask_labels,\n            vq_require_losses=vq_require_losses,\n      
      audio_parts=audio_parts,\n            audio_masks=audio_masks,\n            metadata=self.metadata,\n        )\n\n    def encode_for_inference(\n        self: \"ContentSequence\",\n        tokenizer: FishTokenizer,\n        num_codebooks: int,\n    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:\n        encoded = self.encode(tokenizer, add_shift=False)\n        tokens = encoded.tokens\n        # Use int32 for prompt cache to save memory, convert to model dtype later if needed\n        # Or keep as input_ids (long)\n        values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.long)\n        values[0] = tokens\n\n        if (encoded.vq_parts is None or len(encoded.vq_parts) == 0) and (\n            encoded.audio_parts is None or len(encoded.audio_parts) == 0\n        ):\n            return values, None, None\n\n        audio_parts = None\n        audio_masks = None\n\n        if encoded.vq_parts is not None and len(encoded.vq_parts) > 0:\n            vq_parts = encoded.vq_parts\n            # List[Tensor(1, T)] -> Tensor(1, Total_T) -> Tensor(1, Total_T)\n            # Ensure we are handling the list concatenation correctly\n            if len(vq_parts) > 1:\n                # We need to be careful here: vq_parts is a list of tensors from different VQPart segments\n                # They correspond to encoded.vq_mask_tokens\n                # Since we just want to fill the 'values' tensor at the right positions:\n                all_vq_codes = torch.cat(\n                    vq_parts, dim=1\n                )  # Shape: (C, Total_Semantic_Tokens)\n            else:\n                all_vq_codes = vq_parts[0]\n\n            # Values[0] is already the Main Token ID (Semantic Begin + Code)\n            # Values[1:] should be the codes themselves\n            values[1:, encoded.vq_mask_tokens] = all_vq_codes.to(dtype=torch.long)\n\n        if encoded.audio_parts is not None and len(encoded.audio_parts) > 0:\n            audio_parts = 
torch.cat(encoded.audio_parts, dim=0)\n            audio_masks = encoded.audio_masks[None, :]\n\n        return values, audio_masks, audio_parts\n\n    def visualize(\n        self: \"ContentSequence\",\n        tokenizer: FishTokenizer,\n        ignore_loss_tokens: list[str] = [],\n        merge_semantic_tokens: bool = False,\n    ):\n        \"\"\"\n        Visualize the encoded sequence with color-coded tokens.\n        Blue/cyan tokens contribute to loss, green tokens do not.\n        \"\"\"\n        encoded = self.encode(\n            tokenizer, add_shift=False, ignore_loss_tokens=ignore_loss_tokens\n        )\n\n        # Colors for alternating tokens\n        colors = {\n            \"blue\": \"\\033[94m\",  # Light blue\n            \"cyan\": \"\\033[96m\",  # Cyan\n            \"green\": \"\\033[92m\",  # Light green\n            \"dark_green\": \"\\033[32m\",  # Dark green\n        }\n        blue_idx = 0\n        green_idx = 0\n\n        def print_in_blue(x):\n            nonlocal blue_idx\n            color = colors[\"blue\"] if blue_idx % 2 == 0 else colors[\"cyan\"]\n            print(f\"{color}{x}\\033[0m\", end=\"\")\n            blue_idx += 1\n\n        def print_in_green(x):\n            nonlocal green_idx\n            color = colors[\"green\"] if green_idx % 2 == 0 else colors[\"dark_green\"]\n            print(f\"{color}{x}\\033[0m\", end=\"\")\n            green_idx += 1\n\n        def print_semantic_token(x, count):\n            val = f\"[<|semantic|>x{count}]\"\n            if x == -100:\n                print_in_green(val)\n            else:\n                print_in_blue(val)\n\n        count_semantic_tokens = 0\n        semantic_label = None\n\n        for tok, lab in zip(encoded.tokens, encoded.labels):\n            token_id = int(tok.item())\n\n            if merge_semantic_tokens:\n                if (\n                    tokenizer.semantic_begin_id <= token_id <= tokenizer.semantic_end_id\n                    and (semantic_label is 
None or semantic_label == lab)\n                ):\n                    count_semantic_tokens += 1\n                    semantic_label = lab\n                    continue\n                elif count_semantic_tokens > 0:\n                    print_semantic_token(semantic_label, count_semantic_tokens)\n                    count_semantic_tokens = 0\n                    semantic_label = None\n\n            # Use HF decode\n            val = tokenizer.decode([token_id])\n\n            # Simple fallback for visualization if decode returns empty or weird stuff for special tokens\n            if not val:\n                val = f\"<{token_id}>\"\n\n            if lab == -100:\n                print_in_green(val)\n            else:\n                print_in_blue(val)\n\n        if merge_semantic_tokens and count_semantic_tokens > 0:\n            print_semantic_token(semantic_label, count_semantic_tokens)\n\n        print()\n"
  },
  {
    "path": "fish_speech/conversation.py",
    "content": "from copy import deepcopy\nfrom dataclasses import dataclass, field\nfrom typing import Literal\n\nimport torch\nfrom transformers import PreTrainedTokenizerFast\n\nfrom fish_speech.content_sequence import (\n    AudioPart,\n    BasePart,\n    ContentSequence,\n    EncodedMessage,\n    TextPart,\n    VQPart,\n)\nfrom fish_speech.tokenizer import IM_END_TOKEN, IM_START_TOKEN, MODALITY_TOKENS\n\n\n@dataclass(kw_only=True)\nclass Message:\n    role: Literal[\"system\", \"user\", \"assistant\"]\n    parts: list[BasePart] = field(default_factory=list)\n    add_im_start: bool = True\n    add_im_end: bool = True\n    cal_loss: bool = False\n    modality: Literal[\"text\", \"voice\", \"interleave\"] | None = None\n\n    # By default, ignore the loss of the auto-generated im_start token\n    ignore_im_start_loss: bool = True\n\n\n@dataclass\nclass Conversation:\n    messages: list[Message]\n\n    def __init__(self: \"Conversation\", messages: list[Message] | None = None):\n        self.messages = messages or []\n\n    def _build_content_sequence(\n        self: \"Conversation\",\n        metadata: dict | None = None,\n    ) -> ContentSequence:\n        \"\"\"\n        Build a ContentSequence from all messages.\n        Handles cal_loss inheritance from message to part level.\n        \"\"\"\n        all_parts = []\n        for message in self.messages:\n            # Add im_start\n            if message.add_im_start:\n                modality_token = (\n                    MODALITY_TOKENS[message.modality] if message.modality else \"\"\n                )\n                all_parts.append(\n                    TextPart(\n                        text=f\"{IM_START_TOKEN}{message.role}\\n{modality_token}\",\n                        cal_loss=not message.ignore_im_start_loss,\n                    )\n                )\n\n            # Add message parts\n            for part in message.parts:\n                # Inherit cal_loss from message if not set at part 
level\n                if not hasattr(part, \"cal_loss\") or part.cal_loss is False:\n                    new_part = deepcopy(part)\n                    new_part.cal_loss = message.cal_loss\n                    all_parts.append(new_part)\n                else:\n                    all_parts.append(part)\n\n            # Add im_end\n            if message.add_im_end:\n                all_parts.append(\n                    TextPart(text=IM_END_TOKEN + \"\\n\", cal_loss=message.cal_loss)\n                )\n\n        return ContentSequence(parts=all_parts, modality=None, metadata=metadata)\n\n    def encode(\n        self: \"Conversation\",\n        tokenizer: any,\n        add_shift: bool = True,\n        ignore_loss_tokens: list[str] = [],\n        metadata: dict | None = None,\n        max_length: int | None = None,\n    ) -> EncodedMessage:\n        # Build ContentSequence from messages\n        content_seq = self._build_content_sequence(metadata=metadata)\n        return content_seq.encode(\n            tokenizer,\n            add_shift=add_shift,\n            ignore_loss_tokens=ignore_loss_tokens,\n            max_length=max_length,\n        )\n\n    def encode_for_inference(\n        self: \"Conversation\",\n        tokenizer: any,\n        num_codebooks: int,\n        metadata: dict | None = None,\n    ):\n        content_seq = self._build_content_sequence(metadata=metadata)\n        return content_seq.encode_for_inference(tokenizer, num_codebooks=num_codebooks)\n\n    def visualize(\n        self: \"Conversation\",\n        tokenizer: PreTrainedTokenizerFast,\n        ignore_loss_tokens: list[str] = [],\n        merge_semantic_tokens: bool = False,\n        merge_audio_tokens: bool = False,\n        use_color: bool = True,\n    ):\n        \"\"\"\n        Visualize the encoded sequence with color-coded tokens.\n        Blue/cyan tokens contribute to loss, green tokens do not.\n        \"\"\"\n        # Build ContentSequence from messages and use its visualize 
method\n        content_seq = self._build_content_sequence()\n        content_seq.visualize(\n            tokenizer,\n            ignore_loss_tokens=ignore_loss_tokens,\n            merge_semantic_tokens=merge_semantic_tokens,\n        )\n\n    def append(self: \"Conversation\", message: Message):\n        self.messages.append(message)\n\n    def to_content_sequence(\n        self: \"Conversation\",\n        metadata: dict | None = None,\n    ) -> ContentSequence:\n        \"\"\"\n        Convert the Conversation to a ContentSequence.\n\n        This method builds a ContentSequence from all messages,\n        handling cal_loss inheritance from message to part level.\n\n        Args:\n            metadata: Optional metadata to include in the ContentSequence\n\n        Returns:\n            ContentSequence with all messages converted to parts\n        \"\"\"\n        return self._build_content_sequence(metadata=metadata)\n\n\nif __name__ == \"__main__\":\n    # Test the new implementation with the same API\n    message0 = Message(\n        role=\"user\",\n        parts=[\n            TextPart(text=\"Hello, how are you?\"),\n            VQPart(codes=torch.zeros((4, 10))),\n        ],\n        cal_loss=False,\n    )\n\n    message1 = Message(\n        role=\"assistant\",\n        parts=[TextPart(text=\"I'm fine, thank you.\")],\n        cal_loss=True,\n    )\n    conversation = Conversation([message0, message1])\n    tokenizer = PreTrainedTokenizerFast.from_pretrained(\"checkpoints/agent-0.6b-debug\")\n\n    # Test with enhanced visualization from ContentSequence\n    print(\"Basic visualization:\")\n    conversation.visualize(tokenizer)\n\n    print(\"\\nWith merged semantic tokens:\")\n    conversation.visualize(tokenizer, merge_semantic_tokens=True)\n\n    print(\"\\nWithout colors:\")\n    conversation.visualize(tokenizer, use_color=False)\n"
  },
  {
    "path": "fish_speech/datasets/concat_repeat.py",
    "content": "import bisect\nimport random\nfrom typing import Iterable\n\nfrom torch.utils.data import Dataset, IterableDataset\n\n\nclass ConcatRepeatDataset(Dataset):\n    datasets: list[Dataset]\n    cumulative_sizes: list[int]\n    repeats: list[int]\n\n    @staticmethod\n    def cumsum(sequence, repeats):\n        r, s = [], 0\n        for dataset, repeat in zip(sequence, repeats):\n            l = len(dataset) * repeat\n            r.append(l + s)\n            s += l\n        return r\n\n    def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):\n        super().__init__()\n\n        self.datasets = list(datasets)\n        self.repeats = repeats\n\n        assert len(self.datasets) > 0, \"datasets should not be an empty iterable\"\n        assert len(self.datasets) == len(\n            repeats\n        ), \"datasets and repeats should have the same length\"\n\n        for d in self.datasets:\n            assert not isinstance(\n                d, IterableDataset\n            ), \"ConcatRepeatDataset does not support IterableDataset\"\n\n        self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)\n\n    def __len__(self):\n        return self.cumulative_sizes[-1]\n\n    def __getitem__(self, idx):\n        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)\n\n        if dataset_idx == 0:\n            sample_idx = idx\n        else:\n            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]\n\n        dataset = self.datasets[dataset_idx]\n\n        return dataset[sample_idx % len(dataset)]\n"
  },
  {
    "path": "fish_speech/datasets/protos/text-data.proto",
    "content": "syntax = \"proto3\";\n\npackage text_data;\n\nmessage Semantics {\n    repeated uint32 values = 1;\n}\n\nmessage Sentence {\n    repeated string texts = 1;\n    repeated Semantics semantics = 3;\n}\n\nmessage TextData {\n    string source = 1;\n    string name = 2;\n    repeated Sentence sentences = 4;\n}\n\nmessage SampledData {\n    string source = 1;\n    string name = 2;\n    repeated Sentence samples = 3;\n}\n"
  },
  {
    "path": "fish_speech/datasets/protos/text_data_pb2.py",
    "content": "# -*- coding: utf-8 -*-\n# Generated by the protocol buffer compiler.  DO NOT EDIT!\n# source: text-data.proto\n# Protobuf Python Version: 4.25.1\n\"\"\"Generated protocol buffer code.\"\"\"\n\nfrom google.protobuf import descriptor as _descriptor\nfrom google.protobuf import descriptor_pool as _descriptor_pool\nfrom google.protobuf import symbol_database as _symbol_database\nfrom google.protobuf.internal import builder as _builder\n\n# @@protoc_insertion_point(imports)\n\n_sym_db = _symbol_database.Default()\n\n\nDESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(\n    b'\\n\\x0ftext-data.proto\\x12\\ttext_data\"\\x1b\\n\\tSemantics\\x12\\x0e\\n\\x06values\\x18\\x01 \\x03(\\r\"B\\n\\x08Sentence\\x12\\r\\n\\x05texts\\x18\\x01 \\x03(\\t\\x12\\'\\n\\tsemantics\\x18\\x03 \\x03(\\x0b\\x32\\x14.text_data.Semantics\"P\\n\\x08TextData\\x12\\x0e\\n\\x06source\\x18\\x01 \\x01(\\t\\x12\\x0c\\n\\x04name\\x18\\x02 \\x01(\\t\\x12&\\n\\tsentences\\x18\\x04 \\x03(\\x0b\\x32\\x13.text_data.Sentence\"Q\\n\\x0bSampledData\\x12\\x0e\\n\\x06source\\x18\\x01 \\x01(\\t\\x12\\x0c\\n\\x04name\\x18\\x02 \\x01(\\t\\x12$\\n\\x07samples\\x18\\x03 \\x03(\\x0b\\x32\\x13.text_data.Sentenceb\\x06proto3'\n)\n\n_globals = globals()\n_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)\n_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, \"text_data_pb2\", _globals)\nif _descriptor._USE_C_DESCRIPTORS == False:\n    DESCRIPTOR._options = None\n    _globals[\"_SEMANTICS\"]._serialized_start = 30\n    _globals[\"_SEMANTICS\"]._serialized_end = 57\n    _globals[\"_SENTENCE\"]._serialized_start = 59\n    _globals[\"_SENTENCE\"]._serialized_end = 125\n    _globals[\"_TEXTDATA\"]._serialized_start = 127\n    _globals[\"_TEXTDATA\"]._serialized_end = 207\n    _globals[\"_SAMPLEDDATA\"]._serialized_start = 209\n    _globals[\"_SAMPLEDDATA\"]._serialized_end = 290\n# @@protoc_insertion_point(module_scope)\n"
  },
  {
    "path": "fish_speech/datasets/protos/text_data_stream.py",
    "content": "import struct\n\nfrom .text_data_pb2 import TextData\n\n\ndef read_pb_stream(f):\n    while True:\n        buf = f.read(4)\n        if len(buf) == 0:\n            break\n        size = struct.unpack(\"I\", buf)[0]\n        buf = f.read(size)\n        text_data = TextData()\n        text_data.ParseFromString(buf)\n        yield text_data\n\n\ndef write_pb_stream(f, text_data):\n    buf = text_data.SerializeToString()\n    f.write(struct.pack(\"I\", len(buf)))\n    f.write(buf)\n\n\ndef pack_pb_stream(text_data):\n    buf = text_data.SerializeToString()\n    return struct.pack(\"I\", len(buf)) + buf\n\n\ndef split_pb_stream(f):\n    while True:\n        head = f.read(4)\n        if len(head) == 0:\n            break\n        size = struct.unpack(\"I\", head)[0]\n        buf = f.read(size)\n        yield head + buf\n"
  },
  {
    "path": "fish_speech/datasets/semantic.py",
    "content": "import random\nfrom dataclasses import dataclass\nfrom itertools import chain\nfrom pathlib import Path\nfrom random import Random\nfrom typing import Optional, Union\n\nimport numpy as np\nimport pyarrow.parquet as pq\nimport torch\nimport torch.nn.functional as F\nfrom datasets.download.streaming_download_manager import xopen\nfrom huggingface_hub import HfApi\nfrom lightning import LightningDataModule\nfrom torch.distributed import get_rank, get_world_size, is_initialized\nfrom torch.utils.data import DataLoader, Dataset, IterableDataset, get_worker_info\n\nfrom fish_speech.content_sequence import ContentSequence, TextPart, VQPart\n\nCODEBOOK_PAD_TOKEN_ID = 0\n\nfrom fish_speech.datasets.protos.text_data_pb2 import SampledData\nfrom fish_speech.datasets.protos.text_data_stream import read_pb_stream\nfrom fish_speech.text.clean import clean_text\nfrom fish_speech.tokenizer import FishTokenizer\nfrom fish_speech.utils import RankedLogger\nfrom fish_speech.utils.braceexpand import braceexpand\n\nlog = RankedLogger(__name__, rank_zero_only=True)\n\n\ndef split_by_rank_worker(files):\n    # We need to know the total number of devices\n    # to split the data properly\n\n    total_devices = 1\n    if is_initialized():\n        total_devices = get_world_size()\n\n    worker_info = get_worker_info()\n    if worker_info is not None:\n        total_devices *= worker_info.num_workers\n\n    if len(files) < total_devices:\n        # Repeat the files N times to match the number of devices\n        files = files * (total_devices // len(files) + 1)\n\n    # DDP\n    if is_initialized():\n        files = files[get_rank() :: get_world_size()]\n\n    # Split by worker\n    if worker_info is not None:\n        files = files[worker_info.id :: worker_info.num_workers]\n\n    return files\n\n\nclass AutoTextSemanticInstructionIterableDataset(IterableDataset):\n    \"\"\"\n    Auto Augment Dataset by Speaker\n\n    1. 
Random concatenate multiple sentences from the same speaker to form a longer sentence\n    2. Automatically normalize the text\n\n    For interactive mode, we use the following format (multiple sequences):\n    <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>\n\n    For non-interactive mode, we use the following format (one long sequence):\n    <s> [INST] text [/INST] ... </s>\n    \"\"\"\n\n    def __init__(\n        self,\n        proto_files: list[str],\n        seed: int = 42,\n        interactive_prob: float = 0.5,\n        max_length: int = 1024,\n        tokenizer: FishTokenizer = None,\n        use_speaker: bool | float = True,\n        causal: bool = True,\n        num_codebooks: Optional[int] = None,\n        skip_text_prob: float = 0.0,\n    ):\n        \"\"\"\n        Args:\n            proto_files: proto buf files if using local data\n            seed: random seed\n            interactive_prob: probability to use interactive mode\n            max_length: max length of the text\n            tokenizer: tokenizer\n            use_speaker: include speaker information in the prompt\n            causal: use causal sampling when using local data, disable will lead to random sampling\n            num_codebooks: number of codebooks, if None, it will be automatically detected\n            skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode\n        \"\"\"\n\n        super().__init__()\n\n        assert 0 <= interactive_prob <= 1, \"interactive_prob must be in [0, 1]\"\n\n        self.seed = seed\n        self.max_length = max_length\n        self.tokenizer = tokenizer\n        self.interactive_prob = interactive_prob\n        self.use_speaker = use_speaker\n        self.proto_files = proto_files\n        self.causal = causal\n        self.num_codebooks = num_codebooks\n        self.skip_text_prob = skip_text_prob\n\n        self.groups = None\n\n    def __iter__(self):\n        while True:\n         
   yield self.augment()\n\n    def init_mock_data_server(self):\n        if self.groups is not None:\n            return\n\n        # Expand the proto files\n        expanded_proto_files = []\n        for filename in self.proto_files:\n            for i in braceexpand(filename):\n                i = Path(i)\n                if i.is_file():\n                    expanded_proto_files.append(i)\n                elif i.is_dir():\n                    expanded_proto_files.extend(i.rglob(\"*.proto\"))\n                    expanded_proto_files.extend(i.rglob(\"*.protos\"))\n                else:\n                    raise ValueError(f\"{i} is not a file or directory\")\n\n        expanded_proto_files = sorted(expanded_proto_files)\n        Random(self.seed).shuffle(expanded_proto_files)\n\n        self.groups = []\n        shard_proto_files = split_by_rank_worker(expanded_proto_files)\n        log.info(\n            f\"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files\"\n        )\n\n        count = 0\n        for filename in shard_proto_files:\n            with open(filename, \"rb\") as f:\n                for text_data in read_pb_stream(f):\n                    self.groups.append(text_data)\n                    count += 1\n\n        log.info(f\"Read total {count} groups of data\")\n\n        # Shuffle the lines\n        Random(self.seed).shuffle(self.groups)\n        self.group_weights = [len(i.sentences) for i in self.groups]\n\n    def sample_data(self):\n        if self.groups is None:\n            self.init_mock_data_server()\n\n        # Shuffle unique lines, estimate that each sample is at least 20 tokens\n        num_samples = self.max_length // 20\n\n        # choice group based on their number of samples\n        group = random.choices(self.groups, weights=self.group_weights, k=1)[0]\n\n        if self.causal:\n            # Sample in order\n            if num_samples >= len(group.sentences):\n                samples = group.sentences\n         
   else:\n                begin = random.randint(0, len(group.sentences) - num_samples)\n                samples = group.sentences[begin : begin + num_samples]\n        else:\n            samples = random.choices(\n                group.sentences, k=min(num_samples, len(group.sentences))\n            )\n\n        return SampledData(\n            source=group.source,\n            name=group.name,\n            samples=samples,\n        )\n\n    def pack_sentences(\n        self,\n        sentences: list[str],\n        semantics: list,\n        # speaker: Optional[str] = None, # speaker is now handled by tokens\n        skip_text: bool = False,\n    ):\n\n        seq = ContentSequence()\n\n        seq.append(TextPart(text=\"Speak out the provided text.\"))\n\n        # User's turn\n        cated_sentences = \" \".join(sentences)\n        if skip_text:\n            cated_sentences = \"<|skip_text|>\"\n\n        seq.append(\n            TextPart(text=f\"<|speaker:user|> {cated_sentences}\"),\n            add_end=True,\n        )\n\n        # Assistant's turn\n        vq_codes = [x.values for x in semantics[0]]\n        vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)\n\n        # 将 cal_loss=True 直接关联到 VQPart 上，这比之前更精确\n        vq_part = VQPart(codes=vq_codes_tensor, cal_loss=True)\n\n        # 将多个 parts 一起添加，最后也加上 <|im_end|>\n        seq.append(\n            [TextPart(text=\"<|speaker:assistant|> <|voice|>\"), vq_part],\n            add_end=True,\n        )\n\n        encoded = seq.encode(\n            tokenizer=self.tokenizer,\n        )\n\n        num_codebooks = (\n            len(semantics[0]) if self.num_codebooks is None else self.num_codebooks\n        )\n\n        tokens_raw = encoded.tokens\n        tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)\n        tokens[0] = tokens_raw\n\n        vq_parts = encoded.vq_parts\n        vq_parts = [part.to(tokens.device) for part in vq_parts]\n        vq_parts = torch.cat(vq_parts, 
dim=1)\n        tokens[1:, encoded.vq_mask_tokens] = vq_parts\n\n        labels_raw = encoded.labels\n        labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)\n        labels[0, :] = labels_raw\n        labels[1:, encoded.vq_mask_labels] = vq_parts\n        labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID\n\n        tokens = tokens.long()\n        labels = labels.long()\n\n        # Verify the padding is correct, and the last token is eos\n        assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()\n        assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()\n\n        return tokens, labels\n\n    def augment(self):\n        response = self.sample_data()\n        if len(response.samples) == 0:\n            # Invalid group\n            return None\n\n        samples = list(response.samples)\n        all_tokens, all_labels = [], []\n\n        while len(samples) > 0:\n            sentence = samples.pop(0)\n            text = clean_text(random.choice(sentence.texts))\n\n            tokens, labels = self.pack_sentences(\n                sentences=[text],\n                semantics=[sentence.semantics],\n                # speaker=response.name if use_speaker else None,\n                skip_text=random.random() < self.skip_text_prob,\n            )\n\n            all_tokens.append(tokens)\n            all_labels.append(labels)\n\n        tokens = torch.cat(all_tokens, dim=1)\n        labels = torch.cat(all_labels, dim=1)\n\n        # Verify that the length is correct\n        assert tokens.size(1) == labels.size(1), f\"{tokens.size(1)} != {labels.size(1)}\"\n\n        data = {\"tokens\": tokens, \"labels\": labels}\n\n        return data\n\n\nclass AutoTextSemanticInstructionDataset(Dataset):\n    \"\"\"\n    Auto Augment Dataset by Speaker\n\n    1. Random concatenate multiple sentences from the same speaker to form a longer sentence\n    2. 
Automatically normalize the text\n\n    For interactive mode, we use the following format (multiple sequences):\n    <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>\n\n    For non-interactive mode, we use the following format (one long sequence):\n    <s> [INST] text [/INST] ... </s>\n    \"\"\"\n\n    def __init__(\n        self,\n        proto_files: list[str],\n        seed: int = 42,\n        interactive_prob: float = 0.5,\n        max_length: int = 1024,\n        tokenizer: FishTokenizer = None,\n        use_speaker: bool | float = True,\n        causal: bool = True,\n        num_codebooks: Optional[int] = None,\n        skip_text_prob: float = 0.0,\n    ):\n        \"\"\"\n        Args:\n            proto_files: proto buf files if using local data\n            seed: random seed\n            interactive_prob: probability to use interactive mode\n            max_length: max length of the text\n            tokenizer: tokenizer\n            use_speaker: include speaker information in the prompt\n            causal: use causal sampling when using local data, disable will lead to random sampling\n            num_codebooks: number of codebooks, if None, it will be automatically detected\n            skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode\n        \"\"\"\n        super().__init__()\n\n        assert 0 <= interactive_prob <= 1, \"interactive_prob must be in [0, 1]\"\n\n        self.seed = seed\n        self.max_length = max_length\n        self.tokenizer = tokenizer\n        self.interactive_prob = interactive_prob\n        self.use_speaker = use_speaker\n        self.proto_files = proto_files\n        self.causal = causal\n        self.num_codebooks = num_codebooks\n        self.skip_text_prob = skip_text_prob\n\n        self.data = []\n        self._init_data()\n\n    def _init_data(self):\n        expanded_proto_files = []\n        for filename in self.proto_files:\n            for i in 
braceexpand(filename):\n                i = Path(i)\n                if i.is_file():\n                    expanded_proto_files.append(i)\n                elif i.is_dir():\n                    expanded_proto_files.extend(i.rglob(\"*.proto\"))\n                    expanded_proto_files.extend(i.rglob(\"*.protos\"))\n                else:\n                    raise ValueError(f\"{i} is not a file or directory\")\n\n        expanded_proto_files = sorted(expanded_proto_files)\n        Random(self.seed).shuffle(expanded_proto_files)\n\n        groups = []\n        shard_proto_files = split_by_rank_worker(expanded_proto_files)\n        log.info(\n            f\"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files\"\n        )\n\n        count = 0\n        for filename in shard_proto_files:\n            with open(filename, \"rb\") as f:\n                for text_data in read_pb_stream(f):\n                    groups.append(text_data)\n                    count += 1\n\n        log.info(f\"Read total {count} groups of data\")\n\n        for group in groups:\n            if len(group.sentences) == 0:\n                continue\n\n            samples = list(group.sentences)\n            for sentence in samples:\n                text = clean_text(random.choice(sentence.texts))\n\n                tokens, labels = self.pack_sentences(\n                    sentences=[text],\n                    semantics=[sentence.semantics],\n                    skip_text=random.random() < self.skip_text_prob,\n                )\n\n                self.data.append({\"tokens\": tokens, \"labels\": labels})\n\n        random.Random(self.seed).shuffle(self.data)\n\n    def __len__(self):\n        return len(self.data)\n\n    def __getitem__(self, idx):\n        return self.data[idx]\n\n    def pack_sentences(\n        self,\n        sentences: list[str],\n        semantics: list,\n        skip_text: bool = False,\n    ):\n        messages = [\n            Message(\n                
role=\"system\",\n                parts=[TextPart(text=\"Speak out the provided text.\")],\n            )\n        ]\n\n        cated_sentences = \" \".join(sentences)\n        if skip_text:\n            cated_sentences = \"<|skip_text|>\"\n\n        messages.append(\n            Message(\n                role=\"user\",\n                parts=[TextPart(text=cated_sentences)],\n            )\n        )\n\n        vq_codes = [x.values for x in semantics[0]]\n        vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)\n        vqpart = VQPart(codes=vq_codes_tensor)\n        messages.append(\n            Message(\n                role=\"assistant\",\n                parts=[TextPart(text=\"<|voice|>\"), vqpart],\n                cal_loss=True,\n            )\n        )\n\n        num_codebooks = (\n            len(semantics[0]) if self.num_codebooks is None else self.num_codebooks\n        )\n\n        conversation = Conversation(messages=messages)\n        encoded = conversation.encode(\n            tokenizer=self.tokenizer,\n        )\n\n        tokens_raw = encoded.tokens\n        tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)\n        tokens[0] = tokens_raw\n\n        vq_parts = encoded.vq_parts\n        vq_parts = [part.to(tokens.device) for part in vq_parts]\n        vq_parts = torch.cat(vq_parts, dim=1)\n        tokens[1:, encoded.vq_mask_tokens] = vq_parts\n\n        labels_raw = encoded.labels\n        labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)\n        labels[0, :] = labels_raw\n        labels[1:, encoded.vq_mask_labels] = vq_parts\n        labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID\n\n        tokens = tokens.long()\n        labels = labels.long()\n\n        assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()\n        assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()\n\n        return tokens, labels\n\n\nclass InterleaveDataset(IterableDataset):\n    def 
__init__(\n        self,\n        datasets: list[IterableDataset],\n        probabilities: list[float],\n        seed: int = 42,\n    ):\n        super().__init__()\n\n        self.datasets = datasets\n        self.probabilities = probabilities\n        self.seed = seed\n\n    def __iter__(self):\n        rng = np.random.default_rng(self.seed)\n        dataset_iterators = [iter(dataset) for dataset in self.datasets]\n\n        while True:\n            # Random choice one\n            dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)\n            dataset_iterator = dataset_iterators[dataset_idx]\n\n            try:\n                yield next(dataset_iterator)\n            except StopIteration:\n                # Exhausted, create a new iterator\n                dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])\n                yield next(dataset_iterators[dataset_idx])\n\n\n@dataclass\nclass TextDataCollator:\n    tokenizer: FishTokenizer\n    max_length: int = 1024\n\n    def __call__(self, examples):\n        if \"negative_tokens\" in examples:\n            positive_examples = []\n            negative_examples = []\n\n            for i in examples:\n                positive_examples.append(\n                    {\n                        \"tokens\": i[\"tokens\"],\n                        \"labels\": i[\"labels\"],\n                    }\n                )\n                negative_examples.append(\n                    {\n                        \"tokens\": i[\"negative_tokens\"],\n                        \"labels\": i[\"negative_labels\"],\n                    }\n                )\n\n            examples = positive_examples + negative_examples\n\n        return self.batchify(examples)\n\n    def batchify(self, examples, tokens_key=\"tokens\", labels_key=\"labels\"):\n        tokens, attention_masks, labels = [], [], []\n\n        # Calculate the max length\n        max_tokens_length = 0\n        for example in examples:\n         
   max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))\n        max_tokens_length = min(max_tokens_length, self.max_length)\n\n        for example in examples:\n            _tokens = example[tokens_key][:, :max_tokens_length]\n            _labels = example[labels_key][:, :max_tokens_length]\n            _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)\n            tokens_length = _tokens.size(1)\n            _attention_mask[:tokens_length] = False\n\n            assert tokens_length == _labels.size(\n                1\n            ), f\"{tokens_length} != {_labels.size(1)}\"\n\n            if tokens_length < max_tokens_length:\n                _tokens = F.pad(\n                    _tokens,\n                    (0, max_tokens_length - tokens_length),\n                    value=self.tokenizer.get_token_id(\"<|end_of_text|>\"),\n                )\n                _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID\n                _labels = F.pad(\n                    _labels, (0, max_tokens_length - _labels.size(1)), value=-100\n                )\n\n            tokens.append(_tokens)\n            attention_masks.append(_attention_mask)\n            labels.append(_labels)\n\n        tokens = torch.stack(tokens, dim=0)\n        attention_masks = torch.stack(attention_masks, dim=0)\n        labels = torch.stack(labels, dim=0)\n\n        return {\n            \"inputs\": tokens,\n            \"attention_masks\": attention_masks,\n            \"labels\": labels,\n        }\n\n\nclass SemanticDataModule(LightningDataModule):\n    def __init__(\n        self,\n        train_dataset: Union[\n            AutoTextSemanticInstructionDataset,\n            AutoTextSemanticInstructionIterableDataset,\n            InterleaveDataset,\n        ],\n        val_dataset: Union[\n            AutoTextSemanticInstructionDataset,\n            AutoTextSemanticInstructionIterableDataset,\n            InterleaveDataset,\n        ],\n        batch_size: 
int = 32,\n        tokenizer: FishTokenizer = None,\n        max_length: int = 1024,\n        num_workers: int = 4,\n    ):\n        super().__init__()\n\n        self.train_dataset = train_dataset\n        self.val_dataset = val_dataset\n        self.batch_size = batch_size\n        self.tokenizer = tokenizer\n        self.max_length = max_length\n        self.num_workers = num_workers\n\n    def train_dataloader(self):\n        return DataLoader(\n            self.train_dataset,\n            batch_size=self.batch_size,\n            collate_fn=TextDataCollator(self.tokenizer, self.max_length),\n            num_workers=self.num_workers,\n            persistent_workers=True,\n        )\n\n    def val_dataloader(self):\n        return DataLoader(\n            self.val_dataset,\n            batch_size=self.batch_size,\n            collate_fn=TextDataCollator(self.tokenizer, self.max_length),\n            num_workers=self.num_workers,\n            persistent_workers=True,\n        )\n\n\nif __name__ == \"__main__\":\n    from tqdm import tqdm\n\n    ds = AutoTextSemanticInstructionDataset(\n        [\"data/protos\"],\n        tokenizer=FishTokenizer(\"checkpoints/fish-speech-1.5/tokenizer.tiktoken\"),\n        use_speaker=False,\n        interactive_prob=1.0,\n        skip_text_prob=0.5,\n    )\n\n    for i in range(100):\n        # Please uncomment line 235 to visualize the tokenized message\n        print(ds[i])\n"
  },
  {
    "path": "fish_speech/datasets/vqgan.py",
    "content": "from dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Optional\n\nimport librosa\nimport numpy as np\nimport torch\nfrom lightning import LightningDataModule\nfrom torch.utils.data import DataLoader, Dataset\n\nfrom fish_speech.utils import RankedLogger\n\nlogger = RankedLogger(__name__, rank_zero_only=False)\n\n\nclass VQGANDataset(Dataset):\n    def __init__(\n        self,\n        filelist: str,\n        sample_rate: int = 32000,\n        hop_length: int = 640,\n        slice_frames: Optional[int] = None,\n    ):\n        super().__init__()\n\n        filelist = Path(filelist)\n        root = filelist.parent\n\n        self.files = [\n            root / line.strip()\n            for line in filelist.read_text(encoding=\"utf-8\").splitlines()\n            if line.strip()\n        ]\n        self.sample_rate = sample_rate\n        self.hop_length = hop_length\n        self.slice_frames = slice_frames\n\n    def __len__(self):\n        return len(self.files)\n\n    def get_item(self, idx):\n        file = self.files[idx]\n\n        audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)\n\n        # Slice audio and features\n        if (\n            self.slice_frames is not None\n            and audio.shape[0] > self.slice_frames * self.hop_length\n        ):\n            start = np.random.randint(\n                0, audio.shape[0] - self.slice_frames * self.hop_length\n            )\n            audio = audio[start : start + self.slice_frames * self.hop_length]\n\n        if len(audio) == 0:\n            return None\n\n        max_value = np.abs(audio).max()\n        if max_value > 1.0:\n            audio = audio / max_value\n\n        return {\n            \"audio\": torch.from_numpy(audio),\n        }\n\n    def __getitem__(self, idx):\n        try:\n            return self.get_item(idx)\n        except Exception as e:\n            import traceback\n\n            traceback.print_exc()\n            
logger.error(f\"Error loading {self.files[idx]}: {e}\")\n            return None\n\n\n@dataclass\nclass VQGANCollator:\n    def __call__(self, batch):\n        batch = [x for x in batch if x is not None]\n\n        audio_lengths = torch.tensor([len(x[\"audio\"]) for x in batch])\n        audio_maxlen = audio_lengths.max()\n\n        # Right-pad each clip with zeros up to the longest clip in the batch\n        audios = []\n        for x in batch:\n            audios.append(\n                torch.nn.functional.pad(x[\"audio\"], (0, audio_maxlen - len(x[\"audio\"])))\n            )\n\n        return {\n            \"audios\": torch.stack(audios),\n            \"audio_lengths\": audio_lengths,\n        }\n\n\nclass VQGANDataModule(LightningDataModule):\n    def __init__(\n        self,\n        train_dataset: VQGANDataset,\n        val_dataset: VQGANDataset,\n        batch_size: int = 32,\n        num_workers: int = 4,\n        val_batch_size: Optional[int] = None,\n    ):\n        super().__init__()\n\n        self.train_dataset = train_dataset\n        self.val_dataset = val_dataset\n        self.batch_size = batch_size\n        self.val_batch_size = val_batch_size or batch_size\n        self.num_workers = num_workers\n\n    def train_dataloader(self):\n        return DataLoader(\n            self.train_dataset,\n            batch_size=self.batch_size,\n            collate_fn=VQGANCollator(),\n            num_workers=self.num_workers,\n            shuffle=True,\n            persistent_workers=True,\n        )\n\n    def val_dataloader(self):\n        return DataLoader(\n            self.val_dataset,\n            batch_size=self.val_batch_size,\n            collate_fn=VQGANCollator(),\n            num_workers=self.num_workers,\n            persistent_workers=True,\n        )\n\n\nif __name__ == \"__main__\":\n    dataset = VQGANDataset(\"data/LibriTTS_R/vq_train_filelist.txt\")\n    dataloader = DataLoader(\n        dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()\n  
  )\n\n    for batch in dataloader:\n        print(batch[\"audios\"].shape)\n        print(batch[\"features\"].shape)\n        print(batch[\"audio_lengths\"])\n        print(batch[\"feature_lengths\"])\n        break\n"
  },
  {
    "path": "fish_speech/i18n/README.md",
    "content": "## i18n Folder Attribution\n\nThe `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:\n\n### fish_speech/i18n/core.py\n\n**Related code from RVC:**\n[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)\n\n**Initial commit:**\nadd localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)\n\n**Initial author:**\n[@L4Ph](https://github.com/L4Ph)\n\n### fish_speech/i18n/scan.py\n\n**Related code from RVC:**\n[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)\n\n**Initial commit:**\nFile for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)\n\n**Initial author:**\n[@towzeur](https://github.com/towzeur)\n\nWe appreciate the contributions of the RVC project and its authors.\n"
  },
  {
    "path": "fish_speech/i18n/__init__.py",
    "content": "from .core import i18n\n\n__all__ = [\"i18n\"]\n"
  },
  {
    "path": "fish_speech/i18n/core.py",
    "content": "import json\nimport locale\nfrom pathlib import Path\n\nI18N_FILE_PATH = Path(__file__).parent / \"locale\"\nDEFAULT_LANGUAGE = \"en_US\"\n\n\ndef load_language_list(language):\n    with open(I18N_FILE_PATH / f\"{language}.json\", \"r\", encoding=\"utf-8\") as f:\n        language_list = json.load(f)\n\n    return language_list\n\n\nclass I18nAuto:\n    def __init__(self):\n        i18n_file = Path(\".locale\")\n\n        if i18n_file.exists():\n            with open(i18n_file, \"r\", encoding=\"utf-8\") as f:\n                language = f.read().strip()\n        else:\n            # getlocale can't identify the system's language ((None, None))\n            language = locale.getdefaultlocale()[0]\n\n        if (I18N_FILE_PATH / f\"{language}.json\").exists() is False:\n            language = DEFAULT_LANGUAGE\n\n        self.language = language\n        self.language_map = load_language_list(language)\n\n    def __call__(self, key):\n        return self.language_map.get(key, key)\n\n    def __repr__(self):\n        return \"Use Language: \" + self.language\n\n\ni18n = I18nAuto()\n"
  },
  {
    "path": "fish_speech/i18n/locale/en_US.json",
    "content": "{\n  \"16-mixed is recommended for 10+ series GPU\": \"16-mixed is recommended for 10+ series GPU\",\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"5 to 10 seconds of reference audio, useful for specifying speaker.\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\",\n  \"Accumulate Gradient Batches\": \"Accumulate Gradient Batches\",\n  \"Add to Processing Area\": \"Add to Processing Area\",\n  \"Added path successfully!\": \"Added path successfully!\",\n  \"Advanced Config\": \"Advanced Config\",\n  \"Base LLAMA Model\": \"Base LLAMA Model\",\n  \"Batch Inference\": \"Batch Inference\",\n  \"Batch Size\": \"Batch Size\",\n  \"Changing with the Model Path\": \"Changing with the Model Path\",\n  \"Chinese\": \"Chinese\",\n  \"Compile Model\": \"Compile Model\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"Compile the model can significantly reduce the inference time, but will increase cold start time\",\n  \"Copy\": \"Copy\",\n  \"Data Preprocessing\": \"Data Preprocessing\",\n  \"Data Preprocessing Path\": \"Data Preprocessing Path\",\n  \"Data Source\": \"Data Source\",\n  \"Decoder Model Config\": \"Decoder Model Config\",\n  \"Decoder Model Path\": \"Decoder Model Path\",\n  \"Disabled\": \"Disabled\",\n  \"Enable Reference Audio\": \"Enable Reference Audio\",\n  \"English\": \"English\",\n  \"Error Message\": \"Error Message\",\n  \"File Preprocessing\": \"File Preprocessing\",\n  \"Generate\": \"Generate\",\n  \"Generated Audio\": \"Generated Audio\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\": \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\",\n  \"Infer interface is closed\": 
\"Infer interface is closed\",\n  \"Inference Configuration\": \"Inference Configuration\",\n  \"Inference Server Configuration\": \"Inference Server Configuration\",\n  \"Inference Server Error\": \"Inference Server Error\",\n  \"Inferring interface is launched at {}\": \"Inferring interface is launched at {}\",\n  \"Initial Learning Rate\": \"Initial Learning Rate\",\n  \"Input Audio & Source Path for Transcription\": \"Input Audio & Source Path for Transcription\",\n  \"Input Text\": \"Input Text\",\n  \"Invalid path: {}\": \"Invalid path: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"It is recommended to use CUDA, if you have low configuration, use CPU\",\n  \"Iterative Prompt Length, 0 means off\": \"Iterative Prompt Length, 0 means off\",\n  \"Japanese\": \"Japanese\",\n  \"LLAMA Configuration\": \"LLAMA Configuration\",\n  \"LLAMA Model Config\": \"LLAMA Model Config\",\n  \"LLAMA Model Path\": \"LLAMA Model Path\",\n  \"Labeling Device\": \"Labeling Device\",\n  \"LoRA Model to be merged\": \"LoRA Model to be merged\",\n  \"Maximum Audio Duration\": \"Maximum Audio Duration\",\n  \"Maximum Length per Sample\": \"Maximum Length per Sample\",\n  \"Maximum Training Steps\": \"Maximum Training Steps\",\n  \"Maximum tokens per batch, 0 means no limit\": \"Maximum tokens per batch, 0 means no limit\",\n  \"Merge\": \"Merge\",\n  \"Merge LoRA\": \"Merge LoRA\",\n  \"Merge successfully\": \"Merge successfully\",\n  \"Minimum Audio Duration\": \"Minimum Audio Duration\",\n  \"Model Output Path\": \"Model Output Path\",\n  \"Model Size\": \"Model Size\",\n  \"Move\": \"Move\",\n  \"Move files successfully\": \"Move files successfully\",\n  \"No audio generated, please check the input text.\": \"No audio generated, please check the input text.\",\n  \"No selected options\": \"No selected options\",\n  \"Number of Workers\": \"Number of Workers\",\n  \"Open Inference Server\": \"Open Inference Server\",\n  \"Open Labeler WebUI\": 
\"Open Labeler WebUI\",\n  \"Open Tensorboard\": \"Open Tensorboard\",\n  \"Opened labeler in browser\": \"Opened labeler in browser\",\n  \"Optional Label Language\": \"Optional Label Language\",\n  \"Optional online ver\": \"Optional online ver\",\n  \"Output Path\": \"Output Path\",\n  \"Path error, please check the model file exists in the corresponding path\": \"Path error, please check the model file exists in the corresponding path\",\n  \"Precision\": \"Precision\",\n  \"Probability of applying Speaker Condition\": \"Probability of applying Speaker Condition\",\n  \"Put your text here.\": \"Put your text here.\",\n  \"Reference Audio\": \"Reference Audio\",\n  \"Reference Text\": \"Reference Text\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\",\n  \"Remove Selected Data\": \"Remove Selected Data\",\n  \"Removed path successfully!\": \"Removed path successfully!\",\n  \"Repetition Penalty\": \"Repetition Penalty\",\n  \"Save model every n steps\": \"Save model every n steps\",\n  \"Select LLAMA ckpt\": \"Select LLAMA ckpt\",\n  \"Select VITS ckpt\": \"Select VITS ckpt\",\n  \"Select VQGAN ckpt\": \"Select VQGAN ckpt\",\n  \"Select source file processing method\": \"Select source file processing method\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"Select the model to be trained (Depending on the Tab page you are on)\",\n  \"Selected: {}\": \"Selected: {}\",\n  \"Speaker\": \"Speaker\",\n  \"Speaker is identified by the folder name\": \"Speaker is identified by the folder name\",\n  \"Start Training\": \"Start Training\",\n  \"Streaming Audio\": \"Streaming Audio\",\n  \"Streaming Generate\": \"Streaming Generate\",\n  \"Tensorboard Host\": \"Tensorboard Host\",\n  \"Tensorboard Log Path\": \"Tensorboard Log Path\",\n  \"Tensorboard Port\": \"Tensorboard Port\",\n  \"Tensorboard interface is closed\": 
\"Tensorboard interface is closed\",\n  \"Tensorboard interface is launched at {}\": \"Tensorboard interface is launched at {}\",\n  \"Text is too long, please keep it under {} characters.\": \"Text is too long, please keep it under {} characters.\",\n  \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\": \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\",\n  \"Training Configuration\": \"Training Configuration\",\n  \"Training Error\": \"Training Error\",\n  \"Training stopped\": \"Training stopped\",\n  \"Type name of the speaker\": \"Type name of the speaker\",\n  \"Type the path or select from the dropdown\": \"Type the path or select from the dropdown\",\n  \"Use LoRA\": \"Use LoRA\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"Use LoRA can save GPU memory, but may reduce the quality of the model\",\n  \"Use filelist\": \"Use filelist\",\n  \"Use large for 10G+ GPU, medium for 5G, small for 2G\": \"Use large for 10G+ GPU, medium for 5G, small for 2G\",\n  \"VITS Configuration\": \"VITS Configuration\",\n  \"VQGAN Configuration\": \"VQGAN Configuration\",\n  \"Validation Batch Size\": \"Validation Batch Size\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\",\n  \"WebUI Host\": \"WebUI Host\",\n  \"WebUI Port\": \"WebUI Port\",\n  \"Whisper Model\": \"Whisper Model\",\n  \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models 
[here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\",\n  \"latest\": \"latest\",\n  \"new\": \"new\",\n  \"Realtime Transform Text\": \"Realtime Transform Text\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"Normalization Result Preview (Currently Only Chinese)\",\n  \"Text Normalization\": \"Text Normalization\",\n  \"Select Example Audio\": \"Select Example Audio\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/locale/es_ES.json",
    "content": "{\n  \"16-mixed is recommended for 10+ series GPU\": \"se recomienda 16-mixed para GPU de la serie 10+\",\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"5 a 10 segundos de audio de referencia, útil para especificar el hablante.\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).\",\n  \"Accumulate Gradient Batches\": \"Acumular lotes de gradientes\",\n  \"Add to Processing Area\": \"Agregar al Área de Procesamiento\",\n  \"Added path successfully!\": \"¡Ruta agregada exitosamente!\",\n  \"Advanced Config\": \"Configuración Avanzada\",\n  \"Base LLAMA Model\": \"Modelo Base LLAMA\",\n  \"Batch Inference\": \"Inferencia por Lote\",\n  \"Batch Size\": \"Tamaño del Lote\",\n  \"Changing with the Model Path\": \"Cambiando con la Ruta del Modelo\",\n  \"Chinese\": \"Chino\",\n  \"Compile Model\": \"Compilar Modelo\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío\",\n  \"Copy\": \"Copiar\",\n  \"Data Preprocessing\": \"Preprocesamiento de Datos\",\n  \"Data Preprocessing Path\": \"Ruta de Preprocesamiento de Datos\",\n  \"Data Source\": \"Fuente de Datos\",\n  \"Decoder Model Config\": \"Configuración del modelo decodificador\",\n  \"Decoder Model Path\": \"Ruta del modelo decodificador\",\n  \"Disabled\": \"Desactivado\",\n  \"Enable Reference Audio\": \"Habilitar Audio de Referencia\",\n  \"English\": \"Inglés\",\n  \"Error Message\": \"Mensaje de Error\",\n  \"File Preprocessing\": \"Preprocesamiento de Archivos\",\n  \"Generate\": \"Generar\",\n  \"Generated Audio\": \"Audio Generado\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab 
format\": \"Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab\",\n  \"Infer interface is closed\": \"La interfaz de inferencia está cerrada\",\n  \"Inference Configuration\": \"Configuración de Inferencia\",\n  \"Inference Server Configuration\": \"Configuración del Servidor de Inferencia\",\n  \"Inference Server Error\": \"Error del Servidor de Inferencia\",\n  \"Inferring interface is launched at {}\": \"La interfaz de inferencia se ha lanzado en {}\",\n  \"Initial Learning Rate\": \"Tasa de Aprendizaje Inicial\",\n  \"Input Audio & Source Path for Transcription\": \"Audio de Entrada y Ruta de Origen para Transcripción\",\n  \"Input Text\": \"Texto de Entrada\",\n  \"Invalid path: {}\": \"Ruta inválida: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"Se recomienda usar CUDA, si tiene una configuración baja, use CPU\",\n  \"Iterative Prompt Length, 0 means off\": \"Longitud de la Indicación Iterativa, 0 significa apagado\",\n  \"Japanese\": \"Japonés\",\n  \"LLAMA Configuration\": \"Configuración de LLAMA\",\n  \"LLAMA Model Config\": \"Configuración del Modelo LLAMA\",\n  \"LLAMA Model Path\": \"Ruta del Modelo LLAMA\",\n  \"Labeling Device\": \"Dispositivo de Etiquetado\",\n  \"LoRA Model to be merged\": \"Modelo LoRA a fusionar\",\n  \"Maximum Audio Duration\": \"Duración máxima de audio\",\n  \"Maximum Length per Sample\": \"Longitud Máxima por Muestra\",\n  \"Maximum Training Steps\": \"Pasos Máximos de Entrenamiento\",\n  \"Maximum tokens per batch, 0 means no limit\": \"Máximo de tokens por lote, 0 significa sin límite\",\n  \"Merge\": \"Fusionar\",\n  \"Merge LoRA\": \"Fusionar LoRA\",\n  \"Merge successfully\": \"Fusionado exitosamente\",\n  \"Minimum Audio Duration\": \"Duración mínima de audio\",\n  \"Model Output Path\": \"Ruta de Salida del Modelo\",\n  \"Model Size\": \"Tamaño del Modelo\",\n  \"Move\": \"Mover\",\n  \"Move files successfully\": 
\"Archivos movidos exitosamente\",\n  \"No audio generated, please check the input text.\": \"No se generó audio, por favor verifique el texto de entrada.\",\n  \"No selected options\": \"No hay opciones seleccionadas\",\n  \"Number of Workers\": \"Número de Trabajadores\",\n  \"Open Inference Server\": \"Abrir Servidor de Inferencia\",\n  \"Open Labeler WebUI\": \"Abrir Interfaz Web del Etiquetador\",\n  \"Open Tensorboard\": \"Abrir Tensorboard\",\n  \"Opened labeler in browser\": \"Se abrió el etiquetador en el navegador\",\n  \"Optional Label Language\": \"Idioma de Etiquetado Opcional\",\n  \"Optional online ver\": \"Ver en línea opcional\",\n  \"Output Path\": \"Ruta de Salida\",\n  \"Path error, please check the model file exists in the corresponding path\": \"Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente\",\n  \"Precision\": \"Precisión\",\n  \"Probability of applying Speaker Condition\": \"Probabilidad de aplicar Condición de Hablante\",\n  \"Put your text here.\": \"Ponga su texto aquí.\",\n  \"Reference Audio\": \"Audio de Referencia\",\n  \"Reference Text\": \"Texto de Referencia\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"El código relacionado y los pesos se publican bajo la FISH AUDIO RESEARCH LICENSE.\",\n  \"Remove Selected Data\": \"Eliminar Datos Seleccionados\",\n  \"Removed path successfully!\": \"¡Ruta eliminada exitosamente!\",\n  \"Repetition Penalty\": \"Penalización por Repetición\",\n  \"Save model every n steps\": \"Guardar modelo cada n pasos\",\n  \"Select LLAMA ckpt\": \"Seleccionar punto de control LLAMA\",\n  \"Select VITS ckpt\": \"Seleccionar punto de control VITS\",\n  \"Select VQGAN ckpt\": \"Seleccionar punto de control VQGAN\",\n  \"Select source file processing method\": \"Seleccione el método de procesamiento de archivos fuente\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"Seleccione el modelo a 
entrenar (Dependiendo de la pestaña en la que se encuentre)\",\n  \"Selected: {}\": \"Seleccionado: {}\",\n  \"Speaker\": \"Hablante\",\n  \"Speaker is identified by the folder name\": \"El hablante se identifica por el nombre de la carpeta\",\n  \"Start Training\": \"Iniciar Entrenamiento\",\n  \"Streaming Audio\": \"transmisión de audio\",\n  \"Streaming Generate\": \"síntesis en flujo\",\n  \"Tensorboard Host\": \"Host de Tensorboard\",\n  \"Tensorboard Log Path\": \"Ruta de Registro de Tensorboard\",\n  \"Tensorboard Port\": \"Puerto de Tensorboard\",\n  \"Tensorboard interface is closed\": \"La interfaz de Tensorboard está cerrada\",\n  \"Tensorboard interface is launched at {}\": \"La interfaz de Tensorboard se ha lanzado en {}\",\n  \"Text is too long, please keep it under {} characters.\": \"El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.\",\n  \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\": \"La ruta de la carpeta de entrada a la izquierda o la lista de archivos. 
Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.\",\n  \"Training Configuration\": \"Configuración de Entrenamiento\",\n  \"Training Error\": \"Error de Entrenamiento\",\n  \"Training stopped\": \"Entrenamiento detenido\",\n  \"Type name of the speaker\": \"Escriba el nombre del hablante\",\n  \"Type the path or select from the dropdown\": \"Escriba la ruta o seleccione de la lista desplegable\",\n  \"Use LoRA\": \"Usar LoRA\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo\",\n  \"Use filelist\": \"Usar lista de archivos\",\n  \"Use large for 10G+ GPU, medium for 5G, small for 2G\": \"Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G\",\n  \"VITS Configuration\": \"Configuración de VITS\",\n  \"VQGAN Configuration\": \"Configuración de VQGAN\",\n  \"Validation Batch Size\": \"Tamaño del Lote de Validación\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.\",\n  \"WebUI Host\": \"Host de WebUI\",\n  \"WebUI Port\": \"Puerto de WebUI\",\n  \"Whisper Model\": \"Modelo Whisper\",\n  \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1.5).\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"Se 
recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+\",\n  \"latest\": \"más reciente\",\n  \"new\": \"nuevo\",\n  \"Realtime Transform Text\": \"Transformación de Texto en Tiempo Real\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"Vista Previa del Resultado de Normalización (Actualmente Solo Chino)\",\n  \"Text Normalization\": \"Normalización de Texto\",\n  \"Select Example Audio\": \"Seleccionar Audio de Ejemplo\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/locale/ja_JP.json",
    "content": "{\n  \"16-mixed is recommended for 10+ series GPU\": \"10シリーズ以降のGPUには16-mixedをお勧めします\",\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"話者を指定するのに役立つ、5～10秒のリファレンスオーディオ。\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。\",\n  \"Accumulate Gradient Batches\": \"勾配バッチの累積\",\n  \"Add to Processing Area\": \"処理エリアに追加\",\n  \"Added path successfully!\": \"パスの追加に成功しました！\",\n  \"Advanced Config\": \"詳細設定\",\n  \"Base LLAMA Model\": \"基本LLAMAモデル\",\n  \"Batch Inference\": \"バッチ推論\",\n  \"Batch Size\": \"バッチサイズ\",\n  \"Changing with the Model Path\": \"モデルのパスに伴って変化する\",\n  \"Chinese\": \"中国語\",\n  \"Compile Model\": \"モデルのコンパイル\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります\",\n  \"Copy\": \"コピー\",\n  \"Data Preprocessing\": \"データ前処理\",\n  \"Data Preprocessing Path\": \"データ前処理パス\",\n  \"Data Source\": \"データソース\",\n  \"Decoder Model Config\": \"デコーダーモデルの構成\",\n  \"Decoder Model Path\": \"デコーダーモデルのパス\",\n  \"Disabled\": \"無効\",\n  \"Enable Reference Audio\": \"リファレンスオーディオを有効にする\",\n  \"English\": \"英語\",\n  \"Error Message\": \"エラーメッセージ\",\n  \"File Preprocessing\": \"文書前处理\",\n  \"Generate\": \"生成\",\n  \"Generated Audio\": \"生成されたオーディオ\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\": \"音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています\",\n  \"Infer interface is closed\": \"推論インターフェースが閉じられています\",\n  \"Inference Configuration\": \"推論設定\",\n  \"Inference Server Configuration\": \"推論サーバー設定\",\n  \"Inference Server Error\": \"推論サーバーエラー\",\n  \"Inferring interface is launched at {}\": \"推論インターフェースが{}で起動しました\",\n  \"Initial Learning Rate\": \"初期学習率\",\n  \"Input Audio & Source Path for Transcription\": \"入力オーディオと文字起こしのソースパス\",\n  \"Input 
Text\": \"入力テキスト\",\n  \"Invalid path: {}\": \"無効なパス: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください\",\n  \"Iterative Prompt Length, 0 means off\": \"反復プロンプト長。0はオフを意味します\",\n  \"Japanese\": \"日本語\",\n  \"LLAMA Configuration\": \"LLAMA設定\",\n  \"LLAMA Model Config\": \"LLAMAモデル設定\",\n  \"LLAMA Model Path\": \"LLAMAモデルパス\",\n  \"Labeling Device\": \"ラベリングデバイス\",\n  \"LoRA Model to be merged\": \"マージするLoRAモデル\",\n  \"Maximum Audio Duration\": \"最大オーディオの長さ\",\n  \"Maximum Length per Sample\": \"サンプルあたりの最大長\",\n  \"Maximum Training Steps\": \"最大トレーニングステップ数\",\n  \"Maximum tokens per batch, 0 means no limit\": \"バッチあたりの最大トークン数。0は制限なしを意味します\",\n  \"Merge\": \"マージ\",\n  \"Merge LoRA\": \"LoRAのマージ\",\n  \"Merge successfully\": \"マージに成功しました\",\n  \"Minimum Audio Duration\": \"最小オーディオの長さ\",\n  \"Model Output Path\": \"モデル出力パス\",\n  \"Model Size\": \"モデルサイズ\",\n  \"Move\": \"移動\",\n  \"Move files successfully\": \"ファイルの移動に成功しました\",\n  \"No audio generated, please check the input text.\": \"オーディオが生成されていません。入力テキストを確認してください。\",\n  \"No selected options\": \"選択されたオプションはありません\",\n  \"Number of Workers\": \"ワーカー数\",\n  \"Open Inference Server\": \"推論サーバーを開く\",\n  \"Open Labeler WebUI\": \"ラベラーWebUIを開く\",\n  \"Open Tensorboard\": \"Tensorboardを開く\",\n  \"Opened labeler in browser\": \"ブラウザでラベラーを開きました\",\n  \"Optional Label Language\": \"オプションのラベル言語\",\n  \"Optional online ver\": \"オプションのオンラインバージョン\",\n  \"Output Path\": \"出力パス\",\n  \"Path error, please check the model file exists in the corresponding path\": \"パスエラー。対応するパスにモデルファイルが存在するか確認してください\",\n  \"Precision\": \"精度\",\n  \"Probability of applying Speaker Condition\": \"話者条件を適用する確率\",\n  \"Put your text here.\": \"ここにテキストを入力してください。\",\n  \"Reference Audio\": \"リファレンスオーディオ\",\n  \"Reference Text\": \"リファレンステキスト\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"関連コードと重みはFISH AUDIO RESEARCH LICENSEの下でリリースされます。\",\n  
\"Remove Selected Data\": \"選択したデータを削除\",\n  \"Removed path successfully!\": \"パスの削除に成功しました！\",\n  \"Repetition Penalty\": \"反復ペナルティ\",\n  \"Save model every n steps\": \"nステップごとにモデルを保存\",\n  \"Select LLAMA ckpt\": \" LLAMA チェックポイントを選択\",\n  \"Select VITS ckpt\": \"VITS チェックポイントを選択\",\n  \"Select VQGAN ckpt\": \"VQGAN チェックポイントを選択\",\n  \"Select source file processing method\": \"ソースファイルの処理方法を選択\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"タブページに応じてトレーニングするモデルを選択してください\",\n  \"Selected: {}\": \"選択済み: {}\",\n  \"Speaker\": \"話者\",\n  \"Speaker is identified by the folder name\": \"話者はフォルダ名で識別されます\",\n  \"Start Training\": \"トレーニング開始\",\n  \"Streaming Audio\": \"ストリーミングオーディオ\",\n  \"Streaming Generate\": \"ストリーミング合成\",\n  \"Tensorboard Host\": \"Tensorboardホスト\",\n  \"Tensorboard Log Path\": \"Tensorboardログパス\",\n  \"Tensorboard Port\": \"Tensorboardポート\",\n  \"Tensorboard interface is closed\": \"Tensorboardインターフェースが閉じられています\",\n  \"Tensorboard interface is launched at {}\": \"Tensorboardインターフェースが{}で起動されました\",\n  \"Text is too long, please keep it under {} characters.\": \"テキストが長すぎます。{}文字以内に抑えてください。\",\n  \"The path of the input folder on the left or the filelist. 
Whether checked or not, it will be used for subsequent training in this list.\": \"左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。\",\n  \"Training Configuration\": \"トレーニング設定\",\n  \"Training Error\": \"トレーニングエラー\",\n  \"Training stopped\": \"トレーニングが停止しました\",\n  \"Type name of the speaker\": \"話者の名前を入力\",\n  \"Type the path or select from the dropdown\": \"パスを入力するか、ドロップダウンから選択してください\",\n  \"Use LoRA\": \"LoRAを使用\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります\",\n  \"Use filelist\": \"ファイルリストを使用\",\n  \"Use large for 10G+ GPU, medium for 5G, small for 2G\": \"10G以上のGPUには大、5Gには中、2Gには小を使用してください\",\n  \"VITS Configuration\": \"VITS の構成\",\n  \"VQGAN Configuration\": \"VQGAN の構成\",\n  \"Validation Batch Size\": \"検証バッチサイズ\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"前処理フォルダの状態を表示（スライダーを使用してツリーの深さを制御）\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。\",\n  \"WebUI Host\": \"WebUIホスト\",\n  \"WebUI Port\": \"WebUIポート\",\n  \"Whisper Model\": \"Whisperモデル\",\n  \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1.5)にあります。\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします\",\n  \"latest\": \"最新\",\n  \"new\": \"新規\",\n  \"Realtime Transform Text\": \"リアルタイム変換テキスト\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"正規化結果プレビュー（現在は中国語のみ）\",\n  \"Text Normalization\": \"テキスト正規化\",\n  \"Select Example Audio\": \"サンプル音声を選択\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/locale/ko_KR.json",
    "content": "{\n  \"16-mixed is recommended for 10+ series GPU\": \"10+ 시리즈 GPU에는 16-mixed를 권장합니다.\",\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.\",\n  \"Accumulate Gradient Batches\": \"그라디언트 배치 누적\",\n  \"Add to Processing Area\": \"처리 영역에 추가\",\n  \"Added path successfully!\": \"경로가 성공적으로 추가되었습니다!\",\n  \"Advanced Config\": \"고급 설정\",\n  \"Base LLAMA Model\": \"기본 LLAMA 모델\",\n  \"Batch Inference\": \"배치 추론\",\n  \"Batch Size\": \"배치 크기\",\n  \"Changing with the Model Path\": \"모델 경로에 따라 변경 중\",\n  \"Chinese\": \"중국어\",\n  \"Compile Model\": \"모델 컴파일\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.\",\n  \"Copy\": \"복사\",\n  \"Data Preprocessing\": \"데이터 전처리\",\n  \"Data Preprocessing Path\": \"데이터 전처리 경로\",\n  \"Data Source\": \"데이터 소스\",\n  \"Decoder Model Config\": \"디코더 모델 설정\",\n  \"Decoder Model Path\": \"디코더 모델 경로\",\n  \"Disabled\": \"비활성화 됨\",\n  \"Enable Reference Audio\": \"참고 음성 활성화\",\n  \"English\": \"영어\",\n  \"Error Message\": \"오류 메시지\",\n  \"File Preprocessing\": \"파일 전처리\",\n  \"Generate\": \"생성\",\n  \"Generated Audio\": \"생성된 오디오\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\": \"오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.\",\n  \"Infer interface is closed\": \"추론 인터페이스가 닫혔습니다.\",\n  \"Inference Configuration\": \"추론 설정\",\n  \"Inference Server Configuration\": \"추론 서버 설정\",\n  \"Inference Server Error\": \"추론 서버 오류\",\n  \"Inferring interface is launched at {}\": \"추론 인터페이스가 {}에서 시작되었습니다.\",\n  \"Initial Learning Rate\": \"초기 학습률\",\n  \"Input Audio & Source Path for Transcription\": \"전사할 입력 오디오 및 소스 
경로\",\n  \"Input Text\": \"입력 텍스트\",\n  \"Invalid path: {}\": \"유효하지 않은 경로: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.\",\n  \"Iterative Prompt Length, 0 means off\": \"반복 프롬프트 길이. (0:비활성화)\",\n  \"Japanese\": \"일본어\",\n  \"LLAMA Configuration\": \"LLAMA 설정\",\n  \"LLAMA Model Config\": \"LLAMA 모델 설정\",\n  \"LLAMA Model Path\": \"LLAMA 모델 경로\",\n  \"Labeling Device\": \"라벨링 장치\",\n  \"LoRA Model to be merged\": \"병합할 LoRA 모델\",\n  \"Maximum Audio Duration\": \"최대 오디오 길이\",\n  \"Maximum Length per Sample\": \"샘플당 최대 길이\",\n  \"Maximum Training Steps\": \"최대 학습 단계\",\n  \"Maximum tokens per batch, 0 means no limit\": \"배치당 최대 토큰 수(0:제한 없음)\",\n  \"Merge\": \"병합\",\n  \"Merge LoRA\": \"LoRA 병합\",\n  \"Merge successfully\": \"성공적으로 병합 되었습니다.\",\n  \"Minimum Audio Duration\": \"최소 오디오 길이\",\n  \"Model Output Path\": \"모델 출력 경로\",\n  \"Model Size\": \"모델 크기\",\n  \"Move\": \"이동\",\n  \"Move files successfully\": \"파일이 성공적으로 이동되었습니다.\",\n  \"No audio generated, please check the input text.\": \"생성된 오디오가 없습니다. 
입력된 텍스트를 확인하세요.\",\n  \"No selected options\": \"옵션이 선택되지 않았습니다.\",\n  \"Number of Workers\": \"작업자 수\",\n  \"Open Inference Server\": \"추론 서버 열기\",\n  \"Open Labeler WebUI\": \"라벨러 WebUI 열기\",\n  \"Open Tensorboard\": \"Tensorboard 열기\",\n  \"Opened labeler in browser\": \"브라우저에서 라벨러가 열렸습니다.\",\n  \"Optional Label Language\": \"선택적 라벨 언어\",\n  \"Optional online ver\": \"온라인 버전 선택\",\n  \"Output Path\": \"출력 경로\",\n  \"Path error, please check the model file exists in the corresponding path\": \"경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.\",\n  \"Precision\": \"정밀도\",\n  \"Probability of applying Speaker Condition\": \"화자 조건 적용 확률\",\n  \"Put your text here.\": \"여기에 텍스트를 입력하세요.\",\n  \"Reference Audio\": \"참고 오디오\",\n  \"Reference Text\": \"참고 텍스트\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"관련 코드 및 가중치는 FISH AUDIO RESEARCH LICENSE 하에 배포됩니다.\",\n  \"Remove Selected Data\": \"선택한 데이터 제거\",\n  \"Removed path successfully!\": \"경로가 성공적으로 제거되었습니다!\",\n  \"Repetition Penalty\": \"반복 패널티\",\n  \"Save model every n steps\": \"n 단계마다 모델 저장\",\n  \"Select LLAMA ckpt\": \"LLAMA ckpt 선택\",\n  \"Select VITS ckpt\": \"VITS ckpt 선택\",\n  \"Select VQGAN ckpt\": \"VQGAN ckpt 선택\",\n  \"Select source file processing method\": \"소스 파일 처리 방법 선택\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"학습할 모델 선택(탭 페이지에 따라 다름)\",\n  \"Selected: {}\": \"선택됨: {}\",\n  \"Speaker\": \"화자\",\n  \"Speaker is identified by the folder name\": \"화자는 폴더 이름으로 식별됩니다\",\n  \"Start Training\": \"학습 시작\",\n  \"Streaming Audio\": \"스트리밍 오디오\",\n  \"Streaming Generate\": \"스트리밍 생성\",\n  \"Tensorboard Host\": \"Tensorboard 호스트\",\n  \"Tensorboard Log Path\": \"Tensorboard 로그 경로\",\n  \"Tensorboard Port\": \"Tensorboard 포트\",\n  \"Tensorboard interface is closed\": \"Tensorboard 인터페이스가 닫혔습니다\",\n  \"Tensorboard interface is launched at {}\": \"Tensorboard 인터페이스가 {}에서 시작되었습니다.\",\n  \"Text is too long, please keep it under {} characters.\": \"텍스트가 너무 
깁니다. {}자 이하로 입력해주세요.\",\n  \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\": \"왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.\",\n  \"Training Configuration\": \"학습 설정\",\n  \"Training Error\": \"학습 오류\",\n  \"Training stopped\": \"학습이 중지되었습니다.\",\n  \"Type name of the speaker\": \"화자의 이름을 입력하세요.\",\n  \"Type the path or select from the dropdown\": \"경로를 입력하거나 드롭다운에서 선택하세요.\",\n  \"Use LoRA\": \"LoRA 사용\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.\",\n  \"Use filelist\": \"파일 목록 사용\",\n  \"Use large for 10G+ GPU, medium for 5G, small for 2G\": \"10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.\",\n  \"VITS Configuration\": \"VITS 설정\",\n  \"VQGAN Configuration\": \"VQGAN 설정\",\n  \"Validation Batch Size\": \"검증 배치 크기\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"모델의 오용에 대해 책임지지 않습니다. 
사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.\",\n  \"WebUI Host\": \"WebUI 호스트\",\n  \"WebUI Port\": \"WebUI 포트\",\n  \"Whisper Model\": \"Whisper 모델\",\n  \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1.5)에서 확인하실 수 있습니다.\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다\",\n  \"latest\": \"최신\",\n  \"new\": \"새로운\",\n  \"Realtime Transform Text\": \"실시간 텍스트 변환\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"정규화 결과 미리보기(현재 중국어만 지원)\",\n  \"Text Normalization\": \"텍스트 정규화\",\n  \"Select Example Audio\": \"예시 오디오 선택\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/locale/pt_BR.json",
    "content": "{\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"5 a 10 segundos de áudio de referência, útil para especificar o orador.\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).\",\n  \"Accumulate Gradient Batches\": \"Acumular Lotes de Gradiente\",\n  \"Add to Processing Area\": \"Adicionar à Área de Processamento\",\n  \"Added path successfully!\": \"Caminho adicionado com sucesso!\",\n  \"Advanced Config\": \"Configuração Avançada\",\n  \"Base LLAMA Model\": \"Modelo LLAMA Base\",\n  \"Batch Inference\": \"Inferência em Lote\",\n  \"Batch Size\": \"Tamanho do Lote\",\n  \"Changing with the Model Path\": \"Alterando com o Caminho do Modelo\",\n  \"Compile Model\": \"Compilar Modelo\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial\",\n  \"Copy\": \"Copiar\",\n  \"Data Preprocessing\": \"Pré-processamento de Dados\",\n  \"Data Preprocessing Path\": \"Caminho de Pré-processamento de Dados\",\n  \"Data Source\": \"Fonte de Dados\",\n  \"Decoder Model Config\": \"Configuração do Modelo Decodificador\",\n  \"Decoder Model Path\": \"Caminho do Modelo Decodificador\",\n  \"Disabled\": \"Desativado\",\n  \"Enable Initial Prompt\": \"Habilitar Prompt Inicial\",\n  \"Enable Reference Audio\": \"Habilitar Áudio de Referência\",\n  \"English\": \"Inglês\",\n  \"Japanese\": \"Japonês\",\n  \"Chinese\": \"Chinês\",\n  \"Portuguese\": \"Português\",\n  \"Spanish\": \"Espanhol\",\n  \"Error Message\": \"Mensagem de Erro\",\n  \"Faster Whisper, Up to 5g GPU memory usage\": \"Faster Whisper (Usa até 5 GB de vRAM)\",\n  \"File Preprocessing\": \"Pré-processamento de Arquivos\",\n  \"Generate\": \"Gerar\",\n  
\"Generated Audio\": \"Áudio Gerado\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\": \"Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)\",\n  \"Infer interface is closed\": \"A interface de inferência foi fechada\",\n  \"Inference Configuration\": \"Configuração de Inferência\",\n  \"Inference Server Configuration\": \"Configuração do Servidor de Inferência\",\n  \"Inference Server Error\": \"Erro do Servidor de Inferência\",\n  \"Inferring interface is launched at {}\": \"A interface de inferência foi iniciada em {}\",\n  \"Initial Learning Rate\": \"Taxa de Aprendizagem Inicial\",\n  \"Initial Prompt\": \"Prompt Inicial\",\n  \"Initial prompt can provide contextual or vocabulary-specific guidance to the model.\": \"O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.\",\n  \"Input Audio & Source Path for Transcription\": \"Entrada de Áudio/Caminho de Origem para Transcrição\",\n  \"Input Text\": \"Texto de Entrada\",\n  \"Invalid path: {}\": \"Caminho inválido: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"Para GPUs Nvidia é recomendado usar CUDA. 
Se não tiver uma GPU Nvidia, use CPU\",\n  \"Iterative Prompt Length, 0 means off\": \"Comprimento do Prompt Iterativo (0 = desativado)\",\n  \"LLAMA Configuration\": \"Configuração do LLAMA\",\n  \"LLAMA Model Config\": \"Configuração do Modelo LLAMA\",\n  \"LLAMA Model Path\": \"Caminho do Modelo LLAMA\",\n  \"Labeling Device\": \"Dispositivo de Rotulagem\",\n  \"LoRA Model to be merged\": \"Modelo LoRA para mesclagem\",\n  \"Maximum Length per Sample\": \"Comprimento Máximo por Amostra\",\n  \"Maximum Training Steps\": \"Etapas Máximas de Treinamento\",\n  \"Maximum tokens per batch, 0 means no limit\": \"Número máximo de tokens por lote, 0 significa sem limite\",\n  \"Merge\": \"Mesclar\",\n  \"Merge LoRA\": \"Mesclar LoRA\",\n  \"Merge successfully\": \"Mesclado com sucesso\",\n  \"Model Output Path\": \"Caminho de Saída do Modelo\",\n  \"Model Quantization\": \"Quantização do Modelo\",\n  \"Model Size\": \"Tamanho do Modelo\",\n  \"Move\": \"Mover\",\n  \"Move files successfully\": \"Arquivos movidos com sucesso\",\n  \"No audio generated, please check the input text.\": \"Nenhum áudio gerado, verifique o texto de entrada.\",\n  \"No selected options\": \"Nenhuma opção selecionada\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)\",\n  \"Number of Workers\": \"Número de Processos\",\n  \"Open Inference Server\": \"Abrir Servidor de Inferência\",\n  \"Open Labeler WebUI\": \"Abrir WebUI de Rotulagem\",\n  \"Open Tensorboard\": \"Abrir Tensorboard\",\n  \"Opened labeler in browser\": \"WebUI de rotulagem aberta no navegador\",\n  \"Optional Label Language\": \"Idioma do Rótulo (Opcional)\",\n  \"Optional online ver\": \"Versão online (opcional)\",\n  \"Output Path\": \"Caminho de Saída\",\n  \"Path error, please check the model file exists in the corresponding path\": \"Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente\",\n  
\"Post-quantification Precision\": \"Precisão Pós-quantização\",\n  \"Precision\": \"Precisão\",\n  \"Probability of applying Speaker Condition\": \"Probabilidade de Aplicar Condição de Orador\",\n  \"Put your text here.\": \"Insira seu texto aqui.\",\n  \"Quantify\": \"Quantizar\",\n  \"Quantify successfully\": \"Quantizado com sucesso\",\n  \"Realtime Transform Text\": \"Transformar Texto em Tempo Real\",\n  \"Reference Audio\": \"Áudio de Referência\",\n  \"Reference Text\": \"Texto de Referência\",\n  \"warning\": \"Aviso\",\n  \"Pre-processing begins...\": \"O pré-processamento começou!\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"O código relacionado e os pesos são licenciados sob a FISH AUDIO RESEARCH LICENSE.\",\n  \"Remove Selected Data\": \"Remover Dados Selecionados\",\n  \"Removed path successfully!\": \"Caminho removido com sucesso!\",\n  \"Repetition Penalty\": \"Penalidade de Repetição\",\n  \"Save model every n steps\": \"Salvar modelo a cada n etapas\",\n  \"Select LLAMA ckpt\": \"Selecionar .ckpt do LLAMA\",\n  \"Select source file processing method\": \"Escolha como processar o arquivo de origem\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"Selecione o modelo para o treinamento (dependendo da aba em que você está)\",\n  \"Selected: {}\": \"Selecionado: {}\",\n  \"Speaker is identified by the folder name\": \"O orador é identificado pelo nome da pasta\",\n  \"Start Training\": \"Iniciar Treinamento\",\n  \"Streaming Audio\": \"Áudio em Streaming\",\n  \"Streaming Generate\": \"Geração em Streaming\",\n  \"Tensorboard Host\": \"Host do Tensorboard\",\n  \"Tensorboard Log Path\": \"Caminho de Log do Tensorboard\",\n  \"Tensorboard Port\": \"Porta do Tensorboard\",\n  \"Tensorboard interface is closed\": \"A interface do Tensorboard está fechada\",\n  \"Tensorboard interface is launched at {}\": \"A interface do Tensorboard foi iniciada em {}\",\n  \"Text Normalization\": 
\"Normalização de Texto\",\n  \"Text is too long, please keep it under {} characters.\": \"O texto é muito longo. Mantenha-o com menos de {} caracteres.\",\n  \"The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase\": \"Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência\",\n  \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\": \"O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.\",\n  \"Training Configuration\": \"Configuração de Treinamento\",\n  \"Training Error\": \"Erro de Treinamento\",\n  \"Training stopped\": \"Treinamento interrompido!\",\n  \"Type the path or select from the dropdown\": \"Digite o caminho ou selecione no menu suspenso\",\n  \"Use LoRA\": \"Usar LoRA\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade\",\n  \"Use filelist\": \"Usar lista de arquivos\",\n  \"VQGAN Configuration\": \"Configuração do VQGAN\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"Não nos responsabilizamos por qualquer uso indevido do modelo. 
Por favor, considere as leis e regulamentações locais antes de usá-lo.\",\n  \"WebUI Host\": \"Host da WebUI\",\n  \"WebUI Port\": \"Porta da WebUI\",\n  \"Whisper Model\": \"Modelo Whisper\",\n  \"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1.5).\",\n  \"auto\": \"automático\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+\",\n  \"latest\": \"mais recente\",\n  \"new\": \"novo\",\n  \"This audio introduces the basic concepts and applications of artificial intelligence and machine learning.\": \"Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.\",\n  \"You don't need to train this model!\": \"Não é necessário treinar este modelo!\",\n  \"Yes\": \"Sim\",\n  \"No\": \"Não\",\n  \"version:\": \"versão:\",\n  \"author:\": \"autor:\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/locale/zh_CN.json",
    "content": "{\n  \"16-mixed is recommended for 10+ series GPU\": \"10+ 系列 GPU 建议使用 16-mixed\",\n  \"5 to 10 seconds of reference audio, useful for specifying speaker.\": \"5 到 10 秒的参考音频，适用于指定音色。\",\n  \"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\": \"由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.\",\n  \"Accumulate Gradient Batches\": \"梯度累积批次\",\n  \"Add to Processing Area\": \"加入处理区\",\n  \"Added path successfully!\": \"添加路径成功!\",\n  \"Advanced Config\": \"高级参数\",\n  \"Base LLAMA Model\": \"基础 LLAMA 模型\",\n  \"Batch Inference\": \"批量推理\",\n  \"Batch Size\": \"批次大小\",\n  \"Changing with the Model Path\": \"随模型路径变化\",\n  \"Chinese\": \"中文\",\n  \"Compile Model\": \"编译模型\",\n  \"Compile the model can significantly reduce the inference time, but will increase cold start time\": \"编译模型可以显著减少推理时间，但会增加冷启动时间\",\n  \"Copy\": \"复制\",\n  \"Data Preprocessing\": \"数据预处理\",\n  \"Data Preprocessing Path\": \"数据预处理路径\",\n  \"Data Source\": \"数据源\",\n  \"Decoder Model Config\": \"解码器模型配置\",\n  \"Decoder Model Path\": \"解码器模型路径\",\n  \"Disabled\": \"禁用\",\n  \"Enable Reference Audio\": \"启用参考音频\",\n  \"English\": \"英文\",\n  \"Error Message\": \"错误信息\",\n  \"File Preprocessing\": \"文件预处理\",\n  \"Generate\": \"生成\",\n  \"Generated Audio\": \"音频\",\n  \"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format\": \"如果音频没有对应的文本，可以应用 ASR 辅助，支持 .txt 或 .lab 格式\",\n  \"Infer interface is closed\": \"推理界面已关闭\",\n  \"Inference Configuration\": \"推理配置\",\n  \"Inference Server Configuration\": \"推理服务器配置\",\n  \"Inference Server Error\": \"推理服务器错误\",\n  \"Inferring interface is launched at {}\": \"推理界面已在 {} 上启动\",\n  \"Initial Learning Rate\": \"初始学习率\",\n  \"Input Audio & Source Path for Transcription\": \"输入音频和转录源路径\",\n  \"Input Text\": \"输入文本\",\n  \"Invalid path: {}\": \"无效路径: {}\",\n  \"It is recommended to use CUDA, if you have low configuration, use CPU\": \"建议使用 
CUDA，如果配置较低，使用 CPU\",\n  \"Iterative Prompt Length, 0 means off\": \"迭代提示长度，0 表示关闭\",\n  \"Japanese\": \"日文\",\n  \"LLAMA Configuration\": \"LLAMA 配置\",\n  \"LLAMA Model Config\": \"LLAMA 模型配置\",\n  \"LLAMA Model Path\": \"LLAMA 模型路径\",\n  \"Labeling Device\": \"标注加速设备\",\n  \"LoRA Model to be merged\": \"要合并的 LoRA 模型\",\n  \"Maximum Audio Duration\": \"最大音频时长\",\n  \"Maximum Length per Sample\": \"每个样本的最大长度\",\n  \"Maximum Training Steps\": \"最大训练步数\",\n  \"Maximum tokens per batch, 0 means no limit\": \"每批最大令牌数，0 表示无限制\",\n  \"Merge\": \"合并\",\n  \"Merge LoRA\": \"合并 LoRA\",\n  \"Merge successfully\": \"合并成功\",\n  \"Minimum Audio Duration\": \"最小音频时长\",\n  \"Model Output Path\": \"模型输出路径\",\n  \"Model Size\": \"模型规模\",\n  \"Move\": \"移动\",\n  \"Move files successfully\": \"移动文件成功\",\n  \"No audio generated, please check the input text.\": \"没有生成音频，请检查输入文本.\",\n  \"No selected options\": \"没有选择的选项\",\n  \"Number of Workers\": \"数据加载进程数\",\n  \"Open Inference Server\": \"打开推理服务器\",\n  \"Open Labeler WebUI\": \"打开标注工具\",\n  \"Open Tensorboard\": \"打开 Tensorboard\",\n  \"Opened labeler in browser\": \"在浏览器中打开标注工具\",\n  \"Optional Label Language\": \"[可选] 标注语言\",\n  \"Optional online ver\": \"[可选] 使用在线版\",\n  \"Output Path\": \"输出路径\",\n  \"Path error, please check the model file exists in the corresponding path\": \"路径错误，请检查模型文件是否存在于相应路径\",\n  \"Precision\": \"精度\",\n  \"Probability of applying Speaker Condition\": \"应用说话人条件的概率\",\n  \"Put your text here.\": \"在此处输入文本.\",\n  \"Reference Audio\": \"参考音频\",\n  \"Reference Text\": \"参考文本\",\n  \"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\": \"相关代码和权重使用 FISH AUDIO RESEARCH LICENSE 许可证发布.\",\n  \"Remove Selected Data\": \"移除选中数据\",\n  \"Removed path successfully!\": \"移除路径成功!\",\n  \"Repetition Penalty\": \"重复惩罚\",\n  \"Save model every n steps\": \"每 n 步保存模型\",\n  \"Select LLAMA ckpt\": \"选择 LLAMA 检查点\",\n  \"Select VITS ckpt\": \"选择 VITS 检查点\",\n  \"Select VQGAN ckpt\": \"选择 VQGAN 检查点\",\n  
\"Select source file processing method\": \"选择源文件处理方法\",\n  \"Select the model to be trained (Depending on the Tab page you are on)\": \"根据您所在的选项卡页面选择要训练的模型\",\n  \"Selected: {}\": \"已选择: {}\",\n  \"Speaker\": \"说话人\",\n  \"Speaker is identified by the folder name\": \"自动根据父目录名称识别说话人\",\n  \"Start Training\": \"开始训练\",\n  \"Streaming Audio\": \"流式音频\",\n  \"Streaming Generate\": \"流式合成\",\n  \"Tensorboard Host\": \"Tensorboard 监听地址\",\n  \"Tensorboard Log Path\": \"Tensorboard 日志路径\",\n  \"Tensorboard Port\": \"Tensorboard 端口\",\n  \"Tensorboard interface is closed\": \"Tensorboard 界面已关闭\",\n  \"Tensorboard interface is launched at {}\": \"Tensorboard 界面已在 {} 上启动\",\n  \"Text is too long, please keep it under {} characters.\": \"文本太长，请保持在 {} 个字符以内.\",\n  \"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.\": \"左侧输入文件夹的路径或文件列表。无论是否选中，都将在此列表中用于后续训练.\",\n  \"Training Configuration\": \"训练配置\",\n  \"Training Error\": \"训练错误\",\n  \"Training stopped\": \"训练已停止\",\n  \"Type name of the speaker\": \"输入说话人的名称\",\n  \"Type the path or select from the dropdown\": \"输入路径或从下拉菜单中选择\",\n  \"Use LoRA\": \"使用 LoRA\",\n  \"Use LoRA can save GPU memory, but may reduce the quality of the model\": \"使用 LoRA 可以节省 GPU 内存，但可能会降低模型质量\",\n  \"Use filelist\": \"使用文件列表\",\n  \"Use large for 10G+ GPU, medium for 5G, small for 2G\": \"10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small\",\n  \"VITS Configuration\": \"VITS 配置\",\n  \"VQGAN Configuration\": \"VQGAN 配置\",\n  \"Validation Batch Size\": \"验证批次大小\",\n  \"View the status of the preprocessing folder (use the slider to control the depth of the tree)\": \"查看预处理文件夹的状态 (使用滑块控制树的深度)\",\n  \"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\": \"我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.\",\n  \"WebUI Host\": \"WebUI 监听地址\",\n  \"WebUI Port\": \"WebUI 端口\",\n  \"Whisper Model\": \"Whisper 模型\",\n  \"You can 
find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\": \"你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1.5) 找到模型.\",\n  \"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU\": \"30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed\",\n  \"latest\": \"最近的检查点\",\n  \"new\": \"创建新的检查点\",\n  \"Realtime Transform Text\": \"实时规范化文本\",\n  \"Normalization Result Preview (Currently Only Chinese)\": \"规范化结果预览\",\n  \"Text Normalization\": \"文本规范化\",\n  \"Select Example Audio\": \"选择参考音频\"\n}\n"
  },
  {
    "path": "fish_speech/i18n/scan.py",
    "content": "import ast\nimport glob\nimport json\nfrom collections import OrderedDict\nfrom pathlib import Path\n\nfrom loguru import logger\n\nfrom .core import DEFAULT_LANGUAGE, I18N_FILE_PATH\n\n\ndef extract_i18n_strings(node):\n    i18n_strings = []\n\n    if (\n        isinstance(node, ast.Call)\n        and isinstance(node.func, ast.Name)\n        and node.func.id == \"i18n\"\n    ):\n        for arg in node.args:\n            if isinstance(arg, ast.Str):\n                i18n_strings.append(arg.s)\n\n    for child_node in ast.iter_child_nodes(node):\n        i18n_strings.extend(extract_i18n_strings(child_node))\n\n    return i18n_strings\n\n\n# scan the directory for all .py files (recursively)\n# for each file, parse the code into an AST\n# for each AST, extract the i18n strings\n\nstrings = []\nfolders = [\"fish_speech\", \"tools\"]\n# for filename in glob.iglob(\"**/*.py\", recursive=True):\nfor folder in folders:\n    for f in Path(folder).rglob(\"*.py\"):\n        code = f.read_text(encoding=\"utf-8\")\n        if \"i18n(\" in code:\n            tree = ast.parse(code)\n            i18n_strings = extract_i18n_strings(tree)\n            logger.info(f\"Found {len(i18n_strings)} i18n strings in {f}\")\n            strings.extend(i18n_strings)\n\ncode_keys = set(strings)\nlogger.info(f\"Total unique: {len(code_keys)}\")\n\n\nstandard_file = I18N_FILE_PATH / f\"{DEFAULT_LANGUAGE}.json\"\nwith open(standard_file, \"r\", encoding=\"utf-8\") as f:\n    standard_data = json.load(f, object_pairs_hook=OrderedDict)\nstandard_keys = set(standard_data.keys())\n\n# Define the standard file name\nunused_keys = standard_keys - code_keys\nlogger.info(f\"Found {len(unused_keys)} unused keys in {standard_file}\")\nfor unused_key in unused_keys:\n    logger.info(f\"\\t{unused_key}\")\n\nmissing_keys = code_keys - standard_keys\nlogger.info(f\"Found {len(missing_keys)} missing keys in {standard_file}\")\nfor missing_key in missing_keys:\n    
logger.info(f\"\\t{missing_key}\")\n\ncode_keys_dict = OrderedDict()\nfor s in strings:\n    code_keys_dict[s] = s\n\n# write back\nwith open(standard_file, \"w\", encoding=\"utf-8\") as f:\n    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)\n    f.write(\"\\n\")\n\nlogger.info(f\"Updated {standard_file}\")\n\n\n# Define the standard file name\nstandard_file = I18N_FILE_PATH / f\"{DEFAULT_LANGUAGE}.json\"\n\n# Find all JSON files in the directory\ndir_path = I18N_FILE_PATH\nlanguages = [f for f in dir_path.glob(\"*.json\") if f.stem != DEFAULT_LANGUAGE]\n\n# Load the standard file\nwith open(standard_file, \"r\", encoding=\"utf-8\") as f:\n    standard_data = json.load(f, object_pairs_hook=OrderedDict)\n\n# Loop through each language file\nfor lang_file in languages:\n    # Load the language file\n    with open(lang_file, \"r\", encoding=\"utf-8\") as f:\n        lang_data = json.load(f, object_pairs_hook=OrderedDict)\n\n    # Find the difference between the language file and the standard file\n    diff = set(standard_data.keys()) - set(lang_data.keys())\n\n    miss = set(lang_data.keys()) - set(standard_data.keys())\n\n    # Add any missing keys to the language file\n    for key in diff:\n        lang_data[key] = \"#!\" + key\n        logger.info(f\"Added missing key: {key} to {lang_file}\")\n\n    # Del any extra keys to the language file\n    for key in miss:\n        del lang_data[key]\n        logger.info(f\"Del extra key: {key} from {lang_file}\")\n\n    # Sort the keys of the language file to match the order of the standard file\n    lang_data = OrderedDict(\n        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))\n    )\n\n    # Save the updated language file\n    with open(lang_file, \"w\", encoding=\"utf-8\") as f:\n        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)\n        f.write(\"\\n\")\n\n    logger.info(f\"Updated {lang_file}\")\n\nlogger.info(\"Done\")\n"
  },
  {
    "path": "fish_speech/inference_engine/__init__.py",
    "content": "import gc\nimport queue\nfrom typing import Generator\n\nimport numpy as np\nimport torch\nfrom loguru import logger\n\nfrom fish_speech.inference_engine.reference_loader import ReferenceLoader\nfrom fish_speech.inference_engine.utils import InferenceResult, wav_chunk_header\nfrom fish_speech.inference_engine.vq_manager import VQManager\nfrom fish_speech.models.dac.modded_dac import DAC\nfrom fish_speech.models.text2semantic.inference import (\n    GenerateRequest,\n    GenerateResponse,\n    WrappedGenerateResponse,\n)\nfrom fish_speech.utils import autocast_exclude_mps, set_seed\nfrom fish_speech.utils.schema import ServeTTSRequest\n\n\nclass TTSInferenceEngine(ReferenceLoader, VQManager):\n\n    def __init__(\n        self,\n        llama_queue: queue.Queue,\n        decoder_model: DAC,\n        precision: torch.dtype,\n        compile: bool,\n    ) -> None:\n\n        super().__init__()\n\n        self.llama_queue = llama_queue\n        self.decoder_model = decoder_model\n        self.precision = precision\n        self.compile = compile\n\n    @torch.inference_mode()\n    def inference(self, req: ServeTTSRequest) -> Generator[InferenceResult, None, None]:\n        \"\"\"\n        Main inference function:\n        - Loads the reference audio and text.\n        - Calls the LLAMA model for inference.\n        - Decodes the VQ tokens to audio.\n        \"\"\"\n\n        ref_id: str | None = req.reference_id\n        prompt_tokens, prompt_texts = [], []\n        # Load the reference audio and text based on id or hash\n        if ref_id is not None:\n            prompt_tokens, prompt_texts = self.load_by_id(ref_id, req.use_memory_cache)\n\n        elif req.references:\n            prompt_tokens, prompt_texts = self.load_by_hash(\n                req.references, req.use_memory_cache\n            )\n\n        # Set the random seed if provided\n        if req.seed is not None:\n            set_seed(req.seed)\n            logger.warning(f\"set seed: 
{req.seed}\")\n\n        # Get the symbolic tokens from the LLAMA model\n        response_queue = self.send_Llama_request(req, prompt_tokens, prompt_texts)\n\n        # Get the sample rate from the decoder model\n        if hasattr(self.decoder_model, \"spec_transform\"):\n            sample_rate = self.decoder_model.spec_transform.sample_rate\n        else:\n            sample_rate = self.decoder_model.sample_rate\n\n        # If streaming, send the header\n        if req.streaming:\n            yield InferenceResult(\n                code=\"header\",\n                audio=(\n                    sample_rate,\n                    np.array(wav_chunk_header(sample_rate=sample_rate)),\n                ),\n                error=None,\n            )\n\n        segments = []\n\n        while True:\n            # Get the response from the LLAMA model\n            wrapped_result: WrappedGenerateResponse = response_queue.get()\n            if wrapped_result.status == \"error\":\n                yield InferenceResult(\n                    code=\"error\",\n                    audio=None,\n                    error=(\n                        wrapped_result.response\n                        if isinstance(wrapped_result.response, Exception)\n                        else Exception(\"Unknown error\")\n                    ),\n                )\n                break\n\n            # Check the response type\n            if not isinstance(wrapped_result.response, GenerateResponse):\n                raise TypeError(\n                    \"Expected GenerateResponse, got {type(wrapped_result.response).__name__}\"\n                )\n\n            result: GenerateResponse = wrapped_result.response\n            if result.action != \"next\":\n                segment = self.get_audio_segment(result)\n\n                if req.streaming:  # Used only by the API server\n                    yield InferenceResult(\n                        code=\"segment\",\n                        
audio=(sample_rate, segment),\n                        error=None,\n                    )\n                segments.append(segment)\n            else:\n                break\n\n        # Clean up the memory\n        if torch.cuda.is_available():\n            torch.cuda.empty_cache()\n            gc.collect()\n\n        # Edge case: no audio generated\n        if len(segments) == 0:\n            yield InferenceResult(\n                code=\"error\",\n                audio=None,\n                error=RuntimeError(\"No audio generated, please check the input text.\"),\n            )\n        else:\n            # Streaming or not, return the final audio\n            audio = np.concatenate(segments, axis=0)\n            yield InferenceResult(\n                code=\"final\",\n                audio=(sample_rate, audio),\n                error=None,\n            )\n\n        return None\n\n    def send_Llama_request(\n        self, req: ServeTTSRequest, prompt_tokens: list, prompt_texts: list\n    ) -> queue.Queue:\n        \"\"\"\n        Send a request to the LLAMA model to generate the symbolic tokens.\n        \"\"\"\n\n        # Prepare the request\n        request = dict(\n            device=self.decoder_model.device,\n            max_new_tokens=req.max_new_tokens,\n            text=req.text,\n            top_p=req.top_p,\n            repetition_penalty=req.repetition_penalty,\n            temperature=req.temperature,\n            compile=self.compile,\n            iterative_prompt=req.chunk_length > 0,\n            chunk_length=req.chunk_length,\n            prompt_tokens=prompt_tokens,\n            prompt_text=prompt_texts,\n        )\n\n        # Create a queue to get the response\n        response_queue = queue.Queue()\n\n        # Send the request to the LLAMA model\n        self.llama_queue.put(\n            GenerateRequest(\n                request=request,\n                response_queue=response_queue,\n            )\n        )\n\n        return 
response_queue\n\n    def get_audio_segment(self, result: GenerateResponse) -> np.ndarray:\n        \"\"\"\n        Decode the VQ tokens to audio.\n        \"\"\"\n\n        # Don't use autocast on MPS devices\n        with autocast_exclude_mps(\n            device_type=self.decoder_model.device.type, dtype=self.precision\n        ):\n            # Decode the symbolic tokens to audio\n            segment = self.decode_vq_tokens(codes=result.codes)\n\n        # Convert the audio to numpy\n        return segment.float().cpu().numpy()\n"
  },
  {
    "path": "fish_speech/inference_engine/reference_loader.py",
    "content": "import io\nfrom hashlib import sha256\nfrom pathlib import Path\nfrom typing import Callable, Literal, Tuple\n\nimport torch\nimport torchaudio\nfrom loguru import logger\n\nfrom fish_speech.models.dac.modded_dac import DAC\nfrom fish_speech.utils.file import (\n    AUDIO_EXTENSIONS,\n    audio_to_bytes,\n    list_files,\n    read_ref_text,\n)\nfrom fish_speech.utils.schema import ServeReferenceAudio\n\n\nclass ReferenceLoader:\n    def __init__(self) -> None:\n        \"\"\"\n        Component of the TTSInferenceEngine class.\n        Loads and manages the cache for the reference audio and text.\n        \"\"\"\n        self.ref_by_id: dict = {}\n        self.ref_by_hash: dict = {}\n\n        # Make Pylance happy (attribut/method not defined...)\n        self.decoder_model: DAC\n        self.encode_reference: Callable\n\n        # Define the torchaudio backend\n        backends = torchaudio.list_audio_backends()\n        if \"ffmpeg\" in backends:\n            self.backend = \"ffmpeg\"\n        else:\n            self.backend = \"soundfile\"\n\n    def load_by_id(\n        self,\n        id: str,\n        use_cache: Literal[\"on\", \"off\"],\n    ) -> Tuple:\n        # Load the references audio and text by id\n        ref_folder = Path(\"references\") / id\n        ref_folder.mkdir(parents=True, exist_ok=True)\n        ref_audios = list_files(\n            ref_folder, AUDIO_EXTENSIONS, recursive=True, sort=False\n        )\n\n        if use_cache == \"off\" or id not in self.ref_by_id:\n            # If the references are not already loaded, encode them\n            prompt_tokens = [\n                self.encode_reference(\n                    # decoder_model=self.decoder_model,\n                    reference_audio=audio_to_bytes(str(ref_audio)),\n                    enable_reference_audio=True,\n                )\n                for ref_audio in ref_audios\n            ]\n            prompt_texts = [\n                
read_ref_text(str(ref_audio.with_suffix(\".lab\")))\n                for ref_audio in ref_audios\n            ]\n            self.ref_by_id[id] = (prompt_tokens, prompt_texts)\n\n        else:\n            # Reuse already encoded references\n            logger.info(\"Use same references\")\n            prompt_tokens, prompt_texts = self.ref_by_id[id]\n\n        return prompt_tokens, prompt_texts\n\n    def load_by_hash(\n        self,\n        references: list[ServeReferenceAudio],\n        use_cache: Literal[\"on\", \"off\"],\n    ) -> Tuple:\n        # Load the references audio and text by hash\n        audio_hashes = [sha256(ref.audio).hexdigest() for ref in references]\n\n        cache_used = False\n        prompt_tokens, prompt_texts = [], []\n        for i, ref in enumerate(references):\n            if use_cache == \"off\" or audio_hashes[i] not in self.ref_by_hash:\n                # If the references are not already loaded, encode them\n                prompt_tokens.append(\n                    self.encode_reference(\n                        reference_audio=ref.audio,\n                        enable_reference_audio=True,\n                    )\n                )\n                prompt_texts.append(ref.text)\n                self.ref_by_hash[audio_hashes[i]] = (prompt_tokens[-1], ref.text)\n\n            else:\n                # Reuse already encoded references\n                cached_token, cached_text = self.ref_by_hash[audio_hashes[i]]\n                prompt_tokens.append(cached_token)\n                prompt_texts.append(cached_text)\n                cache_used = True\n\n        if cache_used:\n            logger.info(\"Use same references\")\n\n        return prompt_tokens, prompt_texts\n\n    def load_audio(self, reference_audio: bytes | str, sr: int):\n        \"\"\"\n        Load the audio data from a file or bytes.\n        \"\"\"\n        if len(reference_audio) > 255 or not Path(reference_audio).exists():\n            audio_data = 
reference_audio\n            reference_audio = io.BytesIO(audio_data)\n\n        waveform, original_sr = torchaudio.load(reference_audio, backend=self.backend)\n\n        if waveform.shape[0] > 1:\n            waveform = torch.mean(waveform, dim=0, keepdim=True)\n\n        if original_sr != sr:\n            resampler = torchaudio.transforms.Resample(\n                orig_freq=original_sr, new_freq=sr\n            )\n            waveform = resampler(waveform)\n\n        audio = waveform.squeeze().numpy()\n        return audio\n\n    def list_reference_ids(self) -> list[str]:\n        \"\"\"\n        List all valid reference IDs (subdirectory names containing valid audio and .lab files).\n\n        Returns:\n            list[str]: List of valid reference IDs\n        \"\"\"\n        ref_base_path = Path(\"references\")\n        if not ref_base_path.exists():\n            return []\n\n        valid_ids = []\n        for ref_dir in ref_base_path.iterdir():\n            if not ref_dir.is_dir():\n                continue\n\n            # Check if directory contains at least one audio file and corresponding .lab file\n            audio_files = list_files(\n                ref_dir, AUDIO_EXTENSIONS, recursive=False, sort=False\n            )\n            if not audio_files:\n                continue\n\n            # Check if corresponding .lab file exists for at least one audio file\n            has_valid_pair = False\n            for audio_file in audio_files:\n                lab_file = audio_file.with_suffix(\".lab\")\n                if lab_file.exists():\n                    has_valid_pair = True\n                    break\n\n            if has_valid_pair:\n                valid_ids.append(ref_dir.name)\n\n        return sorted(valid_ids)\n\n    def add_reference(self, id: str, wav_file_path: str, reference_text: str) -> None:\n        \"\"\"\n        Add a new reference voice by creating a new directory and copying files.\n\n        Args:\n            id: Reference 
ID (directory name)\n            wav_file_path: Path to the audio file to copy\n            reference_text: Text content for the .lab file\n\n        Raises:\n            FileExistsError: If the reference ID already exists\n            FileNotFoundError: If the audio file doesn't exist\n            OSError: If file operations fail\n        \"\"\"\n        # Validate ID format\n        import re\n\n        if not re.match(r\"^[a-zA-Z0-9\\-_ ]+$\", id):\n            raise ValueError(\n                \"Reference ID contains invalid characters. Only alphanumeric, hyphens, underscores, and spaces are allowed.\"\n            )\n\n        if len(id) > 255:\n            raise ValueError(\n                \"Reference ID is too long. Maximum length is 255 characters.\"\n            )\n\n        # Check if reference already exists\n        ref_dir = Path(\"references\") / id\n        if ref_dir.exists():\n            raise FileExistsError(f\"Reference ID '{id}' already exists\")\n\n        # Check if audio file exists\n        audio_path = Path(wav_file_path)\n        if not audio_path.exists():\n            raise FileNotFoundError(f\"Audio file not found: {wav_file_path}\")\n\n        # Validate audio file extension\n        if audio_path.suffix.lower() not in AUDIO_EXTENSIONS:\n            raise ValueError(\n                f\"Unsupported audio format: {audio_path.suffix}. 
Supported formats: {', '.join(AUDIO_EXTENSIONS)}\"\n            )\n\n        try:\n            # Create reference directory\n            ref_dir.mkdir(parents=True, exist_ok=False)\n\n            # Determine the target audio filename with original extension\n            target_audio_path = ref_dir / f\"sample{audio_path.suffix}\"\n\n            # Copy audio file\n            import shutil\n\n            shutil.copy2(audio_path, target_audio_path)\n\n            # Create .lab file\n            lab_path = ref_dir / \"sample.lab\"\n            with open(lab_path, \"w\", encoding=\"utf-8\") as f:\n                f.write(reference_text)\n\n            # Clear cache for this ID if it exists\n            if id in self.ref_by_id:\n                del self.ref_by_id[id]\n\n            logger.info(f\"Successfully added reference voice with ID: {id}\")\n\n        except Exception as e:\n            # Clean up on failure\n            if ref_dir.exists():\n                import shutil\n\n                shutil.rmtree(ref_dir)\n            raise e\n\n    def delete_reference(self, id: str) -> None:\n        \"\"\"\n        Delete a reference voice by removing its directory and files.\n\n        Args:\n            id: Reference ID (directory name) to delete\n\n        Raises:\n            FileNotFoundError: If the reference ID doesn't exist\n            OSError: If file operations fail\n        \"\"\"\n        # Check if reference exists\n        ref_dir = Path(\"references\") / id\n        if not ref_dir.exists():\n            raise FileNotFoundError(f\"Reference ID '{id}' does not exist\")\n\n        try:\n            # Remove the entire reference directory\n            import shutil\n\n            shutil.rmtree(ref_dir)\n\n            # Clear cache for this ID if it exists\n            if id in self.ref_by_id:\n                del self.ref_by_id[id]\n\n            logger.info(f\"Successfully deleted reference voice with ID: {id}\")\n\n        except Exception as e:\n         
   logger.error(f\"Failed to delete reference '{id}': {e}\")\n            raise OSError(f\"Failed to delete reference '{id}': {e}\")\n"
  },
  {
    "path": "fish_speech/inference_engine/utils.py",
    "content": "import io\nimport wave\nfrom dataclasses import dataclass\nfrom typing import Literal, Optional, Tuple\n\nimport numpy as np\n\n\n@dataclass\nclass InferenceResult:\n    code: Literal[\"header\", \"segment\", \"error\", \"final\"]\n    audio: Optional[Tuple[int, np.ndarray]]\n    error: Optional[Exception]\n\n\ndef wav_chunk_header(\n    sample_rate: int = 44100, bit_depth: int = 16, channels: int = 1\n) -> bytes:\n    buffer = io.BytesIO()\n\n    with wave.open(buffer, \"wb\") as wav_file:\n        wav_file.setnchannels(channels)\n        wav_file.setsampwidth(bit_depth // 8)\n        wav_file.setframerate(sample_rate)\n\n    wav_header_bytes = buffer.getvalue()\n    buffer.close()\n\n    return wav_header_bytes\n"
  },
  {
    "path": "fish_speech/inference_engine/vq_manager.py",
    "content": "from typing import Callable\n\nimport torch\nfrom loguru import logger\n\nfrom fish_speech.models.dac.modded_dac import DAC\n\n\nclass VQManager:\n\n    def __init__(self):\n        # Make Pylance happy (attribut/method not defined...)\n        self.decoder_model: DAC\n        self.load_audio: Callable\n\n    def decode_vq_tokens(self, codes):\n        logger.info(f\"VQ features: {codes.shape}\")\n\n        if isinstance(self.decoder_model, DAC):\n            return self.decoder_model.from_indices(codes[None])[0].squeeze()\n\n        raise ValueError(f\"Unknown model type: {type(self.decoder_model)}\")\n\n    def encode_reference(self, reference_audio, enable_reference_audio):\n        if enable_reference_audio and reference_audio is not None:\n            # Load audios, and prepare basic info here\n            if hasattr(self.decoder_model, \"spec_transform\"):\n                sample_rate = self.decoder_model.spec_transform.sample_rate\n            else:\n                sample_rate = self.decoder_model.sample_rate\n            reference_audio_content = self.load_audio(reference_audio, sample_rate)\n\n            audios = torch.from_numpy(reference_audio_content).to(\n                self.decoder_model.device\n            )[None, None, :]\n            audio_lengths = torch.tensor(\n                [audios.shape[2]], device=self.decoder_model.device, dtype=torch.long\n            )\n            logger.info(\n                f\"Loaded audio with {audios.shape[2] / sample_rate:.2f} seconds\"\n            )\n\n            # VQ Encoder\n            if isinstance(self.decoder_model, DAC):\n                prompt_tokens = self.decoder_model.encode(audios, audio_lengths)[0][0]\n                logger.info(f\"Encoded prompt: {prompt_tokens.shape}\")\n            else:\n                raise ValueError(f\"Unknown model type: {type(self.decoder_model)}\")\n        else:\n            prompt_tokens = None\n            logger.info(\"No reference audio 
provided\")\n\n        return prompt_tokens\n"
  },
  {
    "path": "fish_speech/models/dac/__init__.py",
    "content": ""
  },
  {
    "path": "fish_speech/models/dac/inference.py",
    "content": "from pathlib import Path\n\nimport click\nimport hydra\nimport numpy as np\nimport pyrootutils\nimport soundfile as sf\nimport torch\nimport torchaudio\nfrom hydra import compose, initialize\nfrom hydra.utils import instantiate\nfrom loguru import logger\nfrom omegaconf import OmegaConf\n\npyrootutils.setup_root(__file__, indicator=\".project-root\", pythonpath=True)\n\nfrom fish_speech.utils.file import AUDIO_EXTENSIONS\n\n# register eval resolver\nOmegaConf.register_new_resolver(\"eval\", eval)\n\n\ndef load_model(config_name, checkpoint_path, device=\"cuda\"):\n    hydra.core.global_hydra.GlobalHydra.instance().clear()\n    with initialize(version_base=\"1.3\", config_path=\"../../configs\"):\n        cfg = compose(config_name=config_name)\n\n    model = instantiate(cfg)\n    state_dict = torch.load(\n        checkpoint_path, map_location=device, mmap=True, weights_only=True\n    )\n    if \"state_dict\" in state_dict:\n        state_dict = state_dict[\"state_dict\"]\n\n    if any(\"generator\" in k for k in state_dict):\n        state_dict = {\n            k.replace(\"generator.\", \"\"): v\n            for k, v in state_dict.items()\n            if \"generator.\" in k\n        }\n\n    result = model.load_state_dict(state_dict, strict=False, assign=True)\n    model.eval()\n    model.to(device)\n\n    logger.info(f\"Loaded model: {result}\")\n    return model\n\n\n@torch.no_grad()\n@click.command()\n@click.option(\n    \"--input-path\",\n    \"-i\",\n    default=\"test.wav\",\n    type=click.Path(exists=True, path_type=Path),\n)\n@click.option(\n    \"--output-path\", \"-o\", default=\"fake.wav\", type=click.Path(path_type=Path)\n)\n@click.option(\"--config-name\", default=\"modded_dac_vq\")\n@click.option(\n    \"--checkpoint-path\",\n    default=\"checkpoints/openaudio-s1-mini/codec.pth\",\n)\n@click.option(\n    \"--device\",\n    \"-d\",\n    default=\"cuda\",\n)\ndef main(input_path, output_path, config_name, checkpoint_path, device):\n    
model = load_model(config_name, checkpoint_path, device=device)\n\n    if input_path.suffix in AUDIO_EXTENSIONS:\n        logger.info(f\"Processing in-place reconstruction of {input_path}\")\n\n        # Load audio\n        audio, sr = torchaudio.load(str(input_path))\n        if audio.shape[0] > 1:\n            audio = audio.mean(0, keepdim=True)\n        audio = torchaudio.functional.resample(audio, sr, model.sample_rate)\n\n        audios = audio[None].to(device)\n        logger.info(\n            f\"Loaded audio with {audios.shape[2] / model.sample_rate:.2f} seconds\"\n        )\n\n        # VQ Encoder\n        audio_lengths = torch.tensor([audios.shape[2]], device=device, dtype=torch.long)\n        indices, _ = model.encode(audios, audio_lengths)\n\n        if indices.ndim == 3:\n            indices = indices[0]\n\n        logger.info(f\"Generated indices of shape {indices.shape}\")\n\n        # Save indices\n        np.save(output_path.with_suffix(\".npy\"), indices.cpu().numpy())\n    elif input_path.suffix == \".npy\":\n        logger.info(f\"Processing precomputed indices from {input_path}\")\n        indices = np.load(input_path)\n        indices = torch.from_numpy(indices).to(device).long()\n        assert indices.ndim == 2, f\"Expected 2D indices, got {indices.ndim}\"\n        # indices_lens = torch.tensor([indices.shape[1]], device=device, dtype=torch.long)\n    else:\n        raise ValueError(f\"Unknown input type: {input_path}\")\n\n    # Restore\n    if indices.ndim == 2:\n        indices = indices.unsqueeze(0)\n\n    fake_audios = model.from_indices(indices)\n    audio_time = fake_audios.shape[-1] / model.sample_rate\n\n    logger.info(\n        f\"Generated audio of shape {fake_audios.shape}, equivalent to {audio_time:.2f} seconds from {indices.shape[1]} features, features/second: {indices.shape[1] / audio_time:.2f}\"\n    )\n\n    # Save audio\n    fake_audio = fake_audios[0, 0].float().cpu().numpy()\n    sf.write(output_path, fake_audio, 
model.sample_rate)\n    logger.info(f\"Saved audio to {output_path}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "fish_speech/models/dac/modded_dac.py",
    "content": "import math\nimport typing as tp\nfrom dataclasses import dataclass\nfrom typing import List, Optional, Union\n\nimport numpy as np\nimport torch\nfrom audiotools import AudioSignal\nfrom audiotools.ml import BaseModel\nfrom dac.model.base import CodecMixin\nfrom dac.nn.layers import Snake1d, WNConv1d, WNConvTranspose1d\nfrom torch import Tensor, nn\nfrom torch.nn import functional as F\nfrom torch.nn.utils.parametrizations import weight_norm\nfrom torch.nn.utils.parametrize import remove_parametrizations\n\n\n@dataclass\nclass VQResult:\n    z: torch.Tensor\n    codes: torch.Tensor\n    latents: torch.Tensor\n    codebook_loss: torch.Tensor\n    commitment_loss: torch.Tensor\n    semantic_distill_z: torch.Tensor | None = None\n\n\ndef find_multiple(n: int, k: int) -> int:\n    if n % k == 0:\n        return n\n    return n + k - (n % k)\n\n\n@dataclass\nclass ModelArgs:\n    block_size: int = 2048\n    n_layer: int = 8\n    n_head: int = 8\n    dim: int = 512\n    intermediate_size: int = 1536\n    n_local_heads: int = -1\n    head_dim: int = 64\n    rope_base: float = 10000\n    norm_eps: float = 1e-5\n    dropout_rate: float = 0.1\n    attn_dropout_rate: float = 0.1\n    channels_first: bool = True  # to be compatible with conv1d input/output\n    pos_embed_type: str = \"rope\"  # can be \"rope\" or \"conformer\"\n    max_relative_position: int = 128  # for conformer-style relative position embedding\n    window_size: int = 512  # for window limited attention\n\n    def __post_init__(self):\n        if self.n_local_heads == -1:\n            self.n_local_heads = self.n_head\n        if self.intermediate_size is None:\n            hidden_dim = 4 * self.dim\n            n_hidden = int(2 * hidden_dim / 3)\n            self.intermediate_size = find_multiple(n_hidden, 256)\n        assert self.pos_embed_type in [\n            \"rope\",\n            \"conformer\",\n        ], \"pos_embed_type must be either 'rope' or 'conformer'\"\n\n\nclass 
KVCache(nn.Module):\n    def __init__(\n        self, max_batch_size, max_seq_length, n_heads, head_dim, dtype=torch.bfloat16\n    ):\n        super().__init__()\n        cache_shape = (max_batch_size, n_heads, max_seq_length, head_dim)\n        self.register_buffer(\"k_cache\", torch.zeros(cache_shape, dtype=dtype))\n        self.register_buffer(\"v_cache\", torch.zeros(cache_shape, dtype=dtype))\n\n    def update(self, input_pos, k_val, v_val):\n        # input_pos: [S], k_val: [B, H, S, D]\n        assert input_pos.shape[0] == k_val.shape[2]\n\n        k_out = self.k_cache\n        v_out = self.v_cache\n        k_out[:, :, input_pos] = k_val\n        v_out[:, :, input_pos] = v_val\n\n        return (\n            k_out[:, :, : input_pos.max() + 1, :],\n            v_out[:, :, : input_pos.max() + 1, :],\n        )\n\n    def clear_cache(self, prompt_len):\n        self.k_cache[:, :, prompt_len:, :] = torch.zeros_like(\n            self.k_cache[:, :, prompt_len:, :]\n        )\n        self.v_cache[:, :, prompt_len:, :] = torch.zeros_like(\n            self.v_cache[:, :, prompt_len:, :]\n        )\n\n\nclass Transformer(nn.Module):\n    def __init__(self, config: ModelArgs) -> None:\n        super().__init__()\n        self.config = config\n\n        self.layers = nn.ModuleList(\n            TransformerBlock(config) for _ in range(config.n_layer)\n        )\n        self.norm = RMSNorm(config.dim, eps=config.norm_eps)\n\n        # Only compute RoPE frequencies if using RoPE\n        if config.pos_embed_type == \"rope\":\n            freqs_cis = precompute_freqs_cis(\n                327680, self.config.head_dim, self.config.rope_base\n            )\n            self.register_buffer(\"freqs_cis\", freqs_cis, persistent=False)\n        else:\n            self.register_buffer(\"freqs_cis\", None)\n\n        causal_mask = torch.tril(torch.ones(32768, 32768, dtype=torch.bool))\n        self.register_buffer(\"causal_mask\", causal_mask, persistent=False)\n\n        
self.max_batch_size = -1\n        self.max_seq_length = -1\n        self.use_kv_cache = False\n\n    def setup_caches(self, max_batch_size, max_seq_length):\n        \"\"\"\n        This method will only be called during inference when using KV cache.\n        \"\"\"\n        head_dim = self.config.dim // self.config.n_head\n        max_seq_length = find_multiple(max_seq_length, 8)\n        self.max_seq_length = max_seq_length\n        self.max_batch_size = max_batch_size\n        dtype = self.norm.weight.dtype\n        device = self.norm.weight.device\n\n        for b in self.layers:\n            b.attention.kv_cache = KVCache(\n                max_batch_size,\n                max_seq_length,\n                self.config.n_local_heads,\n                head_dim,\n                dtype,\n            ).to(device)\n\n        self.use_kv_cache = True\n\n    def forward(\n        self,\n        x: Tensor,\n        input_pos: Optional[Tensor] = None,\n        mask: Optional[Tensor] = None,\n    ) -> Tensor:\n        if self.config.pos_embed_type == \"rope\":\n            assert (\n                self.freqs_cis is not None\n            ), \"RoPE frequencies must be initialized for RoPE positional embedding\"\n            # print(\"MAX\", input_pos.max())\n            freqs_cis = self.freqs_cis[input_pos]\n        else:\n            freqs_cis = None\n\n        if mask is None:  # in case of non-causal model\n            if not self.training and self.use_kv_cache:\n                mask = self.causal_mask[None, None, input_pos]\n                mask = mask[..., : input_pos.max() + 1]\n            else:\n                mask = self.causal_mask[None, None, input_pos]\n                mask = mask[..., input_pos]\n\n        for i, layer in enumerate(self.layers):\n            x = layer(x, input_pos, freqs_cis, mask)\n        x = self.norm(x)\n        return x\n\n\nclass TransformerBlock(nn.Module):\n    def __init__(self, config: ModelArgs) -> None:\n        
super().__init__()\n        self.attention = Attention(config)\n        self.feed_forward = FeedForward(config)\n        self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps)\n        self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps)\n        self.attention_layer_scale = LayerScale(config.dim, inplace=True)\n        self.ffn_layer_scale = LayerScale(config.dim, inplace=True)\n\n    def forward(\n        self,\n        x: Tensor,\n        input_pos: Tensor,\n        freqs_cis: Tensor,\n        mask: Tensor,\n    ) -> Tensor:\n        h = x + self.attention_layer_scale(\n            self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)\n        )\n        out = h + self.ffn_layer_scale(self.feed_forward(self.ffn_norm(h)))\n        return out\n\n\nclass Attention(nn.Module):\n    def __init__(self, config: ModelArgs):\n        super().__init__()\n        assert config.dim % config.n_head == 0\n\n        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim\n        # key, query, value projections for all heads, but in a batch\n        self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)\n        self.wo = nn.Linear(config.head_dim * config.n_head, config.dim, bias=False)\n        self.kv_cache = None\n\n        self.n_head = config.n_head\n        self.head_dim = config.head_dim\n        self.n_local_heads = config.n_local_heads\n        self.dim = config.dim\n        self.attn_dropout_rate = config.attn_dropout_rate\n        self.pos_embed_type = config.pos_embed_type\n\n        # Add relative position embedding for conformer-style\n        if self.pos_embed_type == \"conformer\":\n            self.max_relative_position = config.max_relative_position\n            num_pos_embeddings = 2 * config.max_relative_position + 1\n            self.rel_pos_embeddings = nn.Parameter(\n                torch.zeros(num_pos_embeddings, self.head_dim)\n            )\n            nn.init.normal_(self.rel_pos_embeddings, 
mean=0.0, std=0.02)\n\n    def _compute_conformer_pos_scores(self, q: Tensor, seqlen: int) -> Tensor:\n        # q: [B, H, S, D]\n        # Returns: [B, H, S, S]\n        positions = torch.arange(seqlen, device=q.device)\n        relative_positions = positions.unsqueeze(1) - positions.unsqueeze(0)  # [S, S]\n        relative_positions = torch.clamp(\n            relative_positions + self.max_relative_position,\n            0,\n            2 * self.max_relative_position,\n        )\n        rel_embeddings = self.rel_pos_embeddings[relative_positions]  # [S, S, D]\n\n        # Compute attention scores with relative position embeddings\n        q = q.transpose(1, 2)  # [B, S, H, D]\n        rel_logits = torch.matmul(q, rel_embeddings.transpose(-2, -1))  # [B, S, H, S]\n        rel_logits = rel_logits.transpose(1, 2)  # [B, H, S, S]\n        return rel_logits\n\n    def forward(\n        self,\n        x: Tensor,\n        freqs_cis: Tensor,\n        mask: Tensor,\n        input_pos: Optional[Tensor] = None,\n    ) -> Tensor:\n        bsz, seqlen, _ = x.shape\n\n        kv_size = self.n_local_heads * self.head_dim\n        q, k, v = self.wqkv(x).split([kv_size, kv_size, kv_size], dim=-1)\n        context_seqlen = seqlen\n\n        q = q.view(bsz, seqlen, self.n_head, self.head_dim)\n        k = k.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)\n        v = v.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)\n\n        if self.pos_embed_type == \"rope\":\n            q = apply_rotary_emb(q, freqs_cis)\n            k = apply_rotary_emb(k, freqs_cis)\n\n        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))\n\n        if self.kv_cache is not None:\n            k, v = self.kv_cache.update(input_pos, k, v)\n\n        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)\n        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)\n\n        if self.pos_embed_type == \"conformer\":\n            # Compute attention 
scores\n            scale = 1.0 / math.sqrt(self.head_dim)\n            scores = torch.matmul(q, k.transpose(-2, -1)) * scale\n\n            # Add relative position embeddings for conformer-style\n            rel_scores = self._compute_conformer_pos_scores(q, seqlen)\n            scores = scores + rel_scores\n\n            # Apply attention\n            if mask is not None:\n                scores = scores.masked_fill(~mask, float(\"-inf\"))\n\n            attn = F.softmax(scores, dim=-1)\n            if self.attn_dropout_rate > 0 and self.training:\n                attn = F.dropout(attn, p=self.attn_dropout_rate)\n\n            y = torch.matmul(attn, v)\n        else:\n            y = F.scaled_dot_product_attention(\n                q,\n                k,\n                v,\n                dropout_p=self.attn_dropout_rate if self.training else 0.0,\n                attn_mask=mask,\n            )\n            # is_causal=True)\n        y = (\n            y.transpose(1, 2)\n            .contiguous()\n            .view(bsz, seqlen, self.head_dim * self.n_head)\n        )\n        y = self.wo(y)\n        return y\n\n\nclass FeedForward(nn.Module):\n    def __init__(self, config: ModelArgs) -> None:\n        super().__init__()\n        self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)\n        self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)\n        self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)\n        self.dropout = nn.Dropout(config.dropout_rate)\n\n    def forward(self, x: Tensor) -> Tensor:\n        return self.w2(self.dropout(F.silu(self.w1(x)) * self.w3(x)))\n\n\nclass RMSNorm(nn.Module):\n    def __init__(self, dim: int, eps: float = 1e-5):\n        super().__init__()\n        self.eps = eps\n        self.weight = nn.Parameter(torch.ones(dim))\n\n    def _norm(self, x):\n        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)\n\n    def forward(self, x: Tensor) -> 
Tensor:\n        output = self._norm(x.float()).type_as(x)\n        return output * self.weight\n\n\nclass LayerScale(nn.Module):\n    def __init__(\n        self,\n        dim: int,\n        init_values: Union[float, Tensor] = 1e-2,\n        inplace: bool = False,\n    ) -> None:\n        super().__init__()\n        self.inplace = inplace\n        self.gamma = nn.Parameter(init_values * torch.ones(dim))\n\n    def forward(self, x: Tensor) -> Tensor:\n        return x.mul_(self.gamma) if self.inplace else x * self.gamma\n\n\nclass WindowLimitedTransformer(Transformer):\n    \"\"\"\n    Transformer with window limited attention, causal.\n    \"\"\"\n\n    def __init__(\n        self,\n        config: ModelArgs,\n        input_dim: int = 512,\n        window_size: Optional[int] = None,\n        causal: bool = True,\n        look_ahead_conv: nn.Module = None,\n    ):\n        super().__init__(config)\n        self.window_size = window_size\n        self.causal = causal\n        self.channels_first = config.channels_first\n        self.look_ahead_conv = (\n            look_ahead_conv if look_ahead_conv is not None else nn.Identity()\n        )\n        self.input_proj = (\n            nn.Linear(input_dim, config.dim)\n            if input_dim != config.dim\n            else nn.Identity()\n        )\n        self.output_proj = (\n            nn.Linear(config.dim, input_dim)\n            if input_dim != config.dim\n            else nn.Identity()\n        )\n\n    def make_window_limited_mask(\n        self,\n        max_length: int,\n        x_lens: Optional[Tensor] = None,\n    ) -> Tensor:\n        \"\"\"\n        Make mask to form window limited attention.\n        \"\"\"\n        if self.causal:\n            mask = torch.tril(torch.ones(max_length, max_length))\n            row_indices = torch.arange(max_length).view(-1, 1)\n            window_size = self.window_size or max_length\n            valid_range = (row_indices - window_size + 1).clamp(min=0)\n            
column_indices = torch.arange(max_length)\n            mask = (column_indices >= valid_range) & mask.bool()\n        else:\n            raise NotImplementedError\n        mask = mask.bool()[None, None]\n        return mask\n\n    def make_mask(\n        self,\n        max_length: int,\n        x_lens: Optional[Tensor] = None,\n    ) -> Tensor:\n        \"\"\"\n        Make ordinary mask if window size is not specified.\n        \"\"\"\n        if self.causal:\n            mask = torch.tril(torch.ones(max_length, max_length))\n        else:\n            mask = torch.ones(max_length, max_length)\n            mask = mask.bool()[None, None]\n            for i, x_len in enumerate(x_lens):\n                mask[:x_len, i] = 0\n        mask = mask.bool()[None, None]\n        return mask\n\n    def forward(\n        self,\n        x: Tensor,\n        x_lens: Optional[Tensor] = None,\n    ) -> Tensor:\n        if self.channels_first:\n            x = x.transpose(1, 2)\n        x = self.input_proj(x)  # (B, T, D)\n        x = self.look_ahead_conv(x)\n        input_pos = torch.arange(x.shape[1], device=x.device)\n        # construct mask to form window limited attention\n        max_length = x.shape[1]\n        if self.window_size is not None:\n            mask = self.make_window_limited_mask(max_length, x_lens)\n        else:\n            mask = self.make_mask(max_length, x_lens)\n        mask = mask.to(x.device)\n        x = super().forward(x, input_pos, mask)\n        x = self.output_proj(x)  # (B, T, D)\n        if self.channels_first:\n            x = x.transpose(1, 2)\n        return x\n\n\ndef precompute_freqs_cis(\n    seq_len: int, n_elem: int, base: int = 10000, dtype: torch.dtype = torch.bfloat16\n) -> Tensor:\n    freqs = 1.0 / (\n        base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)\n    )\n    t = torch.arange(seq_len, device=freqs.device)\n    freqs = torch.outer(t, freqs)\n    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)\n    
cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)\n    return cache.to(dtype=dtype)\n\n\ndef apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:\n    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)\n    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)\n    x_out2 = torch.stack(\n        [\n            xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],\n            xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],\n        ],\n        -1,\n    )\n\n    x_out2 = x_out2.flatten(3)\n    return x_out2.type_as(x)\n\n\ndef init_weights(m):\n    if isinstance(m, nn.Conv1d):\n        nn.init.trunc_normal_(m.weight, std=0.02)\n        nn.init.constant_(m.bias, 0)\n\n\ndef unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):\n    \"\"\"Remove padding from x, handling properly zero padding. Only for 1d!\"\"\"\n    padding_left, padding_right = paddings\n    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)\n    assert (padding_left + padding_right) <= x.shape[-1]\n    end = x.shape[-1] - padding_right\n    return x[..., padding_left:end]\n\n\ndef get_extra_padding_for_conv1d(\n    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0\n) -> int:\n    \"\"\"See `pad_for_conv1d`.\"\"\"\n    length = x.shape[-1]\n    n_frames = (length - kernel_size + padding_total) / stride + 1\n    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)\n    return ideal_length - length\n\n\ndef pad1d(\n    x: torch.Tensor,\n    paddings: tp.Tuple[int, int],\n    mode: str = \"zeros\",\n    value: float = 0.0,\n):\n    \"\"\"Tiny wrapper around F.pad, just to allow for reflect padding on small input.\n    If this is the case, we insert extra 0 padding to the right\n    before the reflection happen.\n    \"\"\"\n    length = x.shape[-1]\n    padding_left, padding_right = paddings\n    assert padding_left >= 0 and padding_right >= 0, 
(padding_left, padding_right)\n    if mode == \"reflect\":\n        max_pad = max(padding_left, padding_right)\n        extra_pad = 0\n        if length <= max_pad:\n            extra_pad = max_pad - length + 1\n            x = F.pad(x, (0, extra_pad))\n        padded = F.pad(x, paddings, mode, value)\n        end = padded.shape[-1] - extra_pad\n        return padded[..., :end]\n    else:\n        return F.pad(x, paddings, mode, value)\n\n\nclass CausalConvNet(nn.Module):\n    def __init__(\n        self,\n        in_channels,\n        out_channels,\n        kernel_size,\n        dilation=1,\n        stride=1,\n        groups=1,\n        padding=None,\n    ):\n        super(CausalConvNet, self).__init__()\n        self.conv = nn.Conv1d(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            dilation=dilation,\n            groups=groups,\n        )\n        self.stride = stride\n        self.kernel_size = (kernel_size - 1) * dilation + 1\n        self.dilation = dilation\n        self.padding = self.kernel_size - self.stride\n\n    def forward(self, x):\n        pad = self.padding\n        extra_padding = get_extra_padding_for_conv1d(\n            x, self.kernel_size, self.stride, pad\n        )\n        x = pad1d(x, (pad, extra_padding), mode=\"constant\", value=0)\n        return self.conv(x).contiguous()\n\n    def weight_norm(self, name=\"weight\", dim=0):\n        self.conv = weight_norm(self.conv, name=name, dim=dim)\n        return self\n\n    def remove_weight_norm(self):\n        self.conv = remove_parametrizations(self.conv)\n        return self\n\n\nclass CausalTransConvNet(nn.Module):\n    def __init__(\n        self, in_channels, out_channels, kernel_size, dilation=1, stride=1, padding=None\n    ):\n        super(CausalTransConvNet, self).__init__()\n        self.conv = nn.ConvTranspose1d(\n            in_channels, out_channels, kernel_size, stride=stride, dilation=dilation\n        )\n    
    self.stride = stride\n        self.kernel_size = kernel_size\n\n    def forward(self, x):\n        x = self.conv(x)\n        pad = self.kernel_size - self.stride\n        padding_right = math.ceil(pad)\n        padding_left = pad - padding_right\n        x = unpad1d(x, (padding_left, padding_right))\n        return x.contiguous()\n\n    def weight_norm(self, name=\"weight\", dim=0):\n        self.conv = weight_norm(self.conv, name=name, dim=dim)\n        return self\n\n    def remove_weight_norm(self):\n        self.conv = remove_parametrizations(self.conv)\n        return self\n\n\ndef CausalWNConv1d(*args, **kwargs):\n    return CausalConvNet(*args, **kwargs).weight_norm()\n\n\ndef CausalWNConvTranspose1d(*args, **kwargs):\n    return CausalTransConvNet(*args, **kwargs).weight_norm()\n\n\nclass ResidualUnit(nn.Module):\n    def __init__(self, dim: int = 16, dilation: int = 1, causal: bool = False):\n        super().__init__()\n        conv_class = CausalWNConv1d if causal else WNConv1d\n        pad = ((7 - 1) * dilation) // 2\n        self.block = nn.Sequential(\n            Snake1d(dim),\n            conv_class(dim, dim, kernel_size=7, dilation=dilation, padding=pad),\n            Snake1d(dim),\n            conv_class(dim, dim, kernel_size=1),\n        )\n        self.causal = causal\n\n    def forward(self, x):\n        y = self.block(x)\n        pad = x.shape[-1] - y.shape[-1]\n        if pad > 0:\n            if self.causal:\n                x = x[..., :-pad]\n            else:\n                x = x[..., pad // 2 : -pad // 2]\n        return x + y\n\n\nclass EncoderBlock(nn.Module):\n    def __init__(\n        self,\n        dim: int = 16,\n        stride: int = 1,\n        causal: bool = False,\n        n_t_layer: int = 0,\n        transformer_general_config=None,\n    ):\n        super().__init__()\n        conv_class = CausalWNConv1d if causal else WNConv1d\n        transformer_module = (\n            nn.Identity()\n            if n_t_layer == 0\n     
       else (\n                WindowLimitedTransformer(\n                    causal=causal,\n                    input_dim=dim,\n                    window_size=getattr(transformer_general_config, \"window_size\", 512),\n                    config=transformer_general_config(\n                        n_layer=n_t_layer,\n                        n_head=dim // 64,\n                        dim=dim,\n                        intermediate_size=dim * 3,\n                    ),\n                )\n            )\n        )\n        self.block = nn.Sequential(\n            ResidualUnit(dim // 2, dilation=1, causal=causal),\n            ResidualUnit(dim // 2, dilation=3, causal=causal),\n            ResidualUnit(dim // 2, dilation=9, causal=causal),\n            Snake1d(dim // 2),\n            conv_class(\n                dim // 2,\n                dim,\n                kernel_size=2 * stride,\n                stride=stride,\n                padding=math.ceil(stride / 2),\n            ),\n            transformer_module,\n        )\n\n    def forward(self, x):\n        return self.block(x)\n\n\nclass Encoder(nn.Module):\n    def __init__(\n        self,\n        d_model: int = 64,\n        strides: list = [2, 4, 8, 8],\n        d_latent: int = 64,\n        n_transformer_layers: list = [0, 0, 4, 4],\n        transformer_general_config: ModelArgs = None,\n        causal: bool = False,\n    ):\n        super().__init__()\n        conv_class = CausalWNConv1d if causal else WNConv1d\n        # Create first convolution\n        self.block = [conv_class(1, d_model, kernel_size=7, padding=3)]\n\n        # Create EncoderBlocks that double channels as they downsample by `stride`\n        for stride, n_t_layer in zip(strides, n_transformer_layers):\n            d_model *= 2\n            self.block += [\n                EncoderBlock(\n                    d_model,\n                    stride=stride,\n                    causal=causal,\n                    n_t_layer=n_t_layer,\n              
      transformer_general_config=transformer_general_config,\n                )\n            ]\n\n        # Create last convolution\n        self.block += [\n            Snake1d(d_model),\n            conv_class(d_model, d_latent, kernel_size=3, padding=1),\n        ]\n\n        # Wrap black into nn.Sequential\n        self.block = nn.Sequential(*self.block)\n        self.enc_dim = d_model\n\n    def forward(self, x):\n        return self.block(x)\n\n\nclass DecoderBlock(nn.Module):\n    def __init__(\n        self,\n        input_dim: int = 16,\n        output_dim: int = 8,\n        stride: int = 1,\n        causal: bool = False,\n        n_t_layer: int = 0,\n        transformer_general_config=None,\n    ):\n        super().__init__()\n        conv_trans_class = CausalWNConvTranspose1d if causal else WNConvTranspose1d\n        transformer_module = (\n            nn.Identity()\n            if n_t_layer == 0\n            else (\n                WindowLimitedTransformer(\n                    causal=causal,\n                    input_dim=input_dim,\n                    window_size=None,\n                    config=transformer_general_config(\n                        n_layer=n_t_layer,\n                        n_head=input_dim // 64,\n                        dim=input_dim,\n                        intermediate_size=input_dim * 3,\n                    ),\n                )\n            )\n        )\n        self.block = nn.Sequential(\n            # transformer_module,\n            Snake1d(input_dim),\n            conv_trans_class(\n                input_dim,\n                output_dim,\n                kernel_size=2 * stride,\n                stride=stride,\n                padding=math.ceil(stride / 2),\n            ),\n            ResidualUnit(output_dim, dilation=1, causal=causal),\n            ResidualUnit(output_dim, dilation=3, causal=causal),\n            ResidualUnit(output_dim, dilation=9, causal=causal),\n        )\n\n    def forward(self, x):\n        
return self.block(x)\n\n\nclass Decoder(nn.Module):\n    def __init__(\n        self,\n        input_channel,\n        channels,\n        rates,\n        d_out: int = 1,\n        causal: bool = False,\n        n_transformer_layers: list = [0, 0, 0, 0],\n        transformer_general_config=None,\n    ):\n        super().__init__()\n        conv_class = CausalWNConv1d if causal else WNConv1d\n        # Add first conv layer\n        layers = [conv_class(input_channel, channels, kernel_size=7, padding=3)]\n\n        # Add upsampling + MRF blocks\n        for i, (stride, n_t_layer) in enumerate(zip(rates, n_transformer_layers)):\n            input_dim = channels // 2**i\n            output_dim = channels // 2 ** (i + 1)\n            layers += [\n                DecoderBlock(\n                    input_dim,\n                    output_dim,\n                    stride,\n                    causal=causal,\n                    n_t_layer=n_t_layer,\n                    transformer_general_config=transformer_general_config,\n                )\n            ]\n\n        # Add final conv layer\n        layers += [\n            Snake1d(output_dim),\n            conv_class(output_dim, d_out, kernel_size=7, padding=3),\n            nn.Tanh(),\n        ]\n\n        self.model = nn.Sequential(*layers)\n\n    def forward(self, x):\n        return self.model(x)\n\n\nclass DAC(BaseModel, CodecMixin):\n    def __init__(\n        self,\n        encoder_dim: int = 64,\n        encoder_rates: List[int] = [2, 4, 8, 8],\n        latent_dim: int = None,\n        decoder_dim: int = 1536,\n        decoder_rates: List[int] = [8, 8, 4, 2],\n        quantizer: torch.nn.Module = None,\n        sample_rate: int = 44100,\n        causal: bool = True,\n        encoder_transformer_layers: List[int] = [0, 0, 0, 0],\n        decoder_transformer_layers: List[int] = [0, 0, 0, 0],\n        overwrite_decoder: torch.nn.Module = None,\n        transformer_general_config=None,\n    ):\n        
super().__init__()\n\n        self.encoder_dim = encoder_dim\n        self.encoder_rates = encoder_rates\n        self.decoder_dim = decoder_dim\n        self.decoder_rates = decoder_rates\n        self.sample_rate = sample_rate\n\n        if latent_dim is None:\n            latent_dim = encoder_dim * (2 ** len(encoder_rates))\n\n        self.latent_dim = latent_dim\n\n        self.hop_length = np.prod(encoder_rates)\n        self.encoder = Encoder(\n            encoder_dim,\n            encoder_rates,\n            latent_dim,\n            causal=causal,\n            n_transformer_layers=encoder_transformer_layers,\n            transformer_general_config=transformer_general_config,\n        )\n\n        self.quantizer = quantizer\n\n        if overwrite_decoder is not None:\n            self.decoder = overwrite_decoder\n        else:\n            self.decoder = Decoder(\n                latent_dim,\n                decoder_dim,\n                decoder_rates,\n                causal=causal,\n                n_transformer_layers=decoder_transformer_layers,\n                transformer_general_config=transformer_general_config,\n            )\n        self.sample_rate = sample_rate\n        self.apply(init_weights)\n\n        self.delay = self.get_delay()\n\n        self.frame_length = self.hop_length * 4\n\n    def preprocess(self, audio_data, sample_rate):\n        if sample_rate is None:\n            sample_rate = self.sample_rate\n        assert sample_rate == self.sample_rate\n\n        length = audio_data.shape[-1]\n        right_pad = math.ceil(length / self.hop_length) * self.hop_length - length\n        audio_data = nn.functional.pad(audio_data, (0, right_pad))\n\n        return audio_data\n\n    def encode(\n        self,\n        audio_data: torch.Tensor,\n        audio_lengths: torch.Tensor = None,\n        n_quantizers: int = None,\n        **kwargs,\n    ):\n        \"\"\"Encode given audio data and return quantized latent codes\n\n        Parameters\n  
      ----------\n        audio_data : Tensor[B x T]\n            Audio data to encode\n        n_quantizers : int, optional\n            Number of quantizers to use, by default None\n            If None, all quantizers are used.\n\n        Returns\n        -------\n        dict\n            A dictionary with the following keys:\n            \"z\" : Tensor[B x D x T]\n                Quantized continuous representation of input\n            \"codes\" : Tensor[B x N x T]\n                Codebook indices for each codebook\n                (quantized discrete representation of input)\n            \"latents\" : Tensor[B x N*D x T]\n                Projected latents (continuous representation of input before quantization)\n            \"vq/commitment_loss\" : Tensor[1]\n                Commitment loss to train encoder to predict vectors closer to codebook\n                entries\n            \"vq/codebook_loss\" : Tensor[1]\n                Codebook loss to update the codebook\n            \"length\" : int\n                Number of samples in input audio\n        \"\"\"\n        # pad to multiple of self.frame_length\n        if audio_data.ndim == 2:\n            audio_data = audio_data.unsqueeze(1)\n        length = audio_data.shape[-1]\n        right_pad = math.ceil(length / self.frame_length) * self.frame_length - length\n        audio_data = nn.functional.pad(audio_data, (0, right_pad))\n        if audio_lengths is None:\n            audio_lengths = torch.LongTensor([length + right_pad]).to(audio_data.device)\n\n        z = self.encoder(audio_data)\n        vq_results = self.quantizer(z, n_quantizers, **kwargs)\n        indices = vq_results.codes\n        indices_lens = torch.ceil(audio_lengths / self.frame_length).long()\n        return indices, indices_lens\n\n    def from_indices(self, indices: torch.Tensor):\n        z = self.quantizer.decode(indices)\n        return self.decoder(z)\n\n    def decode(self, z: torch.Tensor):\n        \"\"\"Decode given latent 
codes and return audio data\n\n        Parameters\n        ----------\n        z : Tensor[B x D x T]\n            Quantized continuous representation of input\n        length : int, optional\n            Number of samples in output audio, by default None\n\n        Returns\n        -------\n        dict\n            A dictionary with the following keys:\n            \"audio\" : Tensor[B x 1 x length]\n                Decoded audio data.\n        \"\"\"\n        return self.decoder(z)\n\n    def forward(\n        self,\n        audio_data: torch.Tensor,\n        template: torch.Tensor = None,\n        mask: torch.Tensor = None,\n        sample_rate: int = None,\n        n_quantizers: int = None,\n        **kwargs,\n    ):\n        \"\"\"Model forward pass\n\n        Parameters\n        ----------\n        audio_data : Tensor[B x 1 x T]\n            Audio data to encode\n        sample_rate : int, optional\n            Sample rate of audio data in Hz, by default None\n            If None, defaults to `self.sample_rate`\n        n_quantizers : int, optional\n            Number of quantizers to use, by default None.\n            If None, all quantizers are used.\n\n        Returns\n        -------\n        dict\n            A dictionary with the following keys:\n            \"z\" : Tensor[B x D x T]\n                Quantized continuous representation of input\n            \"codes\" : Tensor[B x N x T]\n                Codebook indices for each codebook\n                (quantized discrete representation of input)\n            \"latents\" : Tensor[B x N*D x T]\n                Projected latents (continuous representation of input before quantization)\n            \"vq/commitment_loss\" : Tensor[1]\n                Commitment loss to train encoder to predict vectors closer to codebook\n                entries\n            \"vq/codebook_loss\" : Tensor[1]\n                Codebook loss to update the codebook\n            \"length\" : int\n                Number of 
samples in input audio\n            \"audio\" : Tensor[B x 1 x length]\n                Decoded audio data.\n        \"\"\"\n        length = audio_data.shape[-1]\n        audio_data = self.preprocess(audio_data, sample_rate)\n        vq_results = self.encode(audio_data, n_quantizers, **kwargs)\n        z = vq_results[0] if isinstance(vq_results, tuple) else vq_results.z\n        x = self.decode(z)\n        return x[..., :length], vq_results\n\n\nif __name__ == \"__main__\":\n    import hydra\n    import numpy as np\n    import soundfile as sf\n    import torch\n    from omegaconf import OmegaConf\n\n    # 配置路径\n    config_path = \"fish_speech/configs/modded_dac_vq.yaml\"\n    checkpoint_path = \"checkpoints/s2-pro/codec.pth\"\n    codes_path = \"./output/codes_0.npy\"  # 你的 codes 文件路径\n    output_path = \"reconstructed_from_codes.wav\"\n    sample_rate = 44100  # 请确保采样率与模型训练时一致\n\n    with torch.inference_mode():\n        # 1. 初始化模型\n        model = hydra.utils.instantiate(OmegaConf.load(config_path))\n        new_sd = torch.load(checkpoint_path, map_location=\"cpu\")\n        model.load_state_dict(new_sd, strict=False)\n        model.cuda()\n        model.eval()\n\n        # 2. 加载外部 codes (.npy)\n        # 预期 shape 通常为 [num_codebooks, seq_len] 或 [1, num_codebooks, seq_len]\n        codes_np = np.load(codes_path)\n        codes_tensor = torch.from_numpy(codes_np).to(torch.long).cuda()\n\n        # 如果 codes 没有 batch 维度，增加一个维度 [1, num_codebooks, seq_len]\n        if len(codes_tensor.shape) == 2:\n            codes_tensor = codes_tensor.unsqueeze(0)\n\n        print(f\"Loaded codes shape: {codes_tensor.shape}\")\n\n        # 3. 直接从 codes 重建音频 (Decoding)\n        # 注意：fish_speech 的 model.from_indices 通常接受的输入是 LongTensor\n        fake_audio = model.from_indices(codes_tensor)\n\n        # 4. 
后处理与保存\n        # fake_audio 形状通常为 [B, C, T]\n        audio_np = fake_audio.squeeze().cpu().numpy()\n\n        # 如果是多声道，转置为 soundfile 要求的 (samples, channels)\n        if len(audio_np.shape) == 2:\n            audio_np = audio_np.T\n\n        sf.write(output_path, audio_np, sample_rate)\n        print(f\"重建完成。音频已保存至: {output_path}\")\n"
  },
  {
    "path": "fish_speech/models/dac/rvq.py",
    "content": "import math\nimport typing as tp\nfrom dataclasses import dataclass\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom dac.nn.quantize import ResidualVectorQuantize\nfrom torch.nn.utils.parametrizations import weight_norm\nfrom torch.nn.utils.parametrize import remove_parametrizations\n\n\ndef unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):\n    \"\"\"Remove padding from x, handling properly zero padding. Only for 1d!\"\"\"\n    padding_left, padding_right = paddings\n    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)\n    assert (padding_left + padding_right) <= x.shape[-1]\n    end = x.shape[-1] - padding_right\n    return x[..., padding_left:end]\n\n\ndef get_extra_padding_for_conv1d(\n    x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0\n) -> int:\n    \"\"\"See `pad_for_conv1d`.\"\"\"\n    length = x.shape[-1]\n    n_frames = (length - kernel_size + padding_total) / stride + 1\n    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)\n    return ideal_length - length\n\n\ndef pad1d(\n    x: torch.Tensor,\n    paddings: tp.Tuple[int, int],\n    mode: str = \"zeros\",\n    value: float = 0.0,\n):\n    \"\"\"Tiny wrapper around F.pad, just to allow for reflect padding on small input.\n    If this is the case, we insert extra 0 padding to the right\n    before the reflection happen.\n    \"\"\"\n    length = x.shape[-1]\n    padding_left, padding_right = paddings\n    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)\n    if mode == \"reflect\":\n        max_pad = max(padding_left, padding_right)\n        extra_pad = 0\n        if length <= max_pad:\n            extra_pad = max_pad - length + 1\n            x = F.pad(x, (0, extra_pad))\n        padded = F.pad(x, paddings, mode, value)\n        end = padded.shape[-1] - extra_pad\n        return padded[..., :end]\n    else:\n        return F.pad(x, 
paddings, mode, value)\n\n\nclass CausalConvNet(nn.Module):\n    def __init__(\n        self,\n        in_channels,\n        out_channels,\n        kernel_size,\n        dilation=1,\n        stride=1,\n        groups=1,\n        padding=None,\n    ):\n        super(CausalConvNet, self).__init__()\n        self.conv = nn.Conv1d(\n            in_channels,\n            out_channels,\n            kernel_size,\n            stride=stride,\n            dilation=dilation,\n            groups=groups,\n        )\n        self.stride = stride\n        self.kernel_size = (kernel_size - 1) * dilation + 1\n        self.dilation = dilation\n        self.padding = self.kernel_size - self.stride\n\n    def forward(self, x):\n        pad = self.padding\n        extra_padding = get_extra_padding_for_conv1d(\n            x, self.kernel_size, self.stride, pad\n        )\n        x = pad1d(x, (pad, extra_padding), mode=\"constant\", value=0)\n        return self.conv(x).contiguous()\n\n    def weight_norm(self, name=\"weight\", dim=0):\n        self.conv = weight_norm(self.conv, name=name, dim=dim)\n        return self\n\n    def remove_weight_norm(self):\n        self.conv = remove_parametrizations(self.conv)\n        return self\n\n\nclass CausalTransConvNet(nn.Module):\n    def __init__(\n        self, in_channels, out_channels, kernel_size, dilation=1, stride=1, padding=None\n    ):\n        super(CausalTransConvNet, self).__init__()\n        self.conv = nn.ConvTranspose1d(\n            in_channels, out_channels, kernel_size, stride=stride, dilation=dilation\n        )\n        self.stride = stride\n        self.kernel_size = kernel_size\n\n    def forward(self, x):\n        x = self.conv(x)\n        pad = self.kernel_size - self.stride\n        padding_right = math.ceil(pad)\n        padding_left = pad - padding_right\n        x = unpad1d(x, (padding_left, padding_right))\n        return x.contiguous()\n\n    def weight_norm(self, name=\"weight\", dim=0):\n        self.conv = 
weight_norm(self.conv, name=name, dim=dim)\n        return self\n\n    def remove_weight_norm(self):\n        self.conv = remove_parametrizations(self.conv)\n        return self\n\n\n# ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py\nclass ConvNeXtBlock(nn.Module):\n    r\"\"\"ConvNeXt Block. There are two equivalent implementations:\n    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)\n    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back\n    We use (2) as we find it slightly faster in PyTorch\n    Args:\n        dim (int): Number of input channels.\n        drop_path (float): Stochastic depth rate. Default: 0.0\n        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.\n        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.\n        kernel_size (int): Kernel size for depthwise conv. Default: 7.\n        dilation (int): Dilation for depthwise conv. 
Default: 1.\n    \"\"\"  # noqa: E501\n\n    def __init__(\n        self,\n        dim: int,\n        layer_scale_init_value: float = 1e-6,\n        mlp_ratio: float = 4.0,\n        kernel_size: int = 7,\n        dilation: int = 1,\n    ):\n        super().__init__()\n        convnet_type = CausalConvNet\n        self.dwconv = convnet_type(\n            dim,\n            dim,\n            kernel_size=kernel_size,\n            # padding=int(dilation * (kernel_size - 1) / 2),\n            groups=dim,\n            dilation=dilation,\n        )  # depthwise conv\n        self.norm = nn.LayerNorm(dim, eps=1e-6)\n        self.pwconv1 = nn.Linear(\n            dim, int(mlp_ratio * dim)\n        )  # pointwise/1x1 convs, implemented with linear layers\n        self.act = nn.GELU()\n        self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)\n        self.gamma = (\n            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)\n            if layer_scale_init_value > 0\n            else None\n        )\n\n    def forward(self, x, apply_residual: bool = True):\n        input = x\n\n        x = self.dwconv(x)\n        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)\n        x = self.norm(x)\n        x = self.pwconv1(x)\n        x = self.act(x)\n        x = self.pwconv2(x)\n\n        if self.gamma is not None:\n            x = self.gamma * x\n\n        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)\n\n        if apply_residual:\n            x = input + x\n\n        return x\n\n\n@dataclass\nclass VQResult:\n    z: torch.Tensor\n    codes: torch.Tensor\n    latents: torch.Tensor\n    codebook_loss: torch.Tensor\n    commitment_loss: torch.Tensor\n    semantic_distill_z: torch.Tensor | None = None\n\n\nclass DownsampleResidualVectorQuantize(nn.Module):\n    def __init__(\n        self,\n        input_dim: int = 1024,\n        n_codebooks: int = 9,\n        codebook_dim: int = 8,\n        quantizer_dropout: float = 0.5,\n        codebook_size: int = 
1024,\n        semantic_codebook_size: int = 4096,\n        downsample_factor: tuple[int] = (2, 2),\n        downsample_dims: tuple[int] | None = None,\n        pre_module: nn.Module | None = None,\n        post_module: nn.Module | None = None,\n        semantic_predictor_module: nn.Module | None = None,\n    ):\n        super().__init__()\n\n        if downsample_dims is None:\n            downsample_dims = [input_dim for _ in range(len(downsample_factor))]\n\n        all_dims = (input_dim,) + tuple(downsample_dims)\n\n        self.semantic_quantizer = ResidualVectorQuantize(\n            input_dim=input_dim,\n            n_codebooks=1,\n            codebook_size=semantic_codebook_size,\n            codebook_dim=codebook_dim,\n            quantizer_dropout=0.0,\n        )\n\n        self.quantizer = ResidualVectorQuantize(\n            input_dim=input_dim,\n            n_codebooks=n_codebooks,\n            codebook_size=codebook_size,\n            codebook_dim=codebook_dim,\n            quantizer_dropout=quantizer_dropout,\n        )\n\n        self.downsample_factor = downsample_factor\n        self.downsample_dims = downsample_dims\n\n        convnet_type = CausalConvNet\n        transconvnet_type = CausalTransConvNet\n\n        self.downsample = nn.Sequential(\n            *[\n                nn.Sequential(\n                    convnet_type(\n                        all_dims[idx],\n                        all_dims[idx + 1],\n                        kernel_size=factor,\n                        stride=factor,\n                    ),\n                    ConvNeXtBlock(dim=all_dims[idx + 1]),\n                )\n                for idx, factor in enumerate(downsample_factor)\n            ]\n        )\n\n        self.upsample = nn.Sequential(\n            *[\n                nn.Sequential(\n                    transconvnet_type(\n                        all_dims[idx + 1],\n                        all_dims[idx],\n                        kernel_size=factor,\n          
              stride=factor,\n                    ),\n                    ConvNeXtBlock(dim=all_dims[idx]),\n                )\n                for idx, factor in reversed(list(enumerate(downsample_factor)))\n            ]\n        )\n        self.apply(self._init_weights)\n        self.pre_module = (\n            pre_module if pre_module is not None else nn.Identity()\n        )  # leave for transformer, LSTM or Mamba or something else\n        self.post_module = post_module if post_module is not None else nn.Identity()\n        self.semantic_predictor_module = (\n            semantic_predictor_module\n            if semantic_predictor_module is not None\n            else nn.Identity()\n        )\n\n    def _init_weights(self, m):\n        if isinstance(m, (nn.Conv1d, nn.Linear)):\n            nn.init.trunc_normal_(m.weight, std=0.02)\n            nn.init.constant_(m.bias, 0)\n\n    def forward(\n        self, z, n_quantizers: int = None, semantic_len: torch.Tensor = None, **kwargs\n    ):\n        # z: (B, D, T)\n        original_shape = z.shape\n        if semantic_len is None:\n            semantic_len = torch.LongTensor([z.shape[-1]])\n        z = self.downsample(z)\n        z = self.pre_module(z)  # B, T, D\n        (\n            semantic_z,\n            semantic_codes,\n            semantic_latents,\n            semantic_commitment_loss,\n            semantic_codebook_loss,\n        ) = self.semantic_quantizer(z)\n        residual_z = z - semantic_z\n        residual_z, codes, latents, commitment_loss, codebook_loss = self.quantizer(\n            residual_z, n_quantizers=n_quantizers\n        )\n        z = semantic_z + residual_z\n        commitment_loss = commitment_loss + semantic_commitment_loss\n        codebook_loss = codebook_loss + semantic_codebook_loss\n        codes = torch.cat([semantic_codes, codes], dim=1)\n        latents = torch.cat([semantic_latents, latents], dim=1)\n        z = self.post_module(z)\n        z = self.upsample(z)\n        # 
z: (B, D, T)\n\n        # semantic distillation (disabled here since only used in training)\n        # semantic_distill_z = self.semantic_predictor_module(semantic_z, semantic_len).mT  # wav2vec target is B, T, D\n\n        # Pad or crop z to match original shape\n        diff = original_shape[-1] - z.shape[-1]\n        right = 0\n        left = abs(diff) - right\n\n        if diff > 0:\n            z = F.pad(z, (left, right))\n        elif diff < 0:\n            z = z[..., left:]\n\n        results = VQResult(\n            z=z,\n            codes=codes,\n            latents=latents,\n            commitment_loss=commitment_loss,\n            codebook_loss=codebook_loss,\n        )\n\n        return results\n\n    # def encode(self, z):\n    #     z = self.downsample(z)\n    #     z = self.pre_module(z)\n    #     _, indices, _, _, _ = self.quantizer(z.mT)\n    #     indices = rearrange(indices, \"g b l r -> b (g r) l\")\n    #     return indices\n    #\n    def decode(self, indices: torch.Tensor):\n        # indices = rearrange(indices, \"b (g r) l -> g b l r\", g=self.residual_fsq.groups)\n        indices[:, 0] = torch.clamp(\n            indices[:, 0], max=self.semantic_quantizer.codebook_size - 1\n        )\n        indices[:, 1:] = torch.clamp(\n            indices[:, 1:], max=self.quantizer.codebook_size - 1\n        )\n\n        z_q_semantic = self.semantic_quantizer.from_codes(indices[:, :1])[0]\n        z_q_residual = self.quantizer.from_codes(indices[:, 1:])[0]\n        z_q = z_q_semantic + z_q_residual\n        z_q = self.post_module(z_q)\n        z_q = self.upsample(z_q)\n        return z_q\n\n    # def from_latents(self, latents: torch.Tensor):\n    #     z_q, z_p, codes = super().from_latents(latents)\n    #     z_q = self.upsample(z_q)\n    #     return z_q, z_p, codes\n\n\nif __name__ == \"__main__\":\n    rvq = DownsampleResidualVectorQuantize(\n        input_dim=512,\n        n_codebooks=8,\n        codebook_dim=8,\n        codebook_size=1024,\n    
    quantizer_dropout=0.5,\n        downsample_factor=[2, 2],\n    )\n    rvq.eval()\n    x = torch.randn(2, 512, 442)\n\n    result = rvq(x)\n    print(rvq)\n    print(result.latents.shape, result.codes.shape, result.z.shape)\n\n    # y = rvq.from_codes(result.codes)\n    # print(y[0].shape)\n\n    # y = rvq.from_latents(\n\n    result1 = rvq(x[:, :, :40])\n    print(result1.latents.shape, result1.codes.shape, result1.z.shape)\n\n    assert torch.allclose(result.z[:, :, :40], result1.z, atol=1e-8)\n    print(\"Success\")\n"
  },
  {
    "path": "fish_speech/models/text2semantic/__init__.py",
    "content": ""
  },
  {
    "path": "fish_speech/models/text2semantic/inference.py",
    "content": "import os\nimport queue\nimport re\nimport threading\nimport time\nimport traceback\nfrom copy import deepcopy\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Callable, Literal, Optional, Tuple, Union\n\nimport click\nimport numpy as np\nimport torch\nimport torch._inductor.config\nfrom loguru import logger\nfrom tqdm import tqdm\n\nfrom fish_speech.content_sequence import (\n    TextPart,\n    VQPart,\n)\nfrom fish_speech.conversation import Conversation, Message\nfrom fish_speech.tokenizer import IM_END_TOKEN\n\nos.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\ntorch._inductor.config.coordinate_descent_tuning = True\ntorch._inductor.config.triton.unique_kernel_names = True\n\nif hasattr(torch._inductor.config, \"fx_graph_cache\"):\n    torch._inductor.config.fx_graph_cache = True\n\n\nfrom torch.nn.attention import SDPBackend, sdpa_kernel\n\nfrom fish_speech.models.text2semantic.llama import (\n    BaseTransformer,\n    DualARTransformer,\n    NaiveTransformer,\n)\n\n\ndef multinomial_sample_one_no_sync(probs_sort):\n    q = torch.rand_like(probs_sort)\n    q = -torch.log(q)\n    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)\n\n\nRAS_WIN_SIZE = 10  # window for Repetition Aware Sampling\nRAS_HIGH_TEMP = 1.0\nRAS_HIGH_TOP_P = 0.9\n\n\ndef logits_to_probs(\n    logits,\n    temperature: torch.Tensor,\n    top_p: torch.Tensor,\n    top_k: int,  # 注意: 我看到你传进来的是 int，这很关键\n) -> torch.Tensor:\n    sorted_logits, sorted_indices = torch.sort(logits, descending=True)\n    cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)\n\n    indices = torch.arange(sorted_logits.shape[-1], device=sorted_logits.device)\n    top_k_mask = indices >= top_k\n    sorted_indices_to_remove = (cum_probs > top_p) | top_k_mask\n    sorted_indices_to_remove[0] = False  # 单元素修改问题不大，或者写成 | (indices != 0)\n\n    indices_to_remove = sorted_indices_to_remove.scatter(\n        dim=-1, 
index=sorted_indices, src=sorted_indices_to_remove\n    )\n    logits = torch.where(\n        indices_to_remove, float(\"-Inf\"), logits\n    )  # 同样替换 masked_fill_ 为 torch.where\n    logits = logits / torch.clip(temperature, min=1e-5)\n\n    probs = torch.nn.functional.softmax(logits, dim=-1)\n    return probs\n\n\ndef sample(\n    logits,\n    temperature: torch.Tensor,\n    top_p: torch.Tensor,\n    top_k: int,\n) -> Tuple[torch.Tensor, torch.Tensor]:\n    probs = logits_to_probs(\n        logits=logits[0, -1],\n        temperature=temperature,\n        top_p=top_p,\n        top_k=top_k,\n    )\n    idx_next = multinomial_sample_one_no_sync(probs)\n    return idx_next, probs\n\n\ndef decode_one_token_ar(\n    model: DualARTransformer,\n    x: torch.Tensor,\n    input_pos: torch.Tensor,\n    temperature: torch.Tensor,\n    top_p: torch.Tensor,\n    top_k: int,\n    semantic_logit_bias: torch.Tensor,\n    audio_masks: torch.Tensor,\n    audio_parts: torch.Tensor,\n    previous_tokens: Optional[torch.Tensor] = None,\n) -> torch.Tensor:\n    forward_result = model.forward_generate(\n        x,\n        input_pos,\n        audio_masks=audio_masks,\n        audio_parts=audio_parts,\n    )\n    logits = forward_result.logits  # (1, 1, vocab_size)\n    hidden_states = forward_result.hidden_states\n\n    # Apply constrained decoding: only allow semantic tokens + im_end\n    biased_logits = logits + semantic_logit_bias\n\n    # Normal sample\n    main_token_normal = sample(\n        biased_logits, temperature=temperature, top_p=top_p, top_k=top_k\n    )[0]\n\n    # RAS: also sample with high temp to use as fallback if token repeats\n    high_temp = torch.tensor(\n        RAS_HIGH_TEMP, device=temperature.device, dtype=temperature.dtype\n    )\n    high_top_p = torch.tensor(RAS_HIGH_TOP_P, device=top_p.device, dtype=top_p.dtype)\n    main_token_high = sample(\n        biased_logits, temperature=high_temp, top_p=high_top_p, top_k=top_k\n    )[0]\n\n    # Use high-temp 
sample if: token is semantic AND token is in previous window\n    if previous_tokens is not None:\n        in_window = (previous_tokens[0] == main_token_normal).any()\n        # Use tensor ops (&, torch.where) instead of Python (and, if) — torch.compile requires no data-dependent branching\n        is_semantic = (main_token_normal >= model.config.semantic_begin_id) & (\n            main_token_normal <= model.config.semantic_end_id\n        )\n        should_use_high = in_window & is_semantic\n        main_token_normal = torch.where(\n            should_use_high, main_token_high, main_token_normal\n        )\n\n    codebooks = [main_token_normal]\n\n    input_pos = torch.tensor([0], device=hidden_states.device, dtype=torch.long)\n    model.forward_generate_fast(hidden_states, input_pos)\n\n    a = codebooks[0] - model.config.semantic_begin_id\n    a = torch.clamp(a, min=0, max=model.config.codebook_size - 1)\n\n    hidden_states = model.fast_embeddings(a)\n    codebooks.append(a)\n\n    for codebook_idx in range(1, model.config.num_codebooks):\n        input_pos = torch.tensor(\n            [codebook_idx], device=hidden_states.device, dtype=torch.long\n        )\n        logits = model.forward_generate_fast(hidden_states, input_pos)\n\n        short_logits = logits  # DualAR predicts config.codebook_size number of tokens\n\n        # Convert logits to probs (no constrain for fast codebooks)\n        a = sample(\n            short_logits,\n            temperature=temperature,\n            top_p=top_p,\n            top_k=top_k,\n        )[0]\n\n        hidden_states = model.fast_embeddings(a)\n        codebooks.append(a)\n\n    codebooks = torch.stack(codebooks, dim=1)\n\n    # Only delete references, let Python GC handle cleanup\n    del logits, hidden_states, forward_result\n\n    return codebooks.T\n\n\ndef decode_n_tokens(\n    model: DualARTransformer,\n    cur_token: torch.Tensor,\n    input_pos: torch.Tensor,\n    num_new_tokens: int,\n    temperature: 
torch.Tensor,\n    top_p: torch.Tensor,\n    top_k: int,\n    semantic_logit_bias: torch.Tensor,\n    audio_masks: torch.Tensor,\n    audio_parts: torch.Tensor,\n    decode_one_token=decode_one_token_ar,\n):\n    # Rolling window for RAS (Repetition Aware Sampling)\n    previous_tokens = torch.zeros(\n        (model.config.num_codebooks + 1, RAS_WIN_SIZE),\n        dtype=torch.int,\n        device=cur_token.device,\n    )\n    # Accumulate all generated tokens (the actual output)\n    new_tokens = []\n\n    # [MODIFIED] Pre-fetch ID for efficiency loop\n    im_end_id = model.tokenizer.get_token_id(IM_END_TOKEN)\n\n    for i in tqdm(range(num_new_tokens)):\n        with sdpa_kernel(SDPBackend.MATH):\n            next_token = decode_one_token(\n                model=model,\n                x=cur_token,\n                input_pos=input_pos,\n                previous_tokens=previous_tokens,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n                semantic_logit_bias=semantic_logit_bias,\n                audio_masks=audio_masks,\n                audio_parts=audio_parts,\n            ).clone()\n\n        input_pos += 1\n        cur_token = next_token.view(1, model.config.num_codebooks + 1, -1)\n        # Roll RAS window left and insert new token at end\n        previous_tokens = previous_tokens.roll(-1, dims=1)\n        previous_tokens[:, -1] = next_token.view(model.config.num_codebooks + 1, -1)[\n            :, 0\n        ]\n        new_tokens.append(next_token)\n\n        if cur_token[0, 0, -1] == im_end_id:\n            break\n\n    del cur_token\n\n    return torch.cat(new_tokens, dim=1)\n\n\n@torch.no_grad()\n@torch.inference_mode()\ndef generate(\n    *,\n    model: DualARTransformer,\n    prompt: torch.Tensor,\n    max_new_tokens: int,\n    audio_masks: torch.Tensor,\n    audio_parts: torch.Tensor,\n    decode_one_token=decode_one_token_ar,\n    num_samples: int = 1,\n    **sampling_kwargs,\n):\n    
\"\"\"\n    Takes a conditioning sequence (prompt) as input and continues to generate as many tokens as requested.\n    \"\"\"\n\n    # create an empty tensor of the expected final shape and fill in the current tokens\n    T = prompt.size(1)\n    prompt = prompt[None].repeat(num_samples, 1, 1)\n\n    if T >= model.config.max_seq_len:\n        raise ValueError(\n            f\"Input sequence length {T} exceeds max_seq_len {model.config.max_seq_len}\"\n        )\n\n    if max_new_tokens:\n        if T + max_new_tokens > model.config.max_seq_len:\n            max_new_tokens = model.config.max_seq_len - T\n\n        T_new = T + max_new_tokens\n    else:\n        T_new = model.config.max_seq_len\n        max_new_tokens = T_new - T\n\n    device = prompt.device\n    dtype = next(\n        model.parameters()\n    ).dtype  # model weight dtype (bfloat16), NOT prompt dtype (int32)\n\n    # Critical fix: Only set up cache on first run or when necessary\n    if not hasattr(model, \"_cache_setup_done\") or not model._cache_setup_done:\n        with torch.device(device):\n            model.setup_caches(\n                max_batch_size=1,  # Fixed to 1, avoid dynamic changes\n                max_seq_len=model.config.max_seq_len,\n                dtype=next(model.parameters()).dtype,\n            )\n        model._cache_setup_done = True\n\n    codebook_dim = 1 + model.config.num_codebooks\n\n    # Create new tensor each time, but try to reuse memory\n    input_pos = torch.arange(0, T, device=device, dtype=torch.long)\n    empty = torch.empty(\n        (codebook_dim, model.config.max_seq_len), dtype=prompt.dtype, device=device\n    )\n    empty[:, :T] = prompt\n    seq = empty\n\n    temp_val = sampling_kwargs.get(\"temperature\", 1.0)\n    top_p_val = sampling_kwargs.get(\"top_p\", 0.9)\n    top_k_val = sampling_kwargs.get(\"top_k\", 30)\n\n    temperature = torch.tensor(temp_val, device=device, dtype=dtype)\n    top_p = torch.tensor(top_p_val, device=device, dtype=dtype)\n\n    
# Build semantic logit bias: 0 for semantic tokens + im_end, -inf for all others\n    vocab_size = model.config.vocab_size\n    semantic_logit_bias = torch.full(\n        (1, 1, vocab_size), float(\"-inf\"), device=device, dtype=dtype\n    )\n\n    # [MODIFIED] Use config for semantic range\n    semantic_logit_bias[\n        0, 0, model.config.semantic_begin_id : model.config.semantic_end_id + 1\n    ] = 0.0\n\n    # [MODIFIED] Use tokenizer.get_token_id (Wrapper method)\n    semantic_logit_bias[0, 0, model.tokenizer.get_token_id(IM_END_TOKEN)] = 0.0\n\n    prefill_decode = decode_one_token_ar\n\n    first_token = prefill_decode(\n        model,\n        prompt.view(1, codebook_dim, -1),\n        input_pos,\n        temperature,\n        top_p,\n        top_k_val,\n        semantic_logit_bias,\n        audio_masks,\n        audio_parts,\n    )\n    seq[:, T : T + 1] = first_token\n\n    # Recreate input_pos\n    input_pos = torch.tensor([T], device=device, dtype=torch.int)\n\n    x = decode_n_tokens(\n        model,\n        first_token.view(1, codebook_dim, -1),\n        input_pos,\n        max_new_tokens - 1,\n        temperature=temperature,\n        top_p=top_p,\n        top_k=top_k_val,\n        semantic_logit_bias=semantic_logit_bias,\n        audio_masks=audio_masks,\n        audio_parts=audio_parts,\n        decode_one_token=decode_one_token,\n    )\n    seq = seq[:, : T + 1 + x.size(1)]\n    seq[:, T + 1 :] = x\n\n    # Clean up temporary variables\n    del first_token, x, prompt, empty, input_pos\n\n    return seq\n\n\ndef init_model(checkpoint_path, device, precision, compile=False):\n    model = DualARTransformer.from_pretrained(checkpoint_path, load_weights=True)\n\n    model = model.to(device=device, dtype=precision)\n    logger.info(f\"Restored model from checkpoint\")\n\n    if isinstance(model, DualARTransformer):\n        decode_one_token = decode_one_token_ar\n        # prefill_n_tokens = decode_one_token_ar\n        logger.info(\"Using 
DualARTransformer\")\n    else:\n        raise ValueError(\"Unsupported model type\")\n\n    # Pre-create fixed parameter tensors to avoid runtime creation\n    model.fixed_temperature = torch.tensor(0.7, device=device, dtype=torch.float)\n    model.fixed_top_p = torch.tensor(0.7, device=device, dtype=torch.float)\n    model.fixed_repetition_penalty = torch.tensor(1.5, device=device, dtype=torch.float)\n\n    # Mark whether cache has been initialized\n    model._cache_setup_done = False\n\n    if compile:\n        logger.info(\"Compiling function...\")\n        decode_one_token = torch.compile(\n            decode_one_token,\n            backend=\"inductor\" if torch.cuda.is_available() else \"aot_eager\",\n            mode=\"default\" if torch.cuda.is_available() else None,\n            fullgraph=True,\n        )\n\n    return model.eval(), decode_one_token\n\n\n@torch.inference_mode()\ndef load_codec_model(codec_checkpoint_path, device, precision=torch.bfloat16):\n    \"\"\"Load the DAC codec model for audio encoding/decoding.\"\"\"\n    from hydra.utils import instantiate\n    from omegaconf import OmegaConf\n\n    config_path = Path(__file__).parent.parent.parent / \"configs\" / \"modded_dac_vq.yaml\"\n    cfg = OmegaConf.load(str(config_path))\n    codec = instantiate(cfg)\n\n    state_dict = torch.load(codec_checkpoint_path, map_location=\"cpu\")\n    if \"state_dict\" in state_dict:\n        state_dict = state_dict[\"state_dict\"]\n    if any(\"generator\" in k for k in state_dict):\n        state_dict = {\n            k.replace(\"generator.\", \"\"): v\n            for k, v in state_dict.items()\n            if \"generator.\" in k\n        }\n    codec.load_state_dict(state_dict, strict=False)\n    codec.eval()\n    codec.to(device=device, dtype=precision)\n    return codec\n\n\n@torch.inference_mode()\ndef encode_audio(audio_path, codec, device):\n    \"\"\"Encode an audio file to VQ codes.\"\"\"\n    import torchaudio\n\n    wav, sr = 
torchaudio.load(str(audio_path))\n    if wav.shape[0] > 1:\n        wav = wav.mean(dim=0, keepdim=True)\n    wav = torchaudio.functional.resample(wav.to(device), sr, codec.sample_rate)[0]\n\n    # Match codec model dtype (e.g. bfloat16)\n    model_dtype = next(codec.parameters()).dtype\n    audios = wav[None, None].to(dtype=model_dtype)  # (1, 1, T)\n    audio_lengths = torch.tensor([len(wav)], device=device, dtype=torch.long)\n\n    indices, feature_lengths = codec.encode(audios, audio_lengths)\n    return indices[0, :, : feature_lengths[0]]  # (num_codebooks, T)\n\n\n@torch.inference_mode()\ndef decode_to_audio(codes, codec):\n    \"\"\"Decode VQ codes to audio waveform.\"\"\"\n    # codes: (num_codebooks, T) -> (1, num_codebooks, T)\n    audio = codec.from_indices(codes[None])\n    return audio[0, 0]  # (T,) mono waveform\n\n\n@dataclass\nclass GenerateResponse:\n    action: Literal[\"sample\", \"next\"]\n    codes: Optional[torch.Tensor] = None\n    text: Optional[str] = None\n\n\ndef split_text_by_speaker(text: str) -> list[str]:\n    \"\"\"\n    Split text into turns based on <|speaker:X|> tags.\n\n    Args:\n        text: The full text with speaker tags\n\n    Returns:\n        List of speaker turns, each starting with <|speaker:X|>\n    \"\"\"\n    pattern = r\"(<\\|speaker:\\d+\\|>)\"\n    parts = re.split(pattern, text)\n\n    turns = []\n    i = 0\n    while i < len(parts):\n        part = parts[i].strip()\n        if re.match(pattern, part):\n            if i + 1 < len(parts):\n                turn = part + parts[i + 1]\n                turns.append(turn.strip())\n                i += 2\n            else:\n                turns.append(part)\n                i += 1\n        else:\n            i += 1\n\n    return turns\n\n\ndef group_turns_into_batches(\n    turns: list[str], max_speakers: int = 3, max_bytes: int = 300\n) -> list[str]:\n    \"\"\"\n    Group turns into batches based on speaker count or byte limit.\n\n    Args:\n        turns: List of 
speaker turns\n        max_speakers: Maximum number of speakers per batch (default 3)\n        max_bytes: Maximum UTF-8 bytes per batch (default 300)\n\n    Returns:\n        List of batched text strings\n    \"\"\"\n    batches = []\n    current_batch = []\n    current_bytes = 0\n\n    for turn in turns:\n        turn_bytes = len(turn.encode(\"utf-8\"))\n\n        would_exceed_speakers = len(current_batch) >= max_speakers\n        would_exceed_bytes = current_bytes + turn_bytes > max_bytes and current_batch\n\n        if would_exceed_speakers or would_exceed_bytes:\n            batches.append(\"\\n\".join(current_batch))\n            current_batch = [turn]\n            current_bytes = turn_bytes\n        else:\n            current_batch.append(turn)\n            current_bytes += turn_bytes\n\n    if current_batch:\n        batches.append(\"\\n\".join(current_batch))\n\n    return batches\n\n\ndef generate_long(\n    *,\n    model,\n    device: Union[str, torch.device],\n    decode_one_token: Callable,\n    text: str,\n    num_samples: int = 1,\n    max_new_tokens: int = 0,\n    top_p: float = 0.9,\n    top_k: int = 30,\n    repetition_penalty: float = 1.1,\n    temperature: float = 1.0,\n    compile: bool = False,\n    iterative_prompt: bool = True,\n    chunk_length: int = 512,\n    prompt_text: Optional[Union[str, list[str]]] = None,\n    prompt_tokens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,\n):\n    assert 0 < top_p <= 1, \"top_p must be in (0, 1]\"\n    assert 0 < temperature < 2, \"temperature must be in (0, 2)\"\n\n    use_prompt = bool(prompt_text) and bool(prompt_tokens)\n    if use_prompt and isinstance(prompt_text, str):\n        prompt_text = [prompt_text]\n        prompt_tokens = [prompt_tokens]\n\n    if use_prompt:\n        assert len(prompt_text) == len(\n            prompt_tokens\n        ), \"Prompt text and tokens must have the same length\"\n\n    if prompt_tokens:\n        prompt_tokens = [i.cpu() for i in prompt_tokens]\n\n  
  model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)\n    tokenizer = model.tokenizer\n    max_length = model.config.max_seq_len\n\n    # Build base conversation with system message\n    base_conversation = Conversation()\n\n    if use_prompt:\n        # Auto-add speaker tags to prompt texts that don't have them\n        tagged_prompt_text = []\n        for i, t in enumerate(prompt_text):\n            if not re.search(r\"<\\|speaker:\\d+\\|>\", t):\n                tagged_prompt_text.append(f\"<|speaker:{i}|>{t}\")\n            else:\n                tagged_prompt_text.append(t)\n\n        system_parts = [\n            TextPart(\n                text=\"convert the provided text to speech reference to the following:\\n\\nText:\\n\",\n                cal_loss=False,\n            ),\n        ]\n        reference_text = \"\\n\".join(tagged_prompt_text)\n        system_parts.append(TextPart(text=reference_text, cal_loss=False))\n        system_parts.append(TextPart(text=\"\\n\\nSpeech:\\n\", cal_loss=False))\n        all_codes = torch.cat([c for c in prompt_tokens], dim=1)\n        system_parts.append(VQPart(codes=all_codes, cal_loss=False))\n        # torch.save(all_codes, \"debug_vq_codes.pt\")\n    else:\n        system_parts = [\n            TextPart(text=\"convert the provided text to speech\", cal_loss=False)\n        ]\n\n    base_conversation.append(\n        Message(\n            role=\"system\",\n            parts=system_parts,\n            cal_loss=False,\n            add_im_start=True,\n            add_im_end=True,\n        )\n    )\n\n    # Split text by speaker and group into batches\n    turns = split_text_by_speaker(text)\n    if turns:\n        batches = group_turns_into_batches(\n            turns, max_speakers=5, max_bytes=chunk_length\n        )\n    else:\n        batches = [text]\n\n    logger.info(f\"Split into {len(turns)} turns, grouped into {len(batches)} batches\")\n\n    for sample_idx in range(num_samples):\n        
if torch.cuda.is_available():\n            torch.cuda.synchronize()\n\n        t0 = time.perf_counter()\n\n        # Deep copy base conversation for this sample\n        conversation = deepcopy(base_conversation)\n\n        for batch_idx, batch_text in enumerate(batches):\n            logger.info(\n                f\"--- Sample {sample_idx}, Batch {batch_idx} \"\n                f\"({len(batch_text.encode('utf-8'))} bytes) ---\"\n            )\n            logger.info(f\"Batch text: {batch_text}\")\n\n            # Add user message\n            conversation.append(\n                Message(\n                    role=\"user\",\n                    parts=[TextPart(text=batch_text, cal_loss=False)],\n                    cal_loss=False,\n                    add_im_start=True,\n                    add_im_end=True,\n                )\n            )\n\n            # Deep copy for generation (don't pollute original conversation)\n            conversation_gen = deepcopy(conversation)\n            conversation_gen.append(\n                Message(\n                    role=\"assistant\",\n                    parts=[],\n                    cal_loss=False,\n                    modality=\"voice\",\n                    add_im_start=True,\n                    add_im_end=False,\n                )\n            )\n\n            logger.info(\"Visualizing prompt structure:\")\n            conversation_gen.visualize(\n                tokenizer,\n                merge_audio_tokens=True,\n                merge_semantic_tokens=True,\n            )\n\n            encoded, audio_masks, audio_parts = conversation_gen.encode_for_inference(\n                tokenizer, num_codebooks=model.config.num_codebooks\n            )\n\n            logger.info(f\"Encoded prompt shape: {encoded.shape}\")\n            if audio_parts is not None:\n                logger.info(f\"Audio parts shape: {audio_parts.shape}\")\n            if audio_masks is not None:\n                logger.info(\n                  
  f\"Audio masks non-zero count: {torch.count_nonzero(audio_masks)}\"\n                )\n\n            if encoded.size(1) > max_length - 2048:\n                raise ValueError(\n                    f\"Prompt is too long: {encoded.size(1)} > {max_length - 2048}\"\n                )\n\n            encoded = encoded.to(device=device)\n            prompt_length = encoded.size(1)\n\n            y = generate(\n                model=model,\n                prompt=encoded,\n                max_new_tokens=max_new_tokens,\n                audio_masks=audio_masks,\n                audio_parts=audio_parts,\n                decode_one_token=decode_one_token,\n                temperature=temperature,\n                top_p=top_p,\n                top_k=top_k,\n            )\n\n            if sample_idx == 0 and batch_idx == 0 and compile:\n                logger.info(f\"Compilation time: {time.perf_counter() - t0:.2f} seconds\")\n\n            if torch.cuda.is_available():\n                torch.cuda.synchronize()\n\n            t_batch = time.perf_counter() - t0\n            tokens_generated = y.size(1) - prompt_length\n            tokens_sec = tokens_generated / t_batch if t_batch > 0 else 0\n            logger.info(\n                f\"Batch {batch_idx}: Generated {tokens_generated} tokens in \"\n                f\"{t_batch:.02f} seconds, {tokens_sec:.02f} tokens/sec\"\n            )\n            logger.info(\n                f\"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s\"\n            )\n\n            # Extract generated codes\n            codes = y[1:, prompt_length:-1].clone()\n            assert (codes >= 0).all(), f\"Negative code found: {codes}\"\n\n            # Add assistant message with generated codes back to conversation\n            conversation.append(\n                Message(\n                    role=\"assistant\",\n                    parts=[VQPart(codes=codes.cpu(), cal_loss=False)],\n                    cal_loss=False,\n                 
   modality=\"voice\",\n                    add_im_start=True,\n                    add_im_end=True,\n                )\n            )\n\n            yield GenerateResponse(action=\"sample\", codes=codes, text=batch_text)\n\n            # Cleanup\n            del y, encoded\n\n        if torch.cuda.is_available():\n            logger.info(\n                f\"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB\"\n            )\n\n        yield GenerateResponse(action=\"next\")\n\n\n@dataclass\nclass WrappedGenerateResponse:\n    status: Literal[\"success\", \"error\"]\n    response: Optional[Union[GenerateResponse, Exception]] = None\n\n\n@dataclass\nclass GenerateRequest:\n    request: dict\n    response_queue: queue.Queue\n\n\ndef launch_thread_safe_queue(\n    checkpoint_path,\n    device,\n    precision,\n    compile: bool = False,\n):\n    input_queue = queue.Queue()\n    init_event = threading.Event()\n\n    def worker():\n        model, decode_one_token = init_model(\n            checkpoint_path, device, precision, compile=compile\n        )\n        with torch.device(device):\n            model.setup_caches(\n                max_batch_size=1,\n                max_seq_len=model.config.max_seq_len,\n                dtype=next(model.parameters()).dtype,\n            )\n        init_event.set()\n\n        while True:\n            item: GenerateRequest | None = input_queue.get()\n            if item is None:\n                break\n\n            kwargs = item.request\n            response_queue = item.response_queue\n\n            try:\n                for chunk in generate_long(\n                    model=model, decode_one_token=decode_one_token, **kwargs\n                ):\n                    response_queue.put(\n                        WrappedGenerateResponse(status=\"success\", response=chunk)\n                    )\n\n                # Only clear cache after complete request batch\n                if torch.cuda.is_available():\n              
      torch.cuda.empty_cache()\n\n            except Exception as e:\n                logger.error(traceback.format_exc())\n                response_queue.put(WrappedGenerateResponse(status=\"error\", response=e))\n                # Clear cache on error\n                if torch.cuda.is_available():\n                    torch.cuda.empty_cache()\n\n    threading.Thread(target=worker, daemon=True).start()\n    init_event.wait()\n\n    return input_queue\n\n\n@click.command()\n@click.option(\n    \"--text\",\n    type=str,\n    default=\"<|speaker:0|>你说的对, 但是原神是一款由米哈游自主研发的开放世界手游.\",\n)\n@click.option(\"--prompt-text\", type=str, default=None, multiple=True)\n@click.option(\n    \"--prompt-tokens\",\n    type=click.Path(path_type=Path, exists=True),\n    default=None,\n    multiple=True,\n)\n@click.option(\n    \"--prompt-audio\",\n    type=click.Path(path_type=Path, exists=True),\n    default=None,\n    multiple=True,\n)\n@click.option(\"--output\", type=click.Path(path_type=Path), default=None)\n@click.option(\"--num-samples\", type=int, default=1)\n@click.option(\"--max-new-tokens\", type=int, default=0)\n@click.option(\"--top-p\", type=float, default=0.9)\n@click.option(\"--top-k\", type=int, default=30)\n@click.option(\"--temperature\", type=float, default=1.0)\n@click.option(\n    \"--checkpoint-path\",\n    type=click.Path(path_type=Path, exists=True),\n    default=\"checkpoints/s2-pro\",\n)\n@click.option(\"--device\", type=str, default=\"cuda\")\n@click.option(\"--compile/--no-compile\", default=False)\n@click.option(\"--seed\", type=int, default=42)\n@click.option(\"--half/--no-half\", default=False)\n@click.option(\"--iterative-prompt/--no-iterative-prompt\", default=True)\n@click.option(\"--chunk-length\", type=int, default=300)\n@click.option(\"--output-dir\", type=Path, default=\"output\")\ndef main(\n    text: str,\n    prompt_text: Optional[tuple[str, ...]],\n    prompt_tokens: Optional[tuple[Path, ...]],\n    prompt_audio: Optional[tuple[Path, ...]],\n 
   output: Optional[Path],\n    num_samples: int,\n    max_new_tokens: int,\n    top_p: float,\n    top_k: int,\n    temperature: float,\n    checkpoint_path: Path,\n    device: str,\n    compile: bool,\n    seed: int,\n    half: bool,\n    iterative_prompt: bool,\n    chunk_length: int,\n    output_dir: Path,\n) -> None:\n    os.makedirs(output_dir, exist_ok=True)\n    precision = torch.half if half else torch.bfloat16\n\n    if prompt_text and not prompt_audio and not prompt_tokens:\n        raise ValueError(\n            \"--prompt-text requires either --prompt-audio or --prompt-tokens\"\n        )\n    if prompt_text and prompt_tokens and len(prompt_text) != len(prompt_tokens):\n        raise ValueError(\n            f\"Number of prompt text ({len(prompt_text)}) and prompt tokens ({len(prompt_tokens)}) should be the same\"\n        )\n    if prompt_text and prompt_audio and len(prompt_text) != len(prompt_audio):\n        raise ValueError(\n            f\"Number of prompt text ({len(prompt_text)}) and prompt audio ({len(prompt_audio)}) should be the same\"\n        )\n\n    logger.info(\"Loading model ...\")\n    t0 = time.time()\n    model, decode_one_token = init_model(\n        checkpoint_path, device, precision, compile=compile\n    )\n    with torch.device(device):\n        model.setup_caches(\n            max_batch_size=1,\n            max_seq_len=model.config.max_seq_len,\n            dtype=next(model.parameters()).dtype,\n        )\n    if torch.cuda.is_available():\n        torch.cuda.synchronize()\n\n    logger.info(f\"Time to load model: {time.time() - t0:.02f} seconds\")\n\n    codec = None\n    codec_checkpoint = checkpoint_path / \"codec.pth\"\n\n    # Handle prompt: --prompt-audio takes priority over --prompt-tokens\n    prompt_tokens_list = None\n    if prompt_audio:\n        logger.info(\"Loading codec model for audio encoding...\")\n        codec = load_codec_model(codec_checkpoint, device, precision)\n        prompt_tokens_list = [\n           
 encode_audio(p, codec, device).cpu() for p in prompt_audio\n        ]\n        logger.info(f\"Encoded {len(prompt_audio)} audio file(s) to VQ codes\")\n    elif prompt_tokens is not None:\n        prompt_tokens_list = [torch.from_numpy(np.load(p)) for p in prompt_tokens]\n\n    torch.manual_seed(seed)\n\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed(seed)\n\n    generator = generate_long(\n        model=model,\n        device=device,\n        decode_one_token=decode_one_token,\n        text=text,\n        num_samples=num_samples,\n        max_new_tokens=max_new_tokens,\n        top_p=top_p,\n        top_k=top_k,\n        temperature=temperature,\n        compile=compile,\n        iterative_prompt=iterative_prompt,\n        chunk_length=chunk_length,\n        prompt_text=list(prompt_text) if prompt_text else None,\n        prompt_tokens=prompt_tokens_list,\n    )\n\n    idx = 0\n    codes = []\n\n    for response in generator:\n        if response.action == \"sample\":\n            codes.append(response.codes)\n            logger.info(f\"Sampled text: {response.text}\")\n        elif response.action == \"next\":\n            if codes:\n                merged_codes = torch.cat(codes, dim=1)\n                codes_npy_path = os.path.join(output_dir, f\"codes_{idx}.npy\")\n                np.save(codes_npy_path, merged_codes.cpu().numpy())\n                logger.info(f\"Saved codes to {codes_npy_path}\")\n\n                # Decode to wav if --output is specified\n                if output:\n                    if codec is None:\n                        logger.info(\"Loading codec model for audio decoding...\")\n                        codec = load_codec_model(codec_checkpoint, device, precision)\n                    audio = decode_to_audio(merged_codes.to(device), codec)\n                    import soundfile as sf\n\n                    out_path = (\n                        str(output)\n                        if num_samples == 1\n                
        else str(output.with_stem(f\"{output.stem}_{idx}\"))\n                    )\n                    sf.write(out_path, audio.cpu().float().numpy(), codec.sample_rate)\n                    logger.info(f\"Saved audio to {out_path}\")\n\n            logger.info(f\"Next sample\")\n            codes = []\n            idx += 1\n        else:\n            logger.error(f\"Error: {response}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "fish_speech/models/text2semantic/lit_module.py",
    "content": "from typing import Any, Optional\n\nimport lightning as L\nimport torch\nimport torch.nn.functional as F\nfrom lightning.pytorch.utilities.types import OptimizerLRScheduler\n\nimport fish_speech.utils as utils\n\nCODEBOOK_PAD_TOKEN_ID = 0\nfrom fish_speech.models.text2semantic.llama import NaiveTransformer\n\nlog = utils.RankedLogger(__name__, rank_zero_only=True)\n\n\nclass TextToSemantic(L.LightningModule):\n    def __init__(\n        self,\n        model: NaiveTransformer,\n        optimizer: Any,\n        lr_scheduler: Any,\n    ):\n        super().__init__()\n\n        self.model = model\n        self.optimizer_builder = optimizer\n        self.lr_scheduler_builder = lr_scheduler\n\n    def forward(self, x):\n        return self.model(x)\n\n    def on_save_checkpoint(self, checkpoint):\n        # Save only LoRA parameters\n        state_dict = checkpoint[\"state_dict\"]\n        use_lora = any(\"lora\" in name for name in state_dict.keys())\n        if not use_lora:\n            return\n\n        for name in list(state_dict.keys()):\n            if \"lora\" not in name:\n                state_dict.pop(name)\n\n    def configure_optimizers(self) -> OptimizerLRScheduler:\n        # Get weight decay parameters\n        weight_decay_parameters, other_parameters = [], []\n        for name, param in self.named_parameters():\n            if \".bias\" in name or \"norm.weight\" in name or \".embeddings.\" in name:\n                other_parameters.append(param)\n            else:\n                weight_decay_parameters.append(param)\n\n        optimizer = self.optimizer_builder(\n            [\n                {\"params\": weight_decay_parameters},\n                {\"params\": other_parameters, \"weight_decay\": 0.0},\n            ]\n        )\n\n        # Print the parameters and their weight decay\n        for i in optimizer.param_groups:\n            log.info(\n                f\"Set weight decay: {i['weight_decay']} for {len(i['params'])} 
parameters\"\n            )\n\n        lr_scheduler = self.lr_scheduler_builder(optimizer)\n\n        return {\n            \"optimizer\": optimizer,\n            \"lr_scheduler\": {\n                \"scheduler\": lr_scheduler,\n                \"interval\": \"step\",\n            },\n        }\n\n    # Copied from https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90\n    def get_batch_logps(\n        self,\n        logits: torch.FloatTensor,\n        labels: torch.LongTensor,\n        average_log_prob: bool = False,\n    ) -> torch.FloatTensor:\n        \"\"\"Compute the log probabilities of the given labels under the given logits.\n\n        Args:\n            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, codebook_size, vocab_size)\n            labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length, codebook_size)\n            average_log_prob: If True, return the average log probability per (non-masked) token. 
Otherwise, return the sum of the log probabilities of the (non-masked) tokens.\n\n        Returns:\n            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.\n        \"\"\"\n        assert logits.shape[:-1] == labels.shape\n\n        labels = labels.clone()\n        loss_mask = labels != -100\n\n        # dummy token; we'll ignore the losses on these tokens later\n        labels[labels == -100] = 0\n\n        per_token_logps = torch.gather(\n            logits.log_softmax(-1), dim=-1, index=labels.unsqueeze(-1)\n        ).squeeze(-1)\n\n        if average_log_prob:\n            return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)\n        else:\n            return (per_token_logps * loss_mask).sum(-1)\n\n    def _step(self, batch, batch_idx, stage: str):\n        is_train = stage == \"train\"\n\n        if is_train:\n            # Key part to make lora work\n            # Otherwise the parameters are merged, which lead to incorrect gradients\n            self.model.train()\n\n        # Do positive and negative samples in the same batch to speed up training\n        labels = batch[\"labels\"]\n        outputs = self.model(\n            inp=batch[\"inputs\"],\n            key_padding_mask=batch[\"attention_masks\"],\n            labels=batch[\"labels\"],\n        )\n        token_logits = outputs.token_logits\n        codebook_logits = outputs.codebook_logits\n\n        # Generate labels\n        base_loss = F.cross_entropy(\n            token_logits.view(-1, token_logits.size(-1)),\n            labels[:, 0].reshape(-1),\n            ignore_index=-100,\n        )\n\n        token_ids = labels[:, 0]\n        semantic_mask = (token_ids >= self.model.tokenizer.semantic_begin_id) & (\n            token_ids <= self.model.tokenizer.semantic_end_id\n        )\n        all_codebook_labels = labels[:, 1 : 1 + self.model.config.num_codebooks]\n        all_codebook_labels_permuted = 
all_codebook_labels.permute(0, 2, 1)\n        filtered_codebook_labels = all_codebook_labels_permuted[semantic_mask]\n        semantic_loss = F.cross_entropy(\n            codebook_logits.reshape(-1, codebook_logits.size(-1)),\n            filtered_codebook_labels.reshape(-1),\n            ignore_index=-100,\n        )\n\n        loss = base_loss + semantic_loss\n\n        self.log(\n            f\"{stage}/loss\",\n            loss,\n            on_step=is_train,\n            on_epoch=not is_train,\n            prog_bar=True,\n            logger=True,\n            sync_dist=not is_train,\n        )\n\n        self.log(\n            f\"{stage}/base_loss\",\n            base_loss,\n            on_step=is_train,\n            on_epoch=not is_train,\n            prog_bar=False,\n            logger=True,\n            sync_dist=not is_train,\n        )\n\n        self.log(\n            f\"{stage}/semantic_loss\",\n            semantic_loss,\n            on_step=is_train,\n            on_epoch=not is_train,\n            prog_bar=False,\n            logger=True,\n            sync_dist=not is_train,\n        )\n\n        # Top-5 accuracy\n        accuracy = self.get_accuracy(codebook_logits, filtered_codebook_labels)\n        self.log(\n            f\"{stage}/top_5_accuracy\",\n            accuracy,\n            on_step=is_train,\n            on_epoch=not is_train,\n            prog_bar=True,\n            logger=True,\n            sync_dist=not is_train,\n        )\n\n        return loss\n\n    def get_accuracy(self, logits, labels):\n        mask = (labels != -100) & (labels != CODEBOOK_PAD_TOKEN_ID)\n        if mask.sum() == 0:\n            return torch.tensor(0.0, device=logits.device)\n\n        _, indices = logits.topk(5, dim=-1)\n        correct = indices.eq(labels.unsqueeze(-1))\n        correct[~mask] = 0\n        correct = correct.sum()\n        accuracy = correct / mask.sum()\n\n        return accuracy\n\n    def training_step(self, batch, batch_idx):\n        
return self._step(batch, batch_idx, \"train\")\n\n    def validation_step(self, batch, batch_idx):\n        return self._step(batch, batch_idx, \"val\")\n"
  },
  {
    "path": "fish_speech/models/text2semantic/llama.py",
    "content": "import dataclasses\nimport json\nimport math\nfrom collections import OrderedDict\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Optional\n\nimport torch\nimport torch.nn as nn\nfrom einops import rearrange\nfrom loguru import logger\nfrom torch import Tensor\nfrom torch.nn import functional as F\nfrom torch.nn.attention import SDPBackend, sdpa_kernel\nfrom torch.utils.checkpoint import checkpoint\n\nfrom fish_speech.models.text2semantic.lora import LoraConfig, setup_lora\n\n\ndef find_multiple(n: int, k: int) -> int:\n    if n % k == 0:\n        return n\n    return n + k - (n % k)\n\n\n@dataclass\nclass BaseModelArgs:\n    model_type: str = \"base\"\n\n    vocab_size: int = 32000\n    n_layer: int = 32\n    n_head: int = 32\n    dim: int = 4096\n    intermediate_size: int = None\n    n_local_heads: int = -1\n    head_dim: int = 64\n    rope_base: float = 10000\n    norm_eps: float = 1e-5\n    max_seq_len: int = 2048\n    dropout: float = 0.0\n    tie_word_embeddings: bool = True\n    attention_qkv_bias: bool = False\n    attention_o_bias: bool = False\n    attention_qk_norm: bool = False\n\n    # Codebook configs\n    codebook_size: int = 160\n    num_codebooks: int = 4\n\n    semantic_begin_id: int = 0\n    semantic_end_id: int = 0\n\n    # Gradient checkpointing\n    use_gradient_checkpointing: bool = True\n\n    # Initialize the model\n    initializer_range: float = 0.02\n\n    # Dummy vars\n    is_reward_model: bool = False\n    scale_codebook_embeddings: bool = False\n    audio_embed_dim: Optional[int] = None\n\n    def __post_init__(self):\n        if self.n_local_heads == -1:\n            self.n_local_heads = self.n_head\n        if self.intermediate_size is None:\n            hidden_dim = 4 * self.dim\n            n_hidden = int(2 * hidden_dim / 3)\n            self.intermediate_size = find_multiple(n_hidden, 256)\n        if self.head_dim is None:\n            self.head_dim = self.dim // self.n_head\n\n   
 @staticmethod\n    def from_pretrained(path: str):\n        path = Path(path)\n\n        if path.is_dir():\n            path = path / \"config.json\"\n\n        with open(path, \"r\", encoding=\"utf-8\") as f:\n            data = json.load(f)\n\n        match data[\"model_type\"]:\n            case \"naive\":\n                cls = NaiveModelArgs\n            case \"dual_ar\":\n                cls = DualARModelArgs\n            case \"fish_qwen3_omni\":\n                return BaseModelArgs._from_fish_qwen3_omni(data)\n            case _:\n                raise ValueError(f\"Unknown model type: {data['model_type']}\")\n\n        # Filter out unexpected keyword arguments\n        valid_keys = {f.name for f in dataclasses.fields(cls)}\n        data = {k: v for k, v in data.items() if k in valid_keys}\n\n        return cls(**data)\n\n    @staticmethod\n    def _from_fish_qwen3_omni(data: dict) -> \"DualARModelArgs\":\n        tc = data[\"text_config\"]\n        adc = data[\"audio_decoder_config\"]\n        flat = dict(\n            model_type=\"dual_ar\",\n            vocab_size=tc[\"vocab_size\"],\n            n_layer=tc[\"n_layer\"],\n            n_head=tc[\"n_head\"],\n            n_local_heads=tc.get(\"n_local_heads\", -1),\n            head_dim=tc.get(\"head_dim\"),\n            dim=tc[\"dim\"],\n            intermediate_size=tc.get(\"intermediate_size\"),\n            rope_base=tc.get(\"rope_base\", 10000),\n            norm_eps=tc.get(\"norm_eps\", 1e-5),\n            max_seq_len=tc.get(\"max_seq_len\", 2048),\n            dropout=tc.get(\"dropout\", 0.0),\n            tie_word_embeddings=tc.get(\"tie_word_embeddings\", True),\n            attention_qkv_bias=tc.get(\"attention_qkv_bias\", False),\n            attention_o_bias=tc.get(\"attention_o_bias\", False),\n            attention_qk_norm=tc.get(\"attention_qk_norm\", False),\n            use_gradient_checkpointing=tc.get(\"use_gradient_checkpointing\", True),\n            
initializer_range=tc.get(\"initializer_range\", 0.02),\n            semantic_begin_id=data.get(\"semantic_start_token_id\", 0),\n            semantic_end_id=data.get(\"semantic_end_token_id\", 0),\n            scale_codebook_embeddings=True,\n            norm_fastlayer_input=True,\n            audio_embed_dim=adc.get(\"text_dim\", tc[\"dim\"]),\n            codebook_size=adc[\"vocab_size\"],\n            num_codebooks=adc[\"num_codebooks\"],\n            n_fast_layer=adc[\"n_layer\"],\n            fast_dim=adc.get(\"dim\"),\n            fast_n_head=adc.get(\"n_head\"),\n            fast_n_local_heads=adc.get(\"n_local_heads\"),\n            fast_head_dim=adc.get(\"head_dim\"),\n            fast_intermediate_size=adc.get(\"intermediate_size\"),\n            fast_attention_qkv_bias=adc.get(\"attention_qkv_bias\"),\n            fast_attention_qk_norm=adc.get(\"attention_qk_norm\"),\n            fast_attention_o_bias=adc.get(\"attention_o_bias\"),\n        )\n        valid_keys = {f.name for f in dataclasses.fields(DualARModelArgs)}\n        flat = {k: v for k, v in flat.items() if k in valid_keys and v is not None}\n        return DualARModelArgs(**flat)\n\n    def save(self, path: str):\n        with open(path, \"w\") as f:\n            json.dump(self.__dict__, f, indent=4, sort_keys=True, ensure_ascii=False)\n\n\n@dataclass\nclass NaiveModelArgs(BaseModelArgs):\n    model_type: str = \"naive\"\n\n\n@dataclass\nclass DualARModelArgs(BaseModelArgs):\n    model_type: str = \"dual_ar\"\n    n_fast_layer: int = 4\n    fast_dim: int | None = None\n    fast_n_head: int | None = None\n    fast_n_local_heads: int | None = None\n    fast_head_dim: int | None = None\n    fast_intermediate_size: int | None = None\n    fast_attention_qkv_bias: bool | None = None\n    fast_attention_qk_norm: bool | None = None\n    fast_attention_o_bias: bool | None = None\n    norm_fastlayer_input: bool = False\n\n    def __post_init__(self):\n        super().__post_init__()\n\n        
self.fast_dim = self.fast_dim or self.dim\n        self.fast_n_head = self.fast_n_head or self.n_head\n        self.fast_n_local_heads = self.fast_n_local_heads or self.n_local_heads\n        self.fast_head_dim = self.fast_head_dim or self.head_dim\n        self.fast_intermediate_size = (\n            self.fast_intermediate_size or self.intermediate_size\n        )\n        self.fast_attention_qkv_bias = (\n            self.fast_attention_qkv_bias\n            if self.fast_attention_qkv_bias is not None\n            else self.attention_qkv_bias\n        )\n        self.fast_attention_qk_norm = (\n            self.fast_attention_qk_norm\n            if self.fast_attention_qk_norm is not None\n            else self.attention_qk_norm\n        )\n        self.fast_attention_o_bias = (\n            self.fast_attention_o_bias\n            if self.fast_attention_o_bias is not None\n            else self.attention_o_bias\n        )\n\n\nclass KVCache(nn.Module):\n    def __init__(\n        self, max_batch_size, max_seq_len, n_heads, head_dim, dtype=torch.bfloat16\n    ):\n        super().__init__()\n        cache_shape = (max_batch_size, n_heads, max_seq_len, head_dim)\n        self.register_buffer(\"k_cache\", torch.zeros(cache_shape, dtype=dtype))\n        self.register_buffer(\"v_cache\", torch.zeros(cache_shape, dtype=dtype))\n\n    def update(self, input_pos, k_val, v_val):\n        # input_pos: [S], k_val: [B, H, S, D]\n        assert input_pos.shape[0] == k_val.shape[2]\n\n        k_out = self.k_cache\n        v_out = self.v_cache\n        k_out[:, :, input_pos] = k_val\n        v_out[:, :, input_pos] = v_val\n\n        return k_out, v_out\n\n\n@dataclass\nclass TransformerForwardResult:\n    token_logits: Tensor\n    codebook_logits: Tensor\n\n\n@dataclass\nclass BaseTransformerForwardResult:\n    logits: Tensor\n    hidden_states: Tensor\n\n\ndef _remap_fish_qwen3_omni_keys(weights: OrderedDict) -> OrderedDict:\n    if not any(k.startswith((\"text_model.\", 
\"audio_decoder.\")) for k in weights):\n        return weights\n    new_weights = OrderedDict()\n    for k, v in weights.items():\n        if k.startswith(\"text_model.model.\"):\n            new_key = k[len(\"text_model.model.\") :]\n        elif k.startswith(\"audio_decoder.\"):\n            suffix = k[len(\"audio_decoder.\") :]\n            new_key = (\n                suffix\n                if suffix.startswith(\"codebook_embeddings.\")\n                else \"fast_\" + suffix\n            )\n        else:\n            new_key = k\n        new_weights[new_key] = v\n    return new_weights\n\n\nclass BaseTransformer(nn.Module):\n    def __init__(\n        self,\n        config: BaseModelArgs,\n        init_weights: bool = True,\n    ) -> None:\n        super().__init__()\n        self.config = config\n\n        # Slow transformer\n        self.embeddings = nn.Embedding(\n            config.vocab_size,\n            config.dim,\n        )\n        self.codebook_embeddings = nn.Embedding(\n            config.codebook_size * config.num_codebooks,\n            config.dim,\n        )\n        self.layers = nn.ModuleList(\n            TransformerBlock(config, use_sdpa=True) for _ in range(config.n_layer)\n        )\n        self.norm = RMSNorm(config.dim, eps=config.norm_eps)\n\n        if self.config.tie_word_embeddings is False:\n            self.output = nn.Linear(\n                config.dim,\n                config.vocab_size,\n                bias=False,\n            )\n\n        self.register_buffer(\n            \"freqs_cis\",\n            precompute_freqs_cis(\n                config.max_seq_len,\n                config.head_dim,\n                config.rope_base,\n            ),\n            persistent=False,\n        )\n        self.register_buffer(\n            \"causal_mask\",\n            torch.tril(\n                torch.ones(\n                    config.max_seq_len,\n                    config.max_seq_len,\n                    dtype=torch.bool,\n      
          )\n            ),\n            persistent=False,\n        )\n\n        # For kv cache\n        self.max_batch_size = -1\n        self.max_seq_len = -1\n\n        if init_weights:\n            self.apply(self._init_weights)\n\n    def setup_caches(\n        self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16\n    ):\n        if self.max_seq_len >= max_seq_len and self.max_batch_size >= max_batch_size:\n            return\n\n        max_seq_len = find_multiple(max_seq_len, 8)\n        self.max_seq_len = max_seq_len\n        self.max_batch_size = max_batch_size\n\n        for b in self.layers:\n            b.attention.kv_cache = KVCache(\n                max_batch_size,\n                max_seq_len,\n                self.config.n_local_heads,\n                self.config.head_dim,\n                dtype=dtype,\n            )\n\n    def embed(self, inp: Tensor) -> Tensor:\n        embeds = []\n\n        for i in range(self.config.num_codebooks):\n            emb = self.codebook_embeddings(\n                inp[:, i + 1] + i * self.config.codebook_size\n            )\n            embeds.append(emb)\n\n        vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)\n\n        is_semantic = (inp[:, 0] >= self.config.semantic_begin_id) & (\n            inp[:, 0] <= self.config.semantic_end_id\n        )\n\n        vq_embeds_sum[~is_semantic] = 0\n\n        x = self.embeddings(inp[:, 0]) + vq_embeds_sum\n\n        return x\n\n    def forward(\n        self,\n        inp: Tensor,\n        key_padding_mask: Optional[Tensor] = None,\n    ) -> BaseTransformerForwardResult:\n        seq_len = inp.size(2)\n\n        # Here we want to merge the embeddings of the codebooks\n        x = self.embed(inp)\n\n        freqs_cis = self.freqs_cis[:seq_len]\n\n        mask = None\n        if key_padding_mask is not None:\n            causal = self.causal_mask[:seq_len, :seq_len]\n            causal = rearrange(causal, \"q k -> 1 1 q k\")\n\n            
atten_mask = rearrange(key_padding_mask, \"b s -> b 1 1 s\")\n            atten_mask = atten_mask.logical_not()\n            mask = causal & atten_mask\n\n        for layer in self.layers:\n            if self.config.use_gradient_checkpointing and self.training:\n                x = checkpoint(layer, x, freqs_cis, mask, use_reentrant=True)\n            else:\n                x = layer(x, freqs_cis, mask)\n\n        slow_out = self.norm(x)\n\n        if self.config.tie_word_embeddings:\n            token_logits = F.linear(slow_out, self.embeddings.weight)\n        else:\n            token_logits = self.output(slow_out)\n\n        hidden_out = (\n            slow_out if getattr(self.config, \"norm_fastlayer_input\", False) else x\n        )\n\n        return BaseTransformerForwardResult(\n            logits=token_logits,\n            hidden_states=hidden_out,\n        )\n\n    def forward_generate(\n        self,\n        inp: Tensor,\n        input_pos: Optional[Tensor] = None,\n        audio_masks: Optional[Tensor] = None,\n        audio_parts: Optional[Tensor] = None,\n        return_all: bool = False,\n    ) -> BaseTransformerForwardResult:\n\n        # Embedding logic replicated from embed() for compilation compatibility\n        embeds = []\n        for i in range(self.config.num_codebooks):\n            emb = self.codebook_embeddings(\n                inp[:, i + 1] + i * self.config.codebook_size\n            )\n            embeds.append(emb)\n\n        vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)\n\n        vq_masks = (inp[:, 0] >= self.config.semantic_begin_id) & (\n            inp[:, 0] <= self.config.semantic_end_id\n        )\n\n        vq_embeds_sum[~vq_masks] = 0\n        x = self.embeddings(inp[:, 0]) + vq_embeds_sum\n\n        if self.config.scale_codebook_embeddings:\n            vq_masks_expanded = vq_masks.unsqueeze(-1).expand_as(x)\n            x = torch.where(\n                vq_masks_expanded, x / math.sqrt(self.config.num_codebooks + 
1), x\n            )\n\n        # Audio embeddings\n        if audio_parts is not None:\n            # Note: This assumes self.audio_projector exists if audio_parts is used\n            # It seems missing in init, but we keep existing logic\n            if hasattr(self, \"audio_projector\"):\n                audio_embeds = self.audio_projector(audio_parts)\n                if self.config.scale_codebook_embeddings:\n                    x[audio_masks] = audio_embeds / math.sqrt(2)\n                else:\n                    x[audio_masks] = audio_embeds\n            else:\n                logger.warning(\"audio_parts provided but model has no audio_projector\")\n\n        if input_pos is None:\n            input_pos = torch.arange(inp.shape[-1], device=x.device)\n            max_seq_len = inp.shape[-1]\n        else:\n            max_seq_len = self.max_seq_len\n\n        mask = self.causal_mask[None, None, input_pos, :max_seq_len]  # (B, N, Q, K)\n        freqs_cis = self.freqs_cis[input_pos]\n\n        for layer in self.layers:\n            x = layer(x, freqs_cis, mask, input_pos=input_pos)\n\n        if x.size(1) > 1 and not return_all:\n            x = x[:, -1:]\n\n        slow_out = self.norm(x)\n\n        if self.config.is_reward_model:\n            token_logits = self.score_output(slow_out)\n        elif self.config.tie_word_embeddings:\n            token_logits = F.linear(slow_out, self.embeddings.weight)\n        else:\n            token_logits = self.output(slow_out)\n\n        hidden_out = (\n            slow_out if getattr(self.config, \"norm_fastlayer_input\", False) else x\n        )\n\n        return BaseTransformerForwardResult(\n            logits=token_logits,\n            hidden_states=hidden_out,\n        )\n\n    def _init_weights(self, module):\n        std = self.config.initializer_range\n        if isinstance(module, nn.Linear):\n            module.weight.data.normal_(mean=0.0, std=std)\n            if module.bias is not None:\n                
module.bias.data.zero_()\n        elif isinstance(module, nn.Embedding):\n            module.weight.data.normal_(mean=0.0, std=std)\n            if module.padding_idx is not None:\n                module.weight.data[module.padding_idx].zero_()\n\n    @staticmethod\n    def from_pretrained(\n        path: str,\n        load_weights: bool = False,\n        max_length: int | None = None,\n        lora_config: LoraConfig | None = None,\n        rope_base: int | None = None,\n    ) -> \"BaseTransformer\":\n        # Import wrapper locally to avoid circular dependency or global import issues\n        from fish_speech.tokenizer import FishTokenizer\n\n        config = BaseModelArgs.from_pretrained(str(path))\n        if max_length is not None:\n            config.max_seq_len = max_length\n            logger.info(f\"Override max_seq_len to {max_length}\")\n\n        if rope_base is not None:\n            config.rope_base = rope_base\n            logger.info(f\"Override rope_base to {rope_base}\")\n\n        try:\n            tokenizer = FishTokenizer.from_pretrained(path)\n            config.semantic_begin_id = tokenizer.semantic_begin_id\n            config.semantic_end_id = tokenizer.semantic_end_id\n            logger.info(\n                f\"Injected Semantic IDs into Config: {config.semantic_begin_id}-{config.semantic_end_id}\"\n            )\n        except Exception as e:\n            logger.warning(\n                f\"Failed to load tokenizer for config injection: {e}. 
Semantic IDs might be 0.\"\n            )\n\n        match config.model_type:\n            case \"naive\":\n                model_cls = NaiveTransformer\n            case \"dual_ar\":\n                model_cls = DualARTransformer\n            case _:\n                raise ValueError(f\"Unknown model type: {config.model_type}\")\n\n        logger.info(f\"Loading model from {path}, config: {config}\")\n        # Initialize model without passing tokenizer explicitly to __init__\n        model = model_cls(config)\n        # Attach tokenizer to model instance for inference convenience (optional, but good for user scripts)\n        model.tokenizer = tokenizer\n\n        if load_weights is False:\n            logger.info(\"Randomly initialized model\")\n        else:\n            if \"int8\" in str(Path(path)):\n                logger.info(\"Using int8 weight-only quantization!\")\n                from tools.llama.quantize import WeightOnlyInt8QuantHandler\n\n                simple_quantizer = WeightOnlyInt8QuantHandler(model)\n                model = simple_quantizer.convert_for_runtime()\n\n            if \"int4\" in str(Path(path)):\n                logger.info(\"Using int4 quantization!\")\n                path_comps = path.name.split(\"-\")\n                assert path_comps[-2].startswith(\"g\")\n                groupsize = int(path_comps[-2][1:])\n                from tools.llama.quantize import WeightOnlyInt4QuantHandler\n\n                simple_quantizer = WeightOnlyInt4QuantHandler(model, groupsize)\n                model = simple_quantizer.convert_for_runtime()\n\n            path_obj = Path(path)\n            index_json = path_obj / \"model.safetensors.index.json\"\n            single_st = path_obj / \"model.safetensors\"\n            pth_file = path_obj / \"model.pth\"\n\n            if index_json.exists():\n                logger.info(\"Loading sharded safetensors weights\")\n                from safetensors.torch import load_file as st_load_file\n\n      
          with open(index_json) as f:\n                    st_index = json.load(f)\n                shard_files = sorted(set(st_index[\"weight_map\"].values()))\n                weights = OrderedDict()\n                for shard in shard_files:\n                    weights.update(st_load_file(str(path_obj / shard), device=\"cpu\"))\n                weights = _remap_fish_qwen3_omni_keys(weights)\n            elif single_st.exists():\n                logger.info(\"Loading single safetensors weights\")\n                from safetensors.torch import load_file as st_load_file\n\n                weights = OrderedDict(st_load_file(str(single_st), device=\"cpu\"))\n                weights = _remap_fish_qwen3_omni_keys(weights)\n            elif pth_file.exists():\n                weights = torch.load(\n                    pth_file,\n                    map_location=\"cpu\",\n                    mmap=True,\n                    weights_only=True,\n                )\n                if \"state_dict\" in weights:\n                    weights = weights[\"state_dict\"]\n                if weights and next(iter(weights.keys())).startswith(\"model.\"):\n                    weights = OrderedDict(\n                        (k.replace(\"model.\", \"\"), v) for k, v in weights.items()\n                    )\n                for k in list(weights.keys()):\n                    if \"audio_\" in k:\n                        weights.pop(k)\n            else:\n                raise FileNotFoundError(f\"No model weights found in {path_obj}\")\n\n            err = model.load_state_dict(weights, strict=False, assign=True)\n            logger.info(f\"Model weights loaded - Status: {err}\")\n\n        if lora_config is not None:\n            setup_lora(model, lora_config)\n            logger.info(f\"LoRA setup: {lora_config}\")\n\n        return model\n\n    def save_pretrained(self, path: str, drop_lora: bool = False):\n        path = Path(path)\n        path.mkdir(parents=True, 
exist_ok=True)\n\n        self.config.save(path / \"config.json\")\n        state_dict = self.state_dict()\n\n        if drop_lora:\n            for key in list(state_dict.keys()):\n                if \"lora\" not in key:\n                    continue\n                state_dict.pop(key)\n\n        torch.save(state_dict, path / \"model.pth\")\n        if hasattr(self, \"tokenizer\"):\n            self.tokenizer.save_pretrained(path)\n\n\nclass NaiveTransformer(BaseTransformer):\n    def __init__(self, config: NaiveModelArgs) -> None:\n        super().__init__(config, init_weights=False)\n\n        self.codebook_norm = RMSNorm(config.dim, eps=config.norm_eps)\n        self.codebook_output = nn.Linear(\n            config.dim,\n            config.codebook_size * config.num_codebooks,\n            bias=False,\n        )\n\n        self.apply(self._init_weights)\n\n    def decode(self, result: BaseTransformerForwardResult) -> TransformerForwardResult:\n        token_logits = result.logits\n        x = result.hidden_states\n\n        # Codebook\n        codebook_logits = self.codebook_output(self.codebook_norm(x))\n        codebook_logits = rearrange(\n            codebook_logits, \"b n (c d) -> b n c d\", c=self.config.num_codebooks\n        )\n\n        return TransformerForwardResult(\n            token_logits=token_logits,\n            codebook_logits=codebook_logits,\n        )\n\n    def forward(\n        self,\n        inp: Tensor,\n        key_padding_mask: Optional[Tensor] = None,\n    ) -> TransformerForwardResult:\n        result = super().forward(\n            inp=inp,\n            key_padding_mask=key_padding_mask,\n        )\n        return self.decode(result)\n\n    def forward_generate(\n        self, x: Tensor, input_pos: Optional[Tensor] = None\n    ) -> TransformerForwardResult:\n        result = super().forward_generate(x, input_pos)\n        return self.decode(result)\n\n\nclass DualARTransformer(BaseTransformer):\n    def __init__(self, config: 
NaiveModelArgs) -> None:\n        super().__init__(config, init_weights=False)\n\n        # Project to fast dim if needed\n        if config.fast_dim is not None and config.fast_dim != config.dim:\n            self.fast_project_in = nn.Linear(config.dim, config.fast_dim)\n        else:\n            self.fast_project_in = nn.Identity()\n\n        # Fast transformer\n        self.fast_embeddings = nn.Embedding(config.codebook_size, config.fast_dim)\n\n        # The equivalent bs is so large that sdpa doesn't work\n        override_config = dataclasses.replace(\n            config,\n            dim=config.fast_dim,\n            n_head=config.fast_n_head,\n            n_local_heads=config.fast_n_local_heads,\n            head_dim=config.fast_head_dim,\n            intermediate_size=config.fast_intermediate_size,\n            attention_qkv_bias=config.fast_attention_qkv_bias,\n            attention_qk_norm=config.fast_attention_qk_norm,\n            attention_o_bias=config.fast_attention_o_bias,\n        )\n\n        self.fast_layers = nn.ModuleList(\n            TransformerBlock(override_config, use_sdpa=False)\n            for _ in range(config.n_fast_layer)\n        )\n        self.fast_norm = RMSNorm(config.fast_dim, eps=config.norm_eps)\n        self.fast_output = nn.Linear(\n            config.fast_dim,\n            config.codebook_size,\n            bias=False,\n        )\n\n        self.register_buffer(\n            \"fast_freqs_cis\",\n            precompute_freqs_cis(\n                config.num_codebooks,\n                config.fast_head_dim,\n                config.rope_base,\n            ),\n            persistent=False,\n        )\n        self.apply(self._init_weights)\n\n    def setup_caches(\n        self, max_batch_size: int, max_seq_len: int, dtype: torch.dtype = torch.bfloat16\n    ):\n        super().setup_caches(max_batch_size, max_seq_len, dtype)\n\n        # Fast transformer\n        # The max seq len here is the number of codebooks\n        for 
b in self.fast_layers:\n            b.attention.kv_cache = KVCache(\n                max_batch_size,\n                self.config.num_codebooks,\n                self.config.fast_n_local_heads,\n                self.config.fast_head_dim,\n                dtype=dtype,\n            )\n\n    def forward(\n        self,\n        inp: Tensor,\n        labels: Optional[Tensor] = None,\n        key_padding_mask: Optional[Tensor] = None,\n        vq_parts: Optional[Tensor] = None,\n        vq_masks: Optional[Tensor] = None,\n        vq_require_losses: Optional[Tensor] = None,\n        mel_parts: Optional[Tensor] = None,\n        mel_masks: Optional[Tensor] = None,\n    ) -> TransformerForwardResult:\n        parent_result = super().forward(\n            inp=inp,\n            key_padding_mask=key_padding_mask,\n        )\n        token_logits = parent_result.logits\n        x = parent_result.hidden_states\n\n        # Fast transformer\n        fast_seq_len = self.config.num_codebooks\n        fast_mask = self.causal_mask[\n            None, None, :fast_seq_len, :fast_seq_len\n        ]  # (B, N, Q, K)\n        fast_freqs_cis = self.fast_freqs_cis[:fast_seq_len]\n\n        # Extract corresponding parts with labels\n        token_labels = labels[:, 0]\n\n        # [MODIFIED] Use config instead of tokenizer\n        codebook_mask = (token_labels >= self.config.semantic_begin_id) & (\n            token_labels <= self.config.semantic_end_id\n        )\n\n        # This gives where input token is <|semantic|>\n        x = x[codebook_mask]\n\n        if x.shape[0] == 0:\n            # Use dummy input when no vq is required\n            x = torch.zeros(\n                (4, self.config.dim),\n                device=x.device,\n                dtype=x.dtype,\n            )\n            codebooks = torch.zeros(\n                (x.shape[0], self.config.num_codebooks - 1),\n                device=x.device,\n                dtype=torch.int,\n            )\n        else:\n            
all_codebooks = labels[:, 1:, :]\n            all_codebooks_permuted = all_codebooks.permute(0, 2, 1)\n            semantic_codebooks = all_codebooks_permuted[codebook_mask]\n            codebooks = semantic_codebooks[:, :-1]\n\n        x = self.fast_project_in(x)\n        codebook_embeddings = self.fast_embeddings(codebooks)\n        x = torch.cat([x[:, None], codebook_embeddings], dim=1)\n\n        for layer in self.fast_layers:\n            if self.config.use_gradient_checkpointing and self.training:\n                x = checkpoint(layer, x, fast_freqs_cis, fast_mask, use_reentrant=True)\n            else:\n                x = layer(x, fast_freqs_cis, fast_mask)\n\n        # unflatten the batch and num_codebooks\n        fast_out = self.fast_norm(x)\n        codebook_logits = self.fast_output(fast_out)\n\n        assert codebook_logits.shape[1] == self.config.num_codebooks\n\n        return TransformerForwardResult(\n            token_logits=token_logits,\n            codebook_logits=codebook_logits,\n        )\n\n    def forward_generate_fast(\n        self, x: Tensor, input_pos: Optional[Tensor] = None\n    ) -> Tensor:\n        # Fast transformer\n        x = x.view(x.shape[0], 1, -1)\n\n        fast_mask = self.causal_mask[\n            None, None, input_pos, : self.config.num_codebooks\n        ]  # (B, N, Q, K)\n        fast_freqs_cis = self.fast_freqs_cis[input_pos]\n\n        for layer in self.fast_layers:\n            x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)\n\n        # unflatten the batch and num_codebooks\n        fast_out = self.fast_norm(x)  # only take the last token\n        codebook_logits = self.fast_output(fast_out)\n\n        return codebook_logits\n\n    def forward_generate(\n        self,\n        x: Tensor,\n        input_pos: Optional[Tensor] = None,\n        audio_masks: Optional[Tensor] = None,\n        audio_parts: Optional[Tensor] = None,\n    ) -> TransformerForwardResult:\n        x = super().forward_generate(x, 
input_pos, audio_masks, audio_parts)\n        x.hidden_states = self.fast_project_in(x.hidden_states)\n        return x\n\n\nclass TransformerBlock(nn.Module):\n    def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:\n        super().__init__()\n        self.attention = Attention(config, use_sdpa=use_sdpa)\n        self.feed_forward = FeedForward(config)\n        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)\n        self.attention_norm = RMSNorm(config.dim, config.norm_eps)\n\n    def forward(\n        self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Tensor = None\n    ) -> Tensor:\n        h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)\n        out = h + self.feed_forward(self.ffn_norm(h))\n        return out\n\n\nclass Attention(nn.Module):\n    def __init__(self, config: BaseModelArgs, use_sdpa: bool = True):\n        super().__init__()\n        assert config.dim % config.n_head == 0\n\n        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim\n        # key, query, value projections for all heads, but in a batch\n        self.wqkv = nn.Linear(\n            config.dim, total_head_dim, bias=config.attention_qkv_bias\n        )\n        self.wo = nn.Linear(\n            config.n_head * config.head_dim, config.dim, bias=config.attention_o_bias\n        )\n        self.kv_cache = None\n\n        if config.attention_qk_norm:\n            self.q_norm = nn.RMSNorm(config.head_dim, config.norm_eps)\n            self.k_norm = nn.RMSNorm(config.head_dim, config.norm_eps)\n\n        self.dropout = config.dropout\n        self.n_head = config.n_head\n        self.head_dim = config.head_dim\n        self.n_local_heads = config.n_local_heads\n        self.dim = config.dim\n        self.use_sdpa = use_sdpa\n        self.attention_qk_norm = config.attention_qk_norm\n        self.config = config\n\n        self._register_load_state_dict_pre_hook(self.load_hook)\n\n    def 
load_hook(self, state_dict, prefix, *args):\n        if prefix + \"wq.weight\" in state_dict:\n            wq = state_dict.pop(prefix + \"wq.weight\")\n            wk = state_dict.pop(prefix + \"wk.weight\")\n            wv = state_dict.pop(prefix + \"wv.weight\")\n            state_dict[prefix + \"wqkv.weight\"] = torch.cat([wq, wk, wv])\n\n    def forward(\n        self,\n        x: Tensor,\n        freqs_cis: Tensor,\n        mask: Tensor,\n        input_pos: Optional[Tensor] = None,\n    ) -> Tensor:\n        bsz, seqlen, _ = x.shape\n\n        q_size = self.n_head * self.head_dim\n        kv_size = self.n_local_heads * self.head_dim\n        q, k, v = self.wqkv(x).split([q_size, kv_size, kv_size], dim=-1)\n\n        q = q.view(bsz, seqlen, self.n_head, self.head_dim)\n        k = k.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n        v = v.view(bsz, seqlen, self.n_local_heads, self.head_dim)\n\n        if self.attention_qk_norm:\n            q = self.q_norm(q)\n            k = self.k_norm(k)\n\n        q = apply_rotary_emb(q, freqs_cis)\n        k = apply_rotary_emb(k, freqs_cis)\n\n        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))\n\n        if self.kv_cache is not None:\n            k, v = self.kv_cache.update(input_pos, k, v)\n\n        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)\n        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)\n\n        if self.use_sdpa:\n            if mask is None:\n                with sdpa_kernel(SDPBackend.FLASH_ATTENTION):\n                    y = F.scaled_dot_product_attention(\n                        q,\n                        k,\n                        v,\n                        dropout_p=self.dropout if self.training else 0.0,\n                        is_causal=True,\n                        # No third party attn_mask here to use flash_attention\n                    )\n            else:\n                y = F.scaled_dot_product_attention(\n                  
  q,\n                    k,\n                    v,\n                    attn_mask=mask,\n                    dropout_p=self.dropout if self.training else 0.0,\n                )\n        else:\n            y = self.eq_scaled_dot_product_attention(\n                q,\n                k,\n                v,\n                attn_mask=mask,\n                dropout_p=self.dropout if self.training else 0.0,\n            )\n\n        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, q_size)\n\n        return self.wo(y)\n\n    def eq_scaled_dot_product_attention(\n        self,\n        query,\n        key,\n        value,\n        attn_mask=None,\n        dropout_p=0.0,\n    ) -> torch.Tensor:\n        # This is a standard scaled dot product attention\n        # It's low efficient, but it doesn't raise cuda error\n\n        L, S = query.size(-2), key.size(-2)\n        scale_factor = 1 / math.sqrt(query.size(-1))\n        attn_bias = torch.zeros(1, 1, L, S, dtype=query.dtype, device=query.device)\n\n        if attn_mask is not None:\n            if attn_mask.dtype == torch.bool:\n                attn_bias = torch.where(\n                    attn_mask.logical_not(), float(\"-inf\"), attn_bias\n                )\n            else:\n                attn_bias = attn_bias + attn_mask\n\n        attn_weight = query @ key.transpose(-2, -1) * scale_factor\n        attn_weight += attn_bias\n        attn_weight = torch.softmax(attn_weight, dim=-1)\n        attn_weight = torch.dropout(attn_weight, dropout_p, train=True)\n\n        return attn_weight @ value\n\n\nclass FeedForward(nn.Module):\n    def __init__(self, config: BaseModelArgs) -> None:\n        super().__init__()\n        self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)\n        self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)\n        self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)\n\n    def forward(self, x: Tensor) -> Tensor:\n        return 
self.w2(F.silu(self.w1(x)) * self.w3(x))\n\n\nclass RMSNorm(nn.Module):\n    def __init__(self, dim: int, eps: float = 1e-5):\n        super().__init__()\n        self.eps = eps\n        self.weight = nn.Parameter(torch.ones(dim))\n\n    def _norm(self, x):\n        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)\n\n    def forward(self, x: Tensor) -> Tensor:\n        output = self._norm(x.float()).type_as(x)\n        return output * self.weight\n\n\ndef precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000) -> Tensor:\n    \"\"\"\n    Precomputes frequency tensors for complex exponentials (cis)\n\n    Args:\n        seq_len: Length of the sequence for which positional embeddings are needed.\n        n_elem: Number of elements in the frequency tensor.\n        base: Base value for the frequency scaling (default: 10000).\n\n    Returns:\n        A tensor containing the precomputed frequencies in real and imaginary parts (bfloat16).\n    \"\"\"\n    freqs = 1.0 / (\n        base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)\n    )\n    t = torch.arange(seq_len, device=freqs.device)\n    freqs = torch.outer(t, freqs)\n    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)\n    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)\n    return cache.to(dtype=torch.bfloat16)\n\n\ndef apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:\n    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)\n    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)\n    x_out2 = torch.stack(\n        [\n            xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],\n            xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],\n        ],\n        -1,\n    )\n\n    x_out2 = x_out2.flatten(3)\n    return x_out2.type_as(x)\n"
  },
  {
    "path": "fish_speech/models/text2semantic/lora.py",
    "content": "from dataclasses import dataclass\n\nimport loralib as lora\n\n\n@dataclass\nclass LoraConfig:\n    r: int\n    lora_alpha: float\n    lora_dropout: float = 0.0\n\n\ndef _replace_embedding(old_embed, lora_config):\n    new_embed = lora.Embedding(\n        num_embeddings=old_embed.num_embeddings,\n        embedding_dim=old_embed.embedding_dim,\n        padding_idx=old_embed.padding_idx,\n        r=lora_config.r,\n        lora_alpha=lora_config.lora_alpha,\n    )\n    new_embed.weight.data.copy_(old_embed.weight.data)\n    return new_embed\n\n\ndef setup_lora(model, lora_config):\n    # Replace the embedding layer with a LoRA layer, preserving pretrained weights\n    model.embeddings = _replace_embedding(model.embeddings, lora_config)\n    model.codebook_embeddings = _replace_embedding(\n        model.codebook_embeddings, lora_config\n    )\n\n    # Replace output layer with a LoRA layer\n    linears = [(model, \"output\")]\n\n    # Replace all linear layers with LoRA layers\n    for layer in model.layers:\n        linears.extend([(layer.attention, \"wqkv\"), (layer.attention, \"wo\")])\n        linears.extend(\n            [\n                (layer.feed_forward, \"w1\"),\n                (layer.feed_forward, \"w2\"),\n                (layer.feed_forward, \"w3\"),\n            ]\n        )\n\n    if hasattr(model, \"fast_layers\"):\n        model.fast_embeddings = _replace_embedding(model.fast_embeddings, lora_config)\n\n        # Dual-AR model\n        linears.append((model, \"fast_output\"))\n\n        for layer in model.fast_layers:\n            linears.extend([(layer.attention, \"wqkv\"), (layer.attention, \"wo\")])\n            linears.extend(\n                [\n                    (layer.feed_forward, \"w1\"),\n                    (layer.feed_forward, \"w2\"),\n                    (layer.feed_forward, \"w3\"),\n                ]\n            )\n\n    for module, layer_name in linears:\n        old_linear = getattr(module, layer_name)\n        
updated_linear = lora.Linear(\n            in_features=old_linear.in_features,\n            out_features=old_linear.out_features,\n            bias=old_linear.bias is not None,\n            r=lora_config.r,\n            lora_alpha=lora_config.lora_alpha,\n            lora_dropout=lora_config.lora_dropout,\n        )\n        updated_linear.weight.data.copy_(old_linear.weight.data)\n        if old_linear.bias is not None:\n            updated_linear.bias.data.copy_(old_linear.bias.data)\n        setattr(module, layer_name, updated_linear)\n\n    # Mark only the LoRA layers as trainable\n    lora.mark_only_lora_as_trainable(model, bias=\"none\")\n\n\ndef get_merged_state_dict(model):\n    # Switching to eval mode is what makes loralib fold the LoRA weights into the\n    # base weights (assuming the layers use loralib's default merge_weights=True)\n    model.eval()\n\n    # Then we need to remove the LoRA parameters from the state dict\n    state_dict = model.state_dict()\n    for name in list(state_dict.keys()):\n        if \"lora\" in name:\n            state_dict.pop(name)\n\n    return state_dict\n"
  },
  {
    "path": "fish_speech/scheduler.py",
    "content": "import math\n\n\ndef get_cosine_schedule_with_warmup_lr_lambda(\n    current_step: int,\n    *,\n    num_warmup_steps: int | float,\n    num_training_steps: int,\n    num_cycles: float = 0.5,\n    final_lr_ratio: float = 0.0,\n):\n    if 0 < num_warmup_steps < 1:  # float mode\n        num_warmup_steps = int(num_warmup_steps * num_training_steps)\n\n    if current_step < num_warmup_steps:\n        return float(current_step) / float(max(1, num_warmup_steps))\n\n    progress = float(current_step - num_warmup_steps) / float(\n        max(1, num_training_steps - num_warmup_steps)\n    )\n\n    return max(\n        final_lr_ratio,\n        0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),\n    )\n\n\ndef get_constant_schedule_with_warmup_lr_lambda(\n    current_step: int,\n    *,\n    num_warmup_steps: int | float,\n    num_training_steps: int | None = None,\n):\n    if 0 < num_warmup_steps < 1:  # float mode\n        num_warmup_steps = int(num_warmup_steps * num_training_steps)\n\n    if current_step < num_warmup_steps:\n        return float(current_step) / float(max(1, num_warmup_steps))\n\n    return 1.0\n"
  },
  {
    "path": "fish_speech/text/__init__.py",
    "content": "from .clean import clean_text\n\n__all__ = [\"clean_text\"]\n"
  },
  {
    "path": "fish_speech/text/clean.py",
    "content": "import re\n\nSYMBOLS_MAPPING = {\n    \"‘\": \"'\",\n    \"’\": \"'\",\n}\n\nREPLACE_SYMBOL_REGEX = re.compile(\n    \"|\".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())\n)\n\n\nEMOJI_REGEX = re.compile(\n    \"[\"\n    \"\\U0001f600-\\U0001f64f\"  # emoticons\n    \"\\U0001f300-\\U0001f5ff\"  # symbols & pictographs\n    \"\\U0001f680-\\U0001f6ff\"  # transport & map symbols\n    \"\\U0001f1e0-\\U0001f1ff\"  # flags (iOS)\n    \"]+\",\n    flags=re.UNICODE,\n)\n\n\ndef clean_text(text):\n    # Clean the text\n    text = text.strip()\n\n    # Replace all chinese symbols with their english counterparts\n    text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)\n\n    # Remove emojis\n    text = EMOJI_REGEX.sub(r\"\", text)\n\n    # Remove continuous periods (...) and commas (,,,)\n    text = re.sub(r\"[,]{2,}\", lambda m: m.group()[0], text)\n\n    return text\n"
  },
  {
    "path": "fish_speech/tokenizer.py",
    "content": "import json\nimport logging\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, List, Union\n\nimport torch\nfrom transformers import AutoTokenizer\n\nif TYPE_CHECKING:\n    from transformers import PreTrainedTokenizerFast\n\nlogger = logging.getLogger(__name__)\n\n# Constants definitions\nEOS_TOKEN = \"<|endoftext|>\"\nPAD_TOKEN = \"<|pad|>\"\nIM_START_TOKEN = \"<|im_start|>\"\nIM_END_TOKEN = \"<|im_end|>\"\nPHONEME_START_TOKEN = \"<|phoneme_start|>\"\nPHONEME_END_TOKEN = \"<|phoneme_end|>\"\n\nMODALITY_TEXT_TOKEN = \"<|text|>\"\nMODALITY_VOICE_TOKEN = \"<|voice|>\"\nMODALITY_INTERLEAVE_TOKEN = \"<|interleave|>\"\nAUDIO_START_TOKEN = \"<|audio_start|>\"\nAUDIO_END_TOKEN = \"<|audio_end|>\"\nAUDIO_EMBED_TOKEN = \"<|audio_pad|>\"\n\nMODALITY_TOKENS = {\n    \"text\": MODALITY_TEXT_TOKEN,\n    \"voice\": MODALITY_VOICE_TOKEN,\n    \"interleave\": MODALITY_INTERLEAVE_TOKEN,\n}\n\nSEMANTIC_TOKEN_TEMPLATE = \"<|semantic:{i}|>\"\nSEMANTIC_TOKENS = [SEMANTIC_TOKEN_TEMPLATE.format(i=i) for i in range(4096)]\n\nALL_SPECIAL_TOKENS = [\n    EOS_TOKEN,\n    PAD_TOKEN,\n    IM_START_TOKEN,\n    IM_END_TOKEN,\n    PHONEME_START_TOKEN,\n    PHONEME_END_TOKEN,\n    MODALITY_TEXT_TOKEN,\n    MODALITY_VOICE_TOKEN,\n    MODALITY_INTERLEAVE_TOKEN,\n    AUDIO_START_TOKEN,\n    AUDIO_END_TOKEN,\n    AUDIO_EMBED_TOKEN,\n    *SEMANTIC_TOKENS,\n]\n\n\nclass FishTokenizer:\n    def __init__(self, model_path: str):\n        self._tokenizer = AutoTokenizer.from_pretrained(model_path)\n        self.semantic_id_to_token_id = {}\n\n        vocab = self._tokenizer.get_vocab()\n        valid_ids = []\n\n        for code_idx in range(4096):\n            token = SEMANTIC_TOKEN_TEMPLATE.format(i=code_idx)\n            if token in vocab:\n                token_id = vocab[token]\n                self.semantic_id_to_token_id[code_idx] = token_id\n                valid_ids.append(token_id)\n\n        if not valid_ids:\n            logger.error(\n                \"CRITICAL ERROR: 
No semantic tokens found in vocab! Audio cannot be synthesized.\"\n            )\n            self.semantic_begin_id = 0\n            self.semantic_end_id = 0\n            # Dummy tensor to prevent crash, though generation will fail\n            self.semantic_map_tensor = torch.zeros(4096, dtype=torch.long)\n        else:\n            self.semantic_begin_id = min(valid_ids)\n            self.semantic_end_id = max(valid_ids)\n            # Create a lookup tensor to handle potential gaps in token IDs safely\n            self.semantic_map_tensor = torch.zeros(4096, dtype=torch.long)\n            for k, v in self.semantic_id_to_token_id.items():\n                self.semantic_map_tensor[k] = v\n\n        logger.info(\n            f\"Loaded Tokenizer. Semantic Range: {self.semantic_begin_id} -> {self.semantic_end_id}\"\n        )\n\n    @property\n    def vocab_size(self):\n        return self._tokenizer.vocab_size\n\n    @property\n    def pad_token_id(self):\n        return self._tokenizer.pad_token_id\n\n    @property\n    def eos_token_id(self):\n        return self._tokenizer.eos_token_id\n\n    def get_token_id(self, token: str) -> int:\n        return self._tokenizer.convert_tokens_to_ids(token)\n\n    def encode(\n        self, text: str, add_special_tokens: bool = False, **kwargs\n    ) -> List[int]:\n        # [FIX] Force Qwen/Tiktoken backends to parse special tokens inline\n        import inspect\n\n        sig = inspect.signature(self._tokenizer.encode)\n        if \"allowed_special\" in sig.parameters and \"allowed_special\" not in kwargs:\n            kwargs[\"allowed_special\"] = \"all\"\n        return self._tokenizer.encode(\n            text, add_special_tokens=add_special_tokens, **kwargs\n        )\n\n    def decode(self, tokens: Union[List[int], int], **kwargs) -> str:\n        return self._tokenizer.decode(tokens, **kwargs)\n\n    def save_pretrained(self, path: str):\n        self._tokenizer.save_pretrained(path)\n\n    @classmethod\n    def 
from_pretrained(cls, path: str):\n        return cls(path)\n\n    def __getattr__(self, name):\n        return getattr(self._tokenizer, name)\n"
  },
  {
    "path": "fish_speech/train.py",
    "content": "import os\n\nos.environ[\"USE_LIBUV\"] = \"0\"\nimport sys\nfrom typing import Optional\n\nimport hydra\nimport lightning as L\nimport pyrootutils\nimport torch\nfrom lightning import Callback, LightningDataModule, LightningModule, Trainer\nfrom lightning.pytorch.loggers import Logger\nfrom lightning.pytorch.strategies import DDPStrategy\nfrom omegaconf import DictConfig, OmegaConf\n\nos.environ.pop(\"SLURM_NTASKS\", None)\nos.environ.pop(\"SLURM_JOB_NAME\", None)\nos.environ.pop(\"SLURM_NTASKS_PER_NODE\", None)\n\n# register eval resolver and root\npyrootutils.setup_root(__file__, indicator=\".project-root\", pythonpath=True)\n\n# Allow TF32 on Ampere GPUs\ntorch.set_float32_matmul_precision(\"high\")\ntorch.backends.cudnn.allow_tf32 = True\n\n# register eval resolver\nOmegaConf.register_new_resolver(\"eval\", eval)\n\nimport fish_speech.utils as utils\n\nlog = utils.RankedLogger(__name__, rank_zero_only=True)\n\n\n@utils.task_wrapper\ndef train(cfg: DictConfig) -> tuple[dict, dict]:\n    \"\"\"Trains the model. Can additionally evaluate on a testset, using best weights obtained during\n    training.\n    This method is wrapped in optional @task_wrapper decorator, that controls the behavior during\n    failure. 
Useful for multiruns, saving info about the crash, etc.\n    Args:\n        cfg (DictConfig): Configuration composed by Hydra.\n    Returns:\n        Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects.\n    \"\"\"  # noqa: E501\n\n    # set seed for random number generators in pytorch, numpy and python.random\n    if cfg.get(\"seed\"):\n        L.seed_everything(cfg.seed, workers=False)\n\n    if cfg.get(\"deterministic\"):\n        torch.use_deterministic_algorithms(True)\n\n    log.info(f\"Instantiating datamodule <{cfg.data._target_}>\")\n    datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)\n\n    log.info(f\"Instantiating model <{cfg.model._target_}>\")\n    model: LightningModule = hydra.utils.instantiate(cfg.model)\n\n    log.info(\"Instantiating callbacks...\")\n    callbacks: list[Callback] = utils.instantiate_callbacks(cfg.get(\"callbacks\"))\n\n    log.info(\"Instantiating loggers...\")\n    logger: list[Logger] = utils.instantiate_loggers(cfg.get(\"logger\"))\n\n    log.info(f\"Instantiating trainer <{cfg.trainer._target_}>\")\n    trainer: Trainer = hydra.utils.instantiate(\n        cfg.trainer,\n        callbacks=callbacks,\n        logger=logger,\n    )\n\n    object_dict = {\n        \"cfg\": cfg,\n        \"datamodule\": datamodule,\n        \"model\": model,\n        \"callbacks\": callbacks,\n        \"logger\": logger,\n        \"trainer\": trainer,\n    }\n\n    if logger:\n        log.info(\"Logging hyperparameters!\")\n        utils.log_hyperparameters(object_dict)\n\n    if cfg.get(\"train\"):\n        log.info(\"Starting training!\")\n\n        ckpt_path = cfg.get(\"ckpt_path\")\n        auto_resume = False\n\n        resume_ckpt_path = utils.get_latest_checkpoint(cfg.paths.ckpt_dir)\n        if resume_ckpt_path is not None:\n            ckpt_path = resume_ckpt_path\n            auto_resume = True\n\n        if ckpt_path is not None:\n            log.info(f\"Resuming from checkpoint: 
{ckpt_path}\")\n\n        # resume weights only is disabled for auto-resume\n        if cfg.get(\"resume_weights_only\") and auto_resume is False:\n            log.info(\"Resuming weights only!\")\n            ckpt = torch.load(ckpt_path, map_location=model.device)\n            if \"state_dict\" in ckpt:\n                ckpt = ckpt[\"state_dict\"]\n            err = model.load_state_dict(ckpt, strict=False)\n            log.info(f\"Error loading state dict: {err}\")\n            ckpt_path = None\n\n        trainer.fit(model=model, datamodule=datamodule, ckpt_path=ckpt_path)\n\n    train_metrics = trainer.callback_metrics\n\n    if cfg.get(\"test\"):\n        log.info(\"Starting testing!\")\n        ckpt_path = trainer.checkpoint_callback.best_model_path\n        if ckpt_path == \"\":\n            log.warning(\"Best ckpt not found! Using current weights for testing...\")\n            ckpt_path = cfg.get(\"ckpt_path\")\n\n        trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)\n        log.info(f\"Best ckpt path: {ckpt_path}\")\n\n    test_metrics = trainer.callback_metrics\n\n    # merge train and test metrics\n    metric_dict = {**train_metrics, **test_metrics}\n\n    return metric_dict, object_dict\n\n\n@hydra.main(\n    version_base=\"1.3\", config_path=\"./configs\", config_name=\"llama_pretrain.yaml\"\n)\ndef main(cfg: DictConfig) -> Optional[float]:\n    # train the model\n    train(cfg)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "fish_speech/utils/__init__.py",
    "content": "from .braceexpand import braceexpand\nfrom .context import autocast_exclude_mps\nfrom .file import get_latest_checkpoint\nfrom .instantiators import instantiate_callbacks, instantiate_loggers\nfrom .logger import RankedLogger\nfrom .logging_utils import log_hyperparameters\nfrom .rich_utils import enforce_tags, print_config_tree\nfrom .utils import extras, get_metric_value, set_seed, task_wrapper\n\n__all__ = [\n    \"enforce_tags\",\n    \"extras\",\n    \"get_metric_value\",\n    \"RankedLogger\",\n    \"instantiate_callbacks\",\n    \"instantiate_loggers\",\n    \"log_hyperparameters\",\n    \"print_config_tree\",\n    \"task_wrapper\",\n    \"braceexpand\",\n    \"get_latest_checkpoint\",\n    \"autocast_exclude_mps\",\n    \"set_seed\",\n]\n"
  },
  {
    "path": "fish_speech/utils/braceexpand.py",
    "content": "\"\"\"\nBash-style brace expansion\nCopied from: https://github.com/trendels/braceexpand/blob/main/src/braceexpand/__init__.py\nLicense: MIT\n\"\"\"\n\nimport re\nimport string\nfrom itertools import chain, product\nfrom typing import Iterable, Iterator, Optional\n\n__all__ = [\"braceexpand\", \"alphabet\", \"UnbalancedBracesError\"]\n\n\nclass UnbalancedBracesError(ValueError):\n    pass\n\n\nalphabet = string.ascii_uppercase + string.ascii_lowercase\n\nint_range_re = re.compile(r\"^(-?\\d+)\\.\\.(-?\\d+)(?:\\.\\.-?(\\d+))?$\")\nchar_range_re = re.compile(r\"^([A-Za-z])\\.\\.([A-Za-z])(?:\\.\\.-?(\\d+))?$\")\nescape_re = re.compile(r\"\\\\(.)\")\n\n\ndef braceexpand(pattern: str, escape: bool = True) -> Iterator[str]:\n    \"\"\"braceexpand(pattern) -> iterator over generated strings\n\n    Returns an iterator over the strings resulting from brace expansion\n    of pattern. This function implements Brace Expansion as described in\n    bash(1), with the following limitations:\n\n    * A pattern containing unbalanced braces will raise an\n      UnbalancedBracesError exception. 
In bash, unbalanced braces will either\n      be partly expanded or ignored.\n\n    * A mixed-case character range like '{Z..a}' or '{a..Z}' will not\n      include the characters '[]^_`' between 'Z' and 'a'.\n\n    When escape is True (the default), characters in pattern can be\n    prefixed with a backslash to cause them not to be interpreted as\n    special characters for brace expansion (such as '{', '}', ',').\n    To pass through a literal backslash, double it ('\\\\\\\\').\n\n    When escape is False, backslashes in pattern have no special\n    meaning and will be preserved in the output.\n\n    Examples:\n\n    >>> from braceexpand import braceexpand\n\n    # Integer range\n    >>> list(braceexpand('item{1..3}'))\n    ['item1', 'item2', 'item3']\n\n    # Character range\n    >>> list(braceexpand('{a..c}'))\n    ['a', 'b', 'c']\n\n    # Sequence\n    >>> list(braceexpand('index.html{,.backup}'))\n    ['index.html', 'index.html.backup']\n\n    # Nested patterns\n    >>> list(braceexpand('python{2.{5..7},3.{2,3}}'))\n    ['python2.5', 'python2.6', 'python2.7', 'python3.2', 'python3.3']\n\n    # Prefixing an integer with zero causes all numbers to be padded to\n    # the same width.\n    >>> list(braceexpand('{07..10}'))\n    ['07', '08', '09', '10']\n\n    # An optional increment can be specified for ranges.\n    >>> list(braceexpand('{a..g..2}'))\n    ['a', 'c', 'e', 'g']\n\n    # Ranges can go in both directions.\n    >>> list(braceexpand('{4..1}'))\n    ['4', '3', '2', '1']\n\n    # Numbers can be negative\n    >>> list(braceexpand('{2..-1}'))\n    ['2', '1', '0', '-1']\n\n    # Unbalanced braces raise an exception.\n    >>> list(braceexpand('{1{2,3}'))\n    Traceback (most recent call last):\n        ...\n    UnbalancedBracesError: Unbalanced braces: '{1{2,3}'\n\n    # By default, the backslash is the escape character.\n    >>> list(braceexpand(r'{1\\\\{2,3}'))\n    ['1{2', '3']\n\n    # Setting 'escape' to False disables backslash escaping.\n    >>> 
list(braceexpand(r'\\\\{1,2}', escape=False))\n    ['\\\\\\\\1', '\\\\\\\\2']\n\n    \"\"\"\n    return (\n        escape_re.sub(r\"\\1\", s) if escape else s for s in parse_pattern(pattern, escape)\n    )\n\n\ndef parse_pattern(pattern: str, escape: bool) -> Iterator[str]:\n    start = 0\n    pos = 0\n    bracketdepth = 0\n    items: list[Iterable[str]] = []\n\n    # print 'pattern:', pattern\n    while pos < len(pattern):\n        if escape and pattern[pos] == \"\\\\\":\n            pos += 2\n            continue\n        elif pattern[pos] == \"{\":\n            if bracketdepth == 0 and pos > start:\n                # print 'literal:', pattern[start:pos]\n                items.append([pattern[start:pos]])\n                start = pos\n            bracketdepth += 1\n        elif pattern[pos] == \"}\":\n            bracketdepth -= 1\n            if bracketdepth == 0:\n                # print 'expression:', pattern[start+1:pos]\n                expr = pattern[start + 1 : pos]\n                item = parse_expression(expr, escape)\n                if item is None:  # not a range or sequence\n                    items.extend([[\"{\"], parse_pattern(expr, escape), [\"}\"]])\n                else:\n                    items.append(item)\n                start = pos + 1  # skip the closing brace\n        pos += 1\n\n    if bracketdepth != 0:  # unbalanced braces\n        raise UnbalancedBracesError(\"Unbalanced braces: '%s'\" % pattern)\n\n    if start < pos:\n        items.append([pattern[start:]])\n\n    return (\"\".join(item) for item in product(*items))\n\n\ndef parse_expression(expr: str, escape: bool) -> Optional[Iterable[str]]:\n    int_range_match = int_range_re.match(expr)\n    if int_range_match:\n        return make_int_range(*int_range_match.groups())\n\n    char_range_match = char_range_re.match(expr)\n    if char_range_match:\n        return make_char_range(*char_range_match.groups())\n\n    return parse_sequence(expr, escape)\n\n\ndef parse_sequence(seq: 
str, escape: bool) -> Optional[Iterator[str]]:\n    # sequence -> chain(*sequence_items)\n    start = 0\n    pos = 0\n    bracketdepth = 0\n    items: list[Iterable[str]] = []\n\n    # print 'sequence:', seq\n    while pos < len(seq):\n        if escape and seq[pos] == \"\\\\\":\n            pos += 2\n            continue\n        elif seq[pos] == \"{\":\n            bracketdepth += 1\n        elif seq[pos] == \"}\":\n            bracketdepth -= 1\n        elif seq[pos] == \",\" and bracketdepth == 0:\n            items.append(parse_pattern(seq[start:pos], escape))\n            start = pos + 1  # skip the comma\n        pos += 1\n\n    if bracketdepth != 0:\n        raise UnbalancedBracesError\n    if not items:\n        return None\n\n    # part after the last comma (may be the empty string)\n    items.append(parse_pattern(seq[start:], escape))\n    return chain(*items)\n\n\ndef make_int_range(left: str, right: str, incr: Optional[str] = None) -> Iterator[str]:\n    if any([s.startswith((\"0\", \"-0\")) for s in (left, right) if s not in (\"0\", \"-0\")]):\n        padding = max(len(left), len(right))\n    else:\n        padding = 0\n    step = (int(incr) or 1) if incr else 1\n    start = int(left)\n    end = int(right)\n    r = range(start, end + 1, step) if start < end else range(start, end - 1, -step)\n    fmt = \"%0{}d\".format(padding)\n    return (fmt % i for i in r)\n\n\ndef make_char_range(left: str, right: str, incr: Optional[str] = None) -> str:\n    step = (int(incr) or 1) if incr else 1\n    start = alphabet.index(left)\n    end = alphabet.index(right)\n    if start < end:\n        return alphabet[start : end + 1 : step]\n    else:\n        end = end or -len(alphabet)\n        return alphabet[start : end - 1 : -step]\n\n\nif __name__ == \"__main__\":\n    import doctest\n    import sys\n\n    failed, _ = doctest.testmod(optionflags=doctest.IGNORE_EXCEPTION_DETAIL)\n    if failed:\n        sys.exit(1)\n"
  },
  {
    "path": "fish_speech/utils/context.py",
    "content": "from contextlib import nullcontext\n\nimport torch\n\n\ndef autocast_exclude_mps(\n    device_type: str, dtype: torch.dtype\n) -> nullcontext | torch.autocast:\n    return (\n        nullcontext()\n        if torch.backends.mps.is_available()\n        else torch.autocast(device_type, dtype)\n    )\n"
  },
  {
    "path": "fish_speech/utils/file.py",
    "content": "import os\nfrom pathlib import Path\nfrom typing import Union\n\nfrom loguru import logger\nfrom natsort import natsorted\n\nAUDIO_EXTENSIONS = {\n    \".mp3\",\n    \".wav\",\n    \".flac\",\n    \".ogg\",\n    \".m4a\",\n    \".wma\",\n    \".aac\",\n    \".aiff\",\n    \".aif\",\n    \".aifc\",\n}\n\nVIDEO_EXTENSIONS = {\n    \".mp4\",\n    \".avi\",\n}\n\n\ndef get_latest_checkpoint(path: Path | str) -> Path | None:\n    # Find the latest checkpoint\n    ckpt_dir = Path(path)\n\n    if ckpt_dir.exists() is False:\n        return None\n\n    ckpts = sorted(ckpt_dir.glob(\"*.ckpt\"), key=os.path.getmtime)\n    if len(ckpts) == 0:\n        return None\n\n    return ckpts[-1]\n\n\ndef audio_to_bytes(file_path):\n    if not file_path or not Path(file_path).exists():\n        return None\n    with open(file_path, \"rb\") as wav_file:\n        wav = wav_file.read()\n    return wav\n\n\ndef read_ref_text(ref_text):\n    path = Path(ref_text)\n    if path.exists() and path.is_file():\n        with path.open(\"r\", encoding=\"utf-8\") as file:\n            return file.read()\n    return ref_text\n\n\ndef list_files(\n    path: Union[Path, str],\n    extensions: set[str] = set(),\n    recursive: bool = False,\n    sort: bool = True,\n) -> list[Path]:\n    \"\"\"List files in a directory.\n\n    Args:\n        path (Path): Path to the directory.\n        extensions (set, optional): Extensions to filter. Defaults to an empty set, which matches no files.\n        recursive (bool, optional): Whether to search recursively. Defaults to False. Note: currently unused; the search always recurses via rglob.\n        sort (bool, optional): Whether to sort the files. 
Defaults to True.\n\n    Returns:\n        list: List of files.\n    \"\"\"\n\n    if isinstance(path, str):\n        path = Path(path)\n\n    if not path.exists():\n        raise FileNotFoundError(f\"Directory {path} does not exist.\")\n\n    files = [file for ext in extensions for file in path.rglob(f\"*{ext}\")]\n\n    if sort:\n        files = natsorted(files)\n\n    return files\n\n\ndef load_filelist(path: Path | str) -> list[tuple[Path, str, str, str]]:\n    \"\"\"\n    Load a Bert-VITS2 style filelist.\n    \"\"\"\n\n    files = set()\n    results = []\n    count_duplicated, count_not_found = 0, 0\n\n    LANGUAGE_TO_LANGUAGES = {\n        \"zh\": [\"zh\", \"en\"],\n        \"jp\": [\"jp\", \"en\"],\n        \"en\": [\"en\"],\n    }\n\n    with open(path, \"r\", encoding=\"utf-8\") as f:\n        for line in f.readlines():\n            splits = line.strip().split(\"|\", maxsplit=3)\n            if len(splits) != 4:\n                logger.warning(f\"Invalid line: {line}\")\n                continue\n\n            filename, speaker, language, text = splits\n            file = Path(filename)\n            language = language.strip().lower()\n\n            if language == \"ja\":\n                language = \"jp\"\n\n            assert language in [\"zh\", \"jp\", \"en\"], f\"Invalid language {language}\"\n            languages = LANGUAGE_TO_LANGUAGES[language]\n\n            if file in files:\n                logger.warning(f\"Duplicated file: {file}\")\n                count_duplicated += 1\n                continue\n\n            if not file.exists():\n                logger.warning(f\"File not found: {file}\")\n                count_not_found += 1\n                continue\n\n            results.append((file, speaker, languages, text))\n\n    if count_duplicated > 0:\n        logger.warning(f\"Total duplicated files: {count_duplicated}\")\n\n    if count_not_found > 0:\n        logger.warning(f\"Total files not found: {count_not_found}\")\n\n    return 
results\n"
  },
  {
    "path": "fish_speech/utils/instantiators.py",
    "content": "from typing import List\n\nimport hydra\nfrom omegaconf import DictConfig\nfrom pytorch_lightning import Callback\nfrom pytorch_lightning.loggers import Logger\n\nfrom .logger import RankedLogger\n\nlog = RankedLogger(__name__, rank_zero_only=True)\n\n\ndef instantiate_callbacks(callbacks_cfg: DictConfig) -> List[Callback]:\n    \"\"\"Instantiates callbacks from config.\"\"\"\n\n    callbacks: List[Callback] = []\n\n    if not callbacks_cfg:\n        log.warning(\"No callback configs found! Skipping..\")\n        return callbacks\n\n    if not isinstance(callbacks_cfg, DictConfig):\n        raise TypeError(\"Callbacks config must be a DictConfig!\")\n\n    for _, cb_conf in callbacks_cfg.items():\n        if isinstance(cb_conf, DictConfig) and \"_target_\" in cb_conf:\n            log.info(f\"Instantiating callback <{cb_conf._target_}>\")\n            callbacks.append(hydra.utils.instantiate(cb_conf))\n\n    return callbacks\n\n\ndef instantiate_loggers(logger_cfg: DictConfig) -> List[Logger]:\n    \"\"\"Instantiates loggers from config.\"\"\"\n\n    logger: List[Logger] = []\n\n    if not logger_cfg:\n        log.warning(\"No logger configs found! Skipping...\")\n        return logger\n\n    if not isinstance(logger_cfg, DictConfig):\n        raise TypeError(\"Logger config must be a DictConfig!\")\n\n    for _, lg_conf in logger_cfg.items():\n        if isinstance(lg_conf, DictConfig) and \"_target_\" in lg_conf:\n            log.info(f\"Instantiating logger <{lg_conf._target_}>\")\n            logger.append(hydra.utils.instantiate(lg_conf))\n\n    return logger\n"
  },
  {
    "path": "fish_speech/utils/logger.py",
    "content": "import logging\nfrom typing import Mapping, Optional\n\nfrom lightning_utilities.core.rank_zero import rank_prefixed_message, rank_zero_only\n\n\nclass RankedLogger(logging.LoggerAdapter):\n    \"\"\"A multi-GPU-friendly python command line logger.\"\"\"\n\n    def __init__(\n        self,\n        name: str = __name__,\n        rank_zero_only: bool = True,\n        extra: Optional[Mapping[str, object]] = None,\n    ) -> None:\n        \"\"\"Initializes a multi-GPU-friendly python command line logger that logs on all processes\n        with their rank prefixed in the log message.\n\n        :param name: The name of the logger. Default is ``__name__``.\n        :param rank_zero_only: Whether to force all logs to only occur on the rank zero process. Default is `True`.\n        :param extra: (Optional) A dict-like object which provides contextual information. See `logging.LoggerAdapter`.\n        \"\"\"\n        logger = logging.getLogger(name)\n        super().__init__(logger=logger, extra=extra)\n        self.rank_zero_only = rank_zero_only\n\n    def log(\n        self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs\n    ) -> None:\n        \"\"\"Delegate a log call to the underlying logger, after prefixing its message with the rank\n        of the process it's being logged from. If `'rank'` is provided, then the log will only\n        occur on that rank/process.\n\n        :param level: The level to log at. 
Look at `logging.__init__.py` for more information.\n        :param msg: The message to log.\n        :param rank: The rank to log at.\n        :param args: Additional args to pass to the underlying logging function.\n        :param kwargs: Any additional keyword args to pass to the underlying logging function.\n        \"\"\"\n        if self.isEnabledFor(level):\n            msg, kwargs = self.process(msg, kwargs)\n            current_rank = getattr(rank_zero_only, \"rank\", None)\n            if current_rank is None:\n                raise RuntimeError(\n                    \"The `rank_zero_only.rank` needs to be set before use\"\n                )\n            msg = rank_prefixed_message(msg, current_rank)\n            if self.rank_zero_only:\n                if current_rank == 0:\n                    self.logger.log(level, msg, *args, **kwargs)\n            else:\n                if rank is None:\n                    self.logger.log(level, msg, *args, **kwargs)\n                elif current_rank == rank:\n                    self.logger.log(level, msg, *args, **kwargs)\n"
  },
  {
    "path": "fish_speech/utils/logging_utils.py",
    "content": "from lightning.pytorch.utilities import rank_zero_only\n\nfrom fish_speech.utils import logger as log\n\n\n@rank_zero_only\ndef log_hyperparameters(object_dict: dict) -> None:\n    \"\"\"Controls which config parts are saved by lightning loggers.\n\n    Additionally saves:\n    - Number of model parameters\n    \"\"\"\n\n    hparams = {}\n\n    cfg = object_dict[\"cfg\"]\n    model = object_dict[\"model\"]\n    trainer = object_dict[\"trainer\"]\n\n    if not trainer.logger:\n        log.warning(\"Logger not found! Skipping hyperparameter logging...\")\n        return\n\n    hparams[\"model\"] = cfg[\"model\"]\n\n    # save number of model parameters\n    hparams[\"model/params/total\"] = sum(p.numel() for p in model.parameters())\n    hparams[\"model/params/trainable\"] = sum(\n        p.numel() for p in model.parameters() if p.requires_grad\n    )\n    hparams[\"model/params/non_trainable\"] = sum(\n        p.numel() for p in model.parameters() if not p.requires_grad\n    )\n\n    hparams[\"data\"] = cfg[\"data\"]\n    hparams[\"trainer\"] = cfg[\"trainer\"]\n\n    hparams[\"callbacks\"] = cfg.get(\"callbacks\")\n    hparams[\"extras\"] = cfg.get(\"extras\")\n\n    hparams[\"task_name\"] = cfg.get(\"task_name\")\n    hparams[\"tags\"] = cfg.get(\"tags\")\n    hparams[\"ckpt_path\"] = cfg.get(\"ckpt_path\")\n    hparams[\"seed\"] = cfg.get(\"seed\")\n\n    # send hparams to all loggers\n    for logger in trainer.loggers:\n        logger.log_hyperparams(hparams)\n"
  },
  {
    "path": "fish_speech/utils/rich_utils.py",
    "content": "from pathlib import Path\nfrom typing import Sequence\n\nimport rich\nimport rich.syntax\nimport rich.tree\nfrom hydra.core.hydra_config import HydraConfig\nfrom lightning.pytorch.utilities import rank_zero_only\nfrom omegaconf import DictConfig, OmegaConf, open_dict\nfrom rich.prompt import Prompt\n\nfrom fish_speech.utils import logger as log\n\n\n@rank_zero_only\ndef print_config_tree(\n    cfg: DictConfig,\n    print_order: Sequence[str] = (\n        \"data\",\n        \"model\",\n        \"callbacks\",\n        \"logger\",\n        \"trainer\",\n        \"paths\",\n        \"extras\",\n    ),\n    resolve: bool = False,\n    save_to_file: bool = False,\n) -> None:\n    \"\"\"Prints content of DictConfig using Rich library and its tree structure.\n\n    Args:\n        cfg (DictConfig): Configuration composed by Hydra.\n        print_order (Sequence[str], optional): Determines in what order config components are printed.\n        resolve (bool, optional): Whether to resolve reference fields of DictConfig.\n        save_to_file (bool, optional): Whether to export config to the hydra output folder.\n    \"\"\"  # noqa: E501\n\n    style = \"dim\"\n    tree = rich.tree.Tree(\"CONFIG\", style=style, guide_style=style)\n\n    queue = []\n\n    # add fields from `print_order` to queue\n    for field in print_order:\n        (\n            queue.append(field)\n            if field in cfg\n            else log.warning(\n                f\"Field '{field}' not found in config. 
\"\n                + f\"Skipping '{field}' config printing...\"\n            )\n        )\n\n    # add all the other fields to queue (not specified in `print_order`)\n    for field in cfg:\n        if field not in queue:\n            queue.append(field)\n\n    # generate config tree from queue\n    for field in queue:\n        branch = tree.add(field, style=style, guide_style=style)\n\n        config_group = cfg[field]\n        if isinstance(config_group, DictConfig):\n            branch_content = OmegaConf.to_yaml(config_group, resolve=resolve)\n        else:\n            branch_content = str(config_group)\n\n        branch.add(rich.syntax.Syntax(branch_content, \"yaml\"))\n\n    # print config tree\n    rich.print(tree)\n\n    # save config tree to file\n    if save_to_file:\n        with open(Path(cfg.paths.output_dir, \"config_tree.log\"), \"w\") as file:\n            rich.print(tree, file=file)\n\n\n@rank_zero_only\ndef enforce_tags(cfg: DictConfig, save_to_file: bool = False) -> None:\n    \"\"\"Prompts user to input tags from command line if no tags are provided in config.\"\"\"  # noqa: E501\n\n    if not cfg.get(\"tags\"):\n        if \"id\" in HydraConfig().cfg.hydra.job:\n            raise ValueError(\"Specify tags before launching a multirun!\")\n\n        log.warning(\"No tags provided in config. Prompting user to input tags...\")\n        tags = Prompt.ask(\"Enter a list of comma separated tags\", default=\"dev\")\n        tags = [t.strip() for t in tags.split(\",\") if t != \"\"]\n\n        with open_dict(cfg):\n            cfg.tags = tags\n\n        log.info(f\"Tags: {cfg.tags}\")\n\n    if save_to_file:\n        with open(Path(cfg.paths.output_dir, \"tags.log\"), \"w\") as file:\n            rich.print(cfg.tags, file=file)\n"
  },
  {
    "path": "fish_speech/utils/schema.py",
    "content": "import base64\nimport os\nimport queue\nfrom dataclasses import dataclass\nfrom typing import Literal\n\nimport torch\nfrom pydantic import BaseModel, Field, conint, model_validator\nfrom pydantic.functional_validators import SkipValidation\nfrom typing_extensions import Annotated\n\nfrom fish_speech.content_sequence import TextPart, VQPart\n\n\nclass ServeVQPart(BaseModel):\n    type: Literal[\"vq\"] = \"vq\"\n    codes: SkipValidation[list[list[int]]]\n\n\nclass ServeTextPart(BaseModel):\n    type: Literal[\"text\"] = \"text\"\n    text: str\n\n\nclass ServeAudioPart(BaseModel):\n    type: Literal[\"audio\"] = \"audio\"\n    audio: bytes\n\n\nclass ServeRequest(BaseModel):\n    # Raw content sequence dict that we can use with ContentSequence(**content)\n    content: dict\n    max_new_tokens: int = 600\n    top_p: float = 0.7\n    repetition_penalty: float = 1.2\n    temperature: float = 0.7\n    streaming: bool = False\n    num_samples: int = 1\n    early_stop_threshold: float = 1.0\n\n\nclass ServeVQGANEncodeRequest(BaseModel):\n    # The audio here should be in wav, mp3, etc\n    audios: list[bytes]\n\n\nclass ServeVQGANEncodeResponse(BaseModel):\n    tokens: SkipValidation[list[list[list[int]]]]\n\n\nclass ServeVQGANDecodeRequest(BaseModel):\n    tokens: SkipValidation[list[list[list[int]]]]\n\n\nclass ServeVQGANDecodeResponse(BaseModel):\n    # The audio here should be in PCM float16 format\n    audios: list[bytes]\n\n\nclass ServeReferenceAudio(BaseModel):\n    audio: bytes\n    text: str\n\n    @model_validator(mode=\"before\")\n    def decode_audio(cls, values):\n        audio = values.get(\"audio\")\n        if (\n            isinstance(audio, str) and len(audio) > 255\n        ):  # Check if audio is a string (Base64)\n            try:\n                values[\"audio\"] = base64.b64decode(audio)\n            except Exception:\n                # If the audio is not a valid base64 string, we will just ignore it and let the server handle 
it\n                pass\n        return values\n\n    def __repr__(self) -> str:\n        return f\"ServeReferenceAudio(text={self.text!r}, audio_size={len(self.audio)})\"\n\n\nclass ServeTTSRequest(BaseModel):\n    text: str\n    chunk_length: Annotated[int, conint(ge=100, le=1000, strict=True)] = 200\n    # Audio format\n    format: Literal[\"wav\", \"pcm\", \"mp3\", \"opus\"] = \"wav\"\n    # Latency mode (used by api.fish.audio; \"normal\" or \"balanced\")\n    latency: Literal[\"normal\", \"balanced\"] = \"normal\"\n    # Reference audios for in-context learning\n    references: list[ServeReferenceAudio] = []\n    # Reference id\n    # For example, if you want to use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/\n    # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1\n    reference_id: str | None = None\n    seed: int | None = None\n    use_memory_cache: Literal[\"on\", \"off\"] = \"off\"\n    # Normalize text for en & zh, this increases stability for numbers\n    normalize: bool = True\n    # not usually used below\n    streaming: bool = False\n    max_new_tokens: int = 1024\n    top_p: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.8\n    repetition_penalty: Annotated[float, Field(ge=0.9, le=2.0, strict=True)] = 1.1\n    temperature: Annotated[float, Field(ge=0.1, le=1.0, strict=True)] = 0.8\n\n    class Config:\n        # Allow arbitrary types for pytorch related types\n        arbitrary_types_allowed = True\n\n\nclass AddReferenceRequest(BaseModel):\n    id: str = Field(..., min_length=1, max_length=255, pattern=r\"^[a-zA-Z0-9\\-_ ]+$\")\n    audio: bytes\n    text: str = Field(..., min_length=1)\n\n\nclass AddReferenceResponse(BaseModel):\n    success: bool\n    message: str\n    reference_id: str\n\n\nclass ListReferencesResponse(BaseModel):\n    success: bool\n    reference_ids: list[str]\n    message: str = \"Success\"\n\n\nclass DeleteReferenceResponse(BaseModel):\n    success: bool\n    message: str\n    reference_id: str\n\n\nclass 
UpdateReferenceResponse(BaseModel):\n    success: bool\n    message: str\n    old_reference_id: str\n    new_reference_id: str\n"
  },
  {
    "path": "fish_speech/utils/spectrogram.py",
    "content": "import torch\nimport torchaudio.functional as F\nfrom torch import Tensor, nn\nfrom torchaudio.transforms import MelScale\n\n\nclass LinearSpectrogram(nn.Module):\n    def __init__(\n        self,\n        n_fft=2048,\n        win_length=2048,\n        hop_length=512,\n        center=False,\n        mode=\"pow2_sqrt\",\n    ):\n        super().__init__()\n\n        self.n_fft = n_fft\n        self.win_length = win_length\n        self.hop_length = hop_length\n        self.center = center\n        self.mode = mode\n        self.return_complex = True\n\n        self.register_buffer(\"window\", torch.hann_window(win_length), persistent=False)\n\n    def forward(self, y: Tensor) -> Tensor:\n        if y.ndim == 3:\n            y = y.squeeze(1)\n\n        y = torch.nn.functional.pad(\n            y.unsqueeze(1),\n            (\n                (self.win_length - self.hop_length) // 2,\n                (self.win_length - self.hop_length + 1) // 2,\n            ),\n            mode=\"reflect\",\n        ).squeeze(1)\n\n        spec = torch.stft(\n            y,\n            self.n_fft,\n            hop_length=self.hop_length,\n            win_length=self.win_length,\n            window=self.window,\n            center=self.center,\n            pad_mode=\"reflect\",\n            normalized=False,\n            onesided=True,\n            return_complex=self.return_complex,\n        )\n\n        if self.return_complex:\n            spec = torch.view_as_real(spec)\n\n        if self.mode == \"pow2_sqrt\":\n            spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)\n\n        return spec\n\n\nclass LogMelSpectrogram(nn.Module):\n    def __init__(\n        self,\n        sample_rate=44100,\n        n_fft=2048,\n        win_length=2048,\n        hop_length=512,\n        n_mels=128,\n        center=False,\n        f_min=0.0,\n        f_max=None,\n    ):\n        super().__init__()\n\n        self.sample_rate = sample_rate\n        self.n_fft = n_fft\n        
self.win_length = win_length\n        self.hop_length = hop_length\n        self.center = center\n        self.n_mels = n_mels\n        self.f_min = f_min\n        self.f_max = f_max or float(sample_rate // 2)\n\n        self.spectrogram = LinearSpectrogram(n_fft, win_length, hop_length, center)\n\n        fb = F.melscale_fbanks(\n            n_freqs=self.n_fft // 2 + 1,\n            f_min=self.f_min,\n            f_max=self.f_max,\n            n_mels=self.n_mels,\n            sample_rate=self.sample_rate,\n            norm=\"slaney\",\n            mel_scale=\"slaney\",\n        )\n        self.register_buffer(\n            \"fb\",\n            fb,\n            persistent=False,\n        )\n\n    def compress(self, x: Tensor) -> Tensor:\n        return torch.log(torch.clamp(x, min=1e-5))\n\n    def decompress(self, x: Tensor) -> Tensor:\n        return torch.exp(x)\n\n    def apply_mel_scale(self, x: Tensor) -> Tensor:\n        return torch.matmul(x.transpose(-1, -2), self.fb).transpose(-1, -2)\n\n    def forward(\n        self, x: Tensor, return_linear: bool = False, sample_rate: int = None\n    ) -> Tensor:\n        if sample_rate is not None and sample_rate != self.sample_rate:\n            x = F.resample(x, orig_freq=sample_rate, new_freq=self.sample_rate)\n\n        linear = self.spectrogram(x)\n        x = self.apply_mel_scale(linear)\n        x = self.compress(x)\n\n        if return_linear:\n            return x, self.compress(linear)\n\n        return x\n"
  },
  {
    "path": "fish_speech/utils/utils.py",
    "content": "import random\nimport warnings\nfrom importlib.util import find_spec\nfrom typing import Callable\n\nimport numpy as np\nimport torch\nfrom omegaconf import DictConfig\n\nfrom .logger import RankedLogger\nfrom .rich_utils import enforce_tags, print_config_tree\n\nlog = RankedLogger(__name__, rank_zero_only=True)\n\n\ndef extras(cfg: DictConfig) -> None:\n    \"\"\"Applies optional utilities before the task is started.\n\n    Utilities:\n    - Ignoring python warnings\n    - Setting tags from command line\n    - Rich config printing\n    \"\"\"\n\n    # return if no `extras` config\n    if not cfg.get(\"extras\"):\n        log.warning(\"Extras config not found! <cfg.extras=null>\")\n        return\n\n    # disable python warnings\n    if cfg.extras.get(\"ignore_warnings\"):\n        log.info(\"Disabling python warnings! <cfg.extras.ignore_warnings=True>\")\n        warnings.filterwarnings(\"ignore\")\n\n    # prompt user to input tags from command line if none are provided in the config\n    if cfg.extras.get(\"enforce_tags\"):\n        log.info(\"Enforcing tags! <cfg.extras.enforce_tags=True>\")\n        enforce_tags(cfg, save_to_file=True)\n\n    # pretty print config tree using Rich library\n    if cfg.extras.get(\"print_config\"):\n        log.info(\"Printing config tree with Rich! <cfg.extras.print_config=True>\")\n        print_config_tree(cfg, resolve=True, save_to_file=True)\n\n\ndef task_wrapper(task_func: Callable) -> Callable:\n    \"\"\"Optional decorator that controls the failure behavior when executing the task function.\n\n    This wrapper can be used to:\n    - make sure loggers are closed even if the task function raises an exception (prevents multirun failure)\n    - save the exception to a `.log` file\n    - mark the run as failed with a dedicated file in the `logs/` folder (so we can find and rerun it later)\n    - etc. 
(adjust depending on your needs)\n\n    Example:\n    ```\n    @utils.task_wrapper\n    def train(cfg: DictConfig) -> Tuple[dict, dict]:\n\n        ...\n\n        return metric_dict, object_dict\n    ```\n    \"\"\"  # noqa: E501\n\n    def wrap(cfg: DictConfig):\n        # execute the task\n        try:\n            metric_dict, object_dict = task_func(cfg=cfg)\n\n        # things to do if exception occurs\n        except Exception as ex:\n            # save exception to `.log` file\n            log.exception(\"\")\n\n            # some hyperparameter combinations might be invalid or\n            # cause out-of-memory errors so when using hparam search\n            # plugins like Optuna, you might want to disable\n            # raising the below exception to avoid multirun failure\n            raise ex\n\n        # things to always do after either success or exception\n        finally:\n            # display output dir path in terminal\n            log.info(f\"Output dir: {cfg.paths.run_dir}\")\n\n            # always close wandb run (even if exception occurs so multirun won't fail)\n            if find_spec(\"wandb\"):  # check if wandb is installed\n                import wandb\n\n                if wandb.run:\n                    log.info(\"Closing wandb!\")\n                    wandb.finish()\n\n        return metric_dict, object_dict\n\n    return wrap\n\n\ndef get_metric_value(metric_dict: dict, metric_name: str) -> float:\n    \"\"\"Safely retrieves value of the metric logged in LightningModule.\"\"\"\n\n    if not metric_name:\n        log.info(\"Metric name is None! Skipping metric value retrieval...\")\n        return None\n\n    if metric_name not in metric_dict:\n        raise Exception(\n            f\"Metric value not found! 
<metric_name={metric_name}>\\n\"\n            \"Make sure metric name logged in LightningModule is correct!\\n\"\n            \"Make sure `optimized_metric` name in `hparams_search` config is correct!\"\n        )\n\n    metric_value = metric_dict[metric_name].item()\n    log.info(f\"Retrieved metric value! <{metric_name}={metric_value}>\")\n\n    return metric_value\n\n\ndef set_seed(seed: int):\n    if seed < 0:\n        seed = -seed\n    if seed > (1 << 31):\n        seed = 1 << 31\n\n    random.seed(seed)\n    np.random.seed(seed)\n    torch.manual_seed(seed)\n\n    if torch.cuda.is_available():\n        torch.cuda.manual_seed(seed)\n        torch.cuda.manual_seed_all(seed)\n\n    if torch.backends.cudnn.is_available():\n        torch.backends.cudnn.deterministic = True\n        torch.backends.cudnn.benchmark = False\n"
  },
  {
    "path": "inference.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Fish Speech\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### For Windows User / win用户\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"vscode\": {\n     \"languageId\": \"bat\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!chcp 65001\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### For Linux User / Linux 用户\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import locale\\n\",\n    \"locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### Prepare Model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# For Chinese users, you probably want to use mirror to accelerate downloading\\n\",\n    \"# !set HF_ENDPOINT=https://hf-mirror.com\\n\",\n    \"# !export HF_ENDPOINT=https://hf-mirror.com \\n\",\n    \"\\n\",\n    \"!hf download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini/\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## WebUI Inference\\n\",\n    \"\\n\",\n    \"> You can use --compile to fuse CUDA kernels for faster inference (10x).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"vscode\": {\n     \"languageId\": \"shellscript\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!python tools/run_webui.py \\\\\\n\",\n    \"    --llama-checkpoint-path checkpoints/openaudio-s1-mini \\\\\\n\",\n    \"    --decoder-checkpoint-path 
checkpoints/openaudio-s1-mini/codec.pth \\\\\\n\",\n    \"    # --compile\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## Break-down CLI Inference\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 1. Encode reference audio: / 从语音生成 prompt: \\n\",\n    \"\\n\",\n    \"You should get a `fake.npy` file.\\n\",\n    \"\\n\",\n    \"你应该能得到一个 `fake.npy` 文件.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"vscode\": {\n     \"languageId\": \"shellscript\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"## Enter the path to the audio file here\\n\",\n    \"src_audio = r\\\"D:\\\\PythonProject\\\\vo_hutao_draw_appear.wav\\\"\\n\",\n    \"\\n\",\n    \"!python fish_speech/models/dac/inference.py \\\\\\n\",\n    \"    -i {src_audio} \\\\\\n\",\n    \"    --checkpoint-path \\\"checkpoints/openaudio-s1-mini/codec.pth\\\"\\n\",\n    \"\\n\",\n    \"from IPython.display import Audio, display\\n\",\n    \"audio = Audio(filename=\\\"fake.wav\\\")\\n\",\n    \"display(audio)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 2. 
Generate semantic tokens from text: / 从文本生成语义 token:\\n\",\n    \"\\n\",\n    \"> This command will create a codes_N file in the working directory, where N is an integer starting from 0.\\n\",\n    \"\\n\",\n    \"> You may want to use `--compile` to fuse CUDA kernels for faster inference (~30 tokens/second -> ~300 tokens/second).\\n\",\n    \"\\n\",\n    \"> 该命令会在工作目录下创建 codes_N 文件, 其中 N 是从 0 开始的整数.\\n\",\n    \"\\n\",\n    \"> 您可以使用 `--compile` 来融合 cuda 内核以实现更快的推理 (~30 tokens/秒 -> ~300 tokens/秒)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"vscode\": {\n     \"languageId\": \"shellscript\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!python fish_speech/models/text2semantic/inference.py \\\\\\n\",\n    \"    --text \\\"hello world\\\" \\\\\\n\",\n    \"    --prompt-text \\\"The text corresponding to reference audio\\\" \\\\\\n\",\n    \"    --prompt-tokens \\\"fake.npy\\\" \\\\\\n\",\n    \"    --checkpoint-path \\\"checkpoints/openaudio-s1-mini\\\" \\\\\\n\",\n    \"    --num-samples 2\\n\",\n    \"    # --compile\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"### 3. 
Generate speech from semantic tokens: / 从语义 token 生成人声:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"vscode\": {\n     \"languageId\": \"shellscript\"\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!python fish_speech/models/dac/inference.py \\\\\\n\",\n    \"    -i \\\"codes_0.npy\\\" \\\\\\n\",\n    \"    --checkpoint-path \\\"checkpoints/openaudio-s1-mini/codec.pth\\\"\\n\",\n    \"\\n\",\n    \"from IPython.display import Audio, display\\n\",\n    \"audio = Audio(filename=\\\"fake.wav\\\")\\n\",\n    \"display(audio)\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.10.14\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "mkdocs.yml",
    "content": "site_name: Fish Audio\nsite_description: Targeting SOTA TTS solutions.\nsite_url: https://speech.fish.audio\n\n# Repository\nrepo_name: fishaudio/fish-speech\nrepo_url: https://github.com/fishaudio/fish-speech\nedit_uri: blob/main/docs\n\n# Copyright\ncopyright: Copyright &copy; 2023-2025 by Fish Audio\n\ntheme:\n  name: material\n  favicon: assets/logo.svg\n  language: en\n  features:\n    - content.action.edit\n    - content.action.view\n    - navigation.tracking\n    - navigation.footer\n    # - navigation.tabs\n    - search\n    - search.suggest\n    - search.highlight\n    - search.share\n    - content.code.copy\n  logo: assets/logo.svg\n\n  palette:\n    # Palette toggle for automatic mode\n    - media: \"(prefers-color-scheme)\"\n      toggle:\n        icon: material/brightness-auto\n        name: Switch to light mode\n\n    # Palette toggle for light mode\n    - media: \"(prefers-color-scheme: light)\"\n      scheme: default\n      toggle:\n        icon: material/brightness-7\n        name: Switch to dark mode\n      primary: black\n      font:\n        code: Roboto Mono\n\n    # Palette toggle for dark mode\n    - media: \"(prefers-color-scheme: dark)\"\n      scheme: slate\n      toggle:\n        icon: material/brightness-4\n        name: Switch to light mode\n      primary: black\n      font:\n        code: Roboto Mono\n\nnav:\n  - Introduction: en/index.md\n  - Installation: en/install.md\n  - Finetune: en/finetune.md\n  - Inference: en/inference.md\n  - Server: en/server.md\n  - Samples: en/samples.md\n\n# Plugins\nplugins:\n  - search:\n      separator: '[\\s\\-,:!=\\[\\]()\"`/]+|\\.(?!\\d)|&[lg]t;|(?!\\b)(?=[A-Z][a-z])'\n      lang:\n        - en\n        - zh\n        - ja\n        - pt\n        - ko\n        - ar\n  - i18n:\n      docs_structure: folder\n      languages:\n        - locale: en\n          name: English\n          default: true\n          build: true\n        - locale: zh\n          name: 简体中文\n          build: true\n  
        nav:\n            - 介绍: zh/index.md\n            - 安装: zh/install.md\n            - 微调: zh/finetune.md\n            - 推理: zh/inference.md\n            - 示例: zh/samples.md\n        - locale: ja\n          name: 日本語\n          build: true\n          nav:\n            - はじめに: ja/index.md\n            - インストール: ja/install.md\n            - ファインチューニング: ja/finetune.md\n            - 推論: ja/inference.md\n            - サンプル: ja/samples.md\n        - locale: pt\n          name: Português (Brasil)\n          build: true\n          nav:\n            - Introdução: pt/index.md\n            - Instalação: pt/install.md\n            - Ajuste Fino: pt/finetune.md\n            - Inferência: pt/inference.md\n            - Amostras: pt/samples.md\n        - locale: ko\n          name: 한국어\n          build: true\n          nav:\n            - 소개: ko/index.md\n            - 설치: ko/install.md\n            - 파인튜닝: ko/finetune.md\n            - 추론: ko/inference.md\n            - 샘플: ko/samples.md\n        - locale: ar\n          name: العربية\n          build: true\n          nav:\n            - مقدمة: ar/index.md\n            - التثبيت: ar/install.md\n            - الضبط الدقيق: ar/finetune.md\n            - الاستنتاج: ar/inference.md\n            - العينات: ar/samples.md\n\nmarkdown_extensions:\n  - pymdownx.highlight:\n      anchor_linenums: true\n      line_spans: __span\n      pygments_lang_class: true\n  - pymdownx.inlinehilite\n  - pymdownx.snippets\n  - pymdownx.superfences\n  - admonition\n  - pymdownx.details\n  - pymdownx.superfences\n  - attr_list\n  - md_in_html\n  - pymdownx.superfences\n\nextra_css:\n  - stylesheets/extra.css\n\nextra:\n  social:\n    - icon: fontawesome/brands/discord\n      link: https://discord.gg/Es5qTB9BcN\n    - icon: fontawesome/brands/docker\n      link: https://hub.docker.com/r/fishaudio/fish-speech\n    - icon: fontawesome/brands/qq\n      link: 
http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093\n  homepage: https://speech.fish.audio\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"fish-speech\"\nversion = \"2.0.0\"\nauthors = [\n    {name = \"Fish Audio\", email = \"oss@fish.audio\"},\n]\ndescription = \"Fish Speech\"\nreadme = \"README.md\"\nrequires-python = \">=3.10\"\nkeywords = [\"TTS\", \"Speech\"]\nlicense = {text = \"Fish Audio Research License\"}\nclassifiers = [\n    \"Programming Language :: Python :: 3\",\n]\ndependencies = [\n    \"numpy\",\n    \"torch==2.8.0\",\n    \"torchaudio==2.8.0\",\n    \"transformers<=4.57.3\",\n    \"datasets==2.18.0\",\n    \"lightning>=2.1.0\",\n    \"hydra-core>=1.3.2\",\n    \"tensorboard>=2.14.1\",\n    \"natsort>=8.4.0\",\n    \"einops>=0.7.0\",\n    \"librosa>=0.10.1\",\n    \"rich>=13.5.3\",\n    \"gradio>5.0.0\",\n    \"wandb>=0.15.11\",\n    \"grpcio>=1.58.0\",\n    \"kui>=1.6.0\",\n    \"uvicorn>=0.30.0\",\n    \"loguru>=0.6.0\",\n    \"loralib>=0.1.2\",\n    \"pyrootutils>=1.0.4\",\n    \"resampy>=0.4.3\",\n    \"einx[torch]==0.2.2\",\n    \"zstandard>=0.22.0\",\n    \"pydub\",\n    \"pyaudio\",\n    \"modelscope==1.17.1\",\n    \"opencc-python-reimplemented==0.1.7\",\n    \"silero-vad\",\n    \"ormsgpack\",\n    \"tiktoken>=0.8.0\",\n    \"pydantic==2.9.2\",\n    \"cachetools\",\n    \"descript-audio-codec\",\n    \"descript-audiotools\",\n    \"safetensors\"\n]\n\n[project.optional-dependencies]\nstable = [\n    \"torch==2.8.0\",\n    \"torchaudio\",\n]\ncpu = [\n  \"torch==2.8.0\",\n  \"torchaudio\",\n]\ncu126 = [\n  \"torch==2.8.0\",\n  \"torchaudio\",\n]\ncu128 = [\n  \"torch==2.8.0\",\n  \"torchaudio\",\n]\ncu129 = [\n  \"torch==2.8.0\",\n  \"torchaudio\",\n]\n\n[tool.uv]\nconflicts = [\n  [\n    { extra = \"cpu\" },\n    { extra = \"cu126\" },\n    { extra = \"cu128\" },\n    { extra = \"cu129\" },\n  ],\n]\n\n[tool.uv.sources]\ntorch = [\n  { index = \"pytorch-cpu\", extra = \"cpu\" },\n  { index = \"pytorch-cu126\", extra = \"cu126\" },\n  { index = \"pytorch-cu128\", extra = \"cu128\" },\n  { index = \"pytorch-cu129\", extra = \"cu129\" 
},\n]\ntorchaudio = [\n  { index = \"pytorch-cpu\", extra = \"cpu\" },\n  { index = \"pytorch-cu126\", extra = \"cu126\" },\n  { index = \"pytorch-cu128\", extra = \"cu128\" },\n  { index = \"pytorch-cu129\", extra = \"cu129\" },\n]\n\n[[tool.uv.index]]\nname = \"pytorch-cpu\"\nurl = \"https://download.pytorch.org/whl/cpu\"\nexplicit = true\n\n[[tool.uv.index]]\nname = \"pytorch-cu126\"\nurl = \"https://download.pytorch.org/whl/cu126\"\nexplicit = true\n\n[[tool.uv.index]]\nname = \"pytorch-cu128\"\nurl = \"https://download.pytorch.org/whl/cu128\"\nexplicit = true\n\n[[tool.uv.index]]\nname = \"pytorch-cu129\"\nurl = \"https://download.pytorch.org/whl/cu129\"\nexplicit = true\n\n[build-system]\nrequires = [\"setuptools\", \"setuptools-scm\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[tool.setuptools]\npackages = [\"fish_speech\", \"tools\"]\n\n[tool.setuptools_scm]\n"
  },
  {
    "path": "pyrightconfig.json",
    "content": "{\n    \"exclude\": [\n        \"data\",\n        \"filelists\"\n    ]\n}\n"
  },
  {
    "path": "tools/api_client.py",
    "content": "import argparse\nimport base64\nimport time\nimport wave\n\nimport ormsgpack\nimport pyaudio\nimport requests\nfrom pydub import AudioSegment\nfrom pydub.playback import play\n\nfrom fish_speech.utils.file import audio_to_bytes, read_ref_text\nfrom fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=\"Send a WAV file and text to a server and receive synthesized audio.\",\n        formatter_class=argparse.RawTextHelpFormatter,\n    )\n\n    parser.add_argument(\n        \"--url\",\n        \"-u\",\n        type=str,\n        default=\"http://127.0.0.1:8080/v1/tts\",\n        help=\"URL of the server\",\n    )\n    parser.add_argument(\n        \"--text\", \"-t\", type=str, required=True, help=\"Text to be synthesized\"\n    )\n    parser.add_argument(\n        \"--reference_id\",\n        \"-id\",\n        type=str,\n        default=None,\n        help=\"ID of the reference model to be used for the speech\\n(Local: name of folder containing audios and files)\",\n    )\n    parser.add_argument(\n        \"--reference_audio\",\n        \"-ra\",\n        type=str,\n        nargs=\"+\",\n        default=None,\n        help=\"Path to the audio file\",\n    )\n    parser.add_argument(\n        \"--reference_text\",\n        \"-rt\",\n        type=str,\n        nargs=\"+\",\n        default=None,\n        help=\"Reference text for voice synthesis\",\n    )\n    parser.add_argument(\n        \"--output\",\n        \"-o\",\n        type=str,\n        default=\"generated_audio\",\n        help=\"Output audio file name\",\n    )\n    parser.add_argument(\n        \"--play\",\n        action=argparse.BooleanOptionalAction,\n        default=True,\n        help=\"Whether to play audio after receiving data\",\n    )\n    parser.add_argument(\n        \"--format\", type=str, choices=[\"wav\", \"pcm\", \"mp3\", \"opus\"], default=\"wav\"\n    )\n    
parser.add_argument(\n        \"--latency\",\n        type=str,\n        default=\"normal\",\n        choices=[\"normal\", \"balanced\"],\n        help=\"Used in api.fish.audio/v1/tts\",\n    )\n    parser.add_argument(\n        \"--max_new_tokens\",\n        type=int,\n        default=1024,\n        help=\"Maximum new tokens to generate. \\n0 means no limit.\",\n    )\n    parser.add_argument(\n        \"--chunk_length\", type=int, default=300, help=\"Chunk length for synthesis\"\n    )\n    parser.add_argument(\n        \"--top_p\", type=float, default=0.8, help=\"Top-p sampling for synthesis\"\n    )\n    parser.add_argument(\n        \"--repetition_penalty\",\n        type=float,\n        default=1.1,\n        help=\"Repetition penalty for synthesis\",\n    )\n    parser.add_argument(\n        \"--temperature\", type=float, default=0.8, help=\"Temperature for sampling\"\n    )\n\n    # parser.add_argument(\"--streaming\", type=bool, default=False, help=\"Enable streaming response\")\n    parser.add_argument(\n        \"--streaming\", action=\"store_true\", help=\"Enable streaming response\"\n    )\n    parser.add_argument(\n        \"--channels\", type=int, default=1, help=\"Number of audio channels\"\n    )\n    parser.add_argument(\"--rate\", type=int, default=44100, help=\"Sample rate for audio\")\n    parser.add_argument(\n        \"--use_memory_cache\",\n        type=str,\n        default=\"off\",\n        choices=[\"on\", \"off\"],\n        help=\"Cache encoded references codes in memory.\\n\",\n    )\n    parser.add_argument(\n        \"--seed\",\n        type=int,\n        default=None,\n        help=\"`None` means randomized inference, otherwise deterministic.\\nIt can't be used for fixing a timbre.\",\n    )\n    parser.add_argument(\n        \"--api_key\",\n        type=str,\n        default=\"YOUR_API_KEY\",\n        help=\"API key for authentication\",\n    )\n\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = 
parse_args()\n\n    idstr: str | None = args.reference_id\n    # priority: ref_id > [{text, audio},...]\n    if idstr is None:\n        ref_audios = args.reference_audio\n        ref_texts = args.reference_text\n        if ref_audios is None:\n            byte_audios = []\n        else:\n            byte_audios = [audio_to_bytes(ref_audio) for ref_audio in ref_audios]\n        if ref_texts is None:\n            ref_texts = []\n        else:\n            ref_texts = [read_ref_text(ref_text) for ref_text in ref_texts]\n    else:\n        byte_audios = []\n        ref_texts = []\n        pass  # in api.py\n\n    data = {\n        \"text\": args.text,\n        \"references\": [\n            ServeReferenceAudio(\n                audio=ref_audio if ref_audio is not None else b\"\", text=ref_text\n            )\n            for ref_text, ref_audio in zip(ref_texts, byte_audios)\n        ],\n        \"reference_id\": idstr,\n        \"format\": args.format,\n        \"latency\": args.latency,\n        \"max_new_tokens\": args.max_new_tokens,\n        \"chunk_length\": args.chunk_length,\n        \"top_p\": args.top_p,\n        \"repetition_penalty\": args.repetition_penalty,\n        \"temperature\": args.temperature,\n        \"streaming\": args.streaming,\n        \"use_memory_cache\": args.use_memory_cache,\n        \"seed\": args.seed,\n    }\n\n    pydantic_data = ServeTTSRequest(**data)\n\n    print(\"Sending request\")\n    start_time = time.time()\n    response = requests.post(\n        args.url,\n        params={\"format\": \"msgpack\"},\n        data=ormsgpack.packb(pydantic_data, option=ormsgpack.OPT_SERIALIZE_PYDANTIC),\n        stream=args.streaming,\n        headers={\n            \"authorization\": f\"Bearer {args.api_key}\",\n            \"content-type\": \"application/msgpack\",\n        },\n    )\n    end_time = time.time()\n    print(f\"Request took {end_time - start_time} seconds\")\n\n    if response.status_code == 200:\n        if args.streaming:\n    
        p = pyaudio.PyAudio()\n            audio_format = pyaudio.paInt16  # Assuming 16-bit PCM format\n            stream = p.open(\n                format=audio_format, channels=args.channels, rate=args.rate, output=True\n            )\n\n            wf = wave.open(f\"{args.output}.wav\", \"wb\")\n            wf.setnchannels(args.channels)\n            wf.setsampwidth(p.get_sample_size(audio_format))\n            wf.setframerate(args.rate)\n\n            stream_stopped_flag = False\n\n            try:\n                for chunk in response.iter_content(chunk_size=1024):\n                    if chunk:\n                        stream.write(chunk)\n                        wf.writeframesraw(chunk)\n                    else:\n                        if not stream_stopped_flag:\n                            stream.stop_stream()\n                            stream_stopped_flag = True\n            finally:\n                stream.close()\n                p.terminate()\n                wf.close()\n        else:\n            audio_content = response.content\n            audio_path = f\"{args.output}.{args.format}\"\n            with open(audio_path, \"wb\") as audio_file:\n                audio_file.write(audio_content)\n\n            audio = AudioSegment.from_file(audio_path, format=args.format)\n            if args.play:\n                play(audio)\n            print(f\"Audio has been saved to '{audio_path}'.\")\n    else:\n        print(f\"Request failed with status code {response.status_code}\")\n        print(response.json())\n"
  },
  {
    "path": "tools/api_server.py",
    "content": "import re\nfrom threading import Lock\n\nimport pyrootutils\nimport uvicorn\nfrom kui.asgi import (\n    Depends,\n    FactoryClass,\n    HTTPException,\n    HttpRoute,\n    Kui,\n    OpenAPI,\n    Routes,\n)\nfrom kui.cors import CORSConfig\nfrom kui.openapi.specification import Info\nfrom kui.security import bearer_auth\nfrom loguru import logger\nfrom typing_extensions import Annotated\n\npyrootutils.setup_root(__file__, indicator=\".project-root\", pythonpath=True)\n\nfrom tools.server.api_utils import MsgPackRequest, parse_args\nfrom tools.server.exception_handler import ExceptionHandler\nfrom tools.server.model_manager import ModelManager\nfrom tools.server.views import routes\n\n\nclass API(ExceptionHandler):\n    def __init__(self):\n        self.args = parse_args()\n\n        def api_auth(endpoint):\n            async def verify(token: Annotated[str, Depends(bearer_auth)]):\n                if token != self.args.api_key:\n                    raise HTTPException(401, None, \"Invalid token\")\n                return await endpoint()\n\n            async def passthrough():\n                return await endpoint()\n\n            if self.args.api_key is not None:\n                return verify\n            else:\n                return passthrough\n\n        self.routes = Routes(\n            routes,  # keep existing routes\n            http_middlewares=[api_auth],  # apply api_auth middleware\n        )\n\n        # OpenAPIの設定\n        self.openapi = OpenAPI(\n            Info(\n                {\n                    \"title\": \"Fish Speech API\",\n                    \"version\": \"1.5.0\",\n                }\n            ),\n        ).routes\n\n        # Initialize the app\n        self.app = Kui(\n            routes=self.routes + self.openapi[1:],  # Remove the default route\n            exception_handlers={\n                HTTPException: self.http_exception_handler,\n                Exception: self.other_exception_handler,\n            
},\n            factory_class=FactoryClass(http=MsgPackRequest),\n            cors_config=CORSConfig(),\n        )\n\n        # Add the state variables\n        self.app.state.lock = Lock()\n        self.app.state.device = self.args.device\n        self.app.state.max_text_length = self.args.max_text_length\n\n        # Associate the app with the model manager\n        self.app.on_startup(self.initialize_app)\n\n    async def initialize_app(self, app: Kui):\n        # Make the ModelManager available to the views\n        app.state.model_manager = ModelManager(\n            mode=self.args.mode,\n            device=self.args.device,\n            half=self.args.half,\n            compile=self.args.compile,\n            llama_checkpoint_path=self.args.llama_checkpoint_path,\n            decoder_checkpoint_path=self.args.decoder_checkpoint_path,\n            decoder_config_name=self.args.decoder_config_name,\n        )\n\n        logger.info(f\"Startup done, listening server at http://{self.args.listen}\")\n\n\n# Each worker process created by Uvicorn has its own memory space,\n# meaning that models and variables are not shared between processes.\n# Therefore, any variables (like `llama_queue` or `decoder_model`)\n# will not be shared across workers.\n\n# Multi-threading for deep learning can cause issues, such as inconsistent\n# outputs if multiple threads access the same buffers simultaneously.\n# Instead, it's better to use multiprocessing or independent models per thread.\n\nif __name__ == \"__main__\":\n    api = API()\n\n    # IPv6 address format is [xxxx:xxxx::xxxx]:port\n    match = re.search(r\"\\[([^\\]]+)\\]:(\\d+)$\", api.args.listen)\n    if match:\n        host, port = match.groups()  # IPv6\n    else:\n        host, port = api.args.listen.split(\":\")  # IPv4\n\n    uvicorn.run(\n        api.app,\n        host=host,\n        port=int(port),\n        workers=api.args.workers,\n        log_level=\"info\",\n    )\n"
  },
  {
    "path": "tools/llama/build_dataset.py",
    "content": "import itertools\nimport os\nimport re\nfrom collections import defaultdict\nfrom functools import partial\nfrom multiprocessing import Pool\nfrom pathlib import Path\n\nimport click\nimport numpy as np\nfrom loguru import logger\nfrom tqdm import tqdm\n\nfrom fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData\nfrom fish_speech.datasets.protos.text_data_stream import pack_pb_stream\nfrom fish_speech.utils.file import load_filelist\n\n# To avoid CPU overload\nos.environ[\"MKL_NUM_THREADS\"] = \"1\"\nos.environ[\"OMP_NUM_THREADS\"] = \"1\"\n\n\ndef task_generator_folder(root: Path, text_extension: str):\n    files = list(tqdm(Path(root).rglob(\"*.npy\"), desc=f\"Loading {root}\"))\n    files = sorted(files)\n\n    grouped_files = defaultdict(list)\n    for file in tqdm(files, desc=f\"Grouping {root}\"):\n        p = str(file.parent)\n        speaker = file.parent.name\n\n        try:\n            if isinstance(text_extension, str):\n                texts = [file.with_suffix(text_extension).read_text(encoding=\"utf-8\")]\n            else:\n                texts = [\n                    file.with_suffix(ext).read_text(encoding=\"utf-8\")\n                    for ext in text_extension\n                ]\n        except Exception as e:\n            logger.error(f\"Failed to read text {file}: {e}\")\n            continue\n\n        grouped_files[p].append((speaker, file, texts))\n\n    logger.info(\n        f\"Found {len(grouped_files)} groups in {root}, {list(grouped_files.keys())[:5]}...\"\n    )\n\n    for i in grouped_files.values():\n        subset = [(f, t) for _, f, t in i]\n        yield i[0][0], subset, \"folder\"\n\n\ndef task_generator_filelist(filelist):\n    grouped_files = defaultdict(list)\n    for filename, speaker, _, text in load_filelist(filelist):\n        grouped_files[speaker].append((Path(filename), [text]))\n\n    logger.info(f\"Found {len(grouped_files)} groups in {filelist}\")\n    for speaker, values 
in grouped_files.items():\n        yield speaker, values, \"filelist\"\n\n\ndef run_task(task):\n    name, subset, source = task\n\n    # Parse the files\n    sentences = []\n    for file, texts in subset:\n        np_file = file.with_suffix(\".npy\")\n        if np_file.exists() is False:\n            logger.warning(f\"Can't find {np_file}\")\n            continue\n\n        new_texts = []\n\n        for text in texts:\n            # Simple cleaning: replace { xxx } and < xxx > with space\n            text = re.sub(r\"\\{.*?\\}\", \" \", text)\n            text = re.sub(r\"<.*?>\", \" \", text)\n            text = re.sub(r\"\\s+\", \" \", text)\n            new_texts.append(text)\n\n        try:\n            semantics = np.load(np_file)\n        except Exception as e:\n            logger.error(f\"Failed to parse {file}: {e}\")\n            continue\n\n        if isinstance(semantics, np.ndarray):\n            semantics = semantics.tolist()\n\n        sentences.append(\n            Sentence(\n                texts=new_texts,\n                semantics=[Semantics(values=s) for s in semantics],\n            )\n        )\n\n    # Pack the sentences\n    return pack_pb_stream(\n        TextData(\n            source=source,\n            name=name,\n            sentences=sentences,\n        )\n    )\n\n\n@click.command()\n@click.option(\n    \"--input\",\n    type=click.Path(path_type=Path),\n    required=True,\n    help=\"A folder containing the dataset or a filelist\",\n    multiple=True,\n)\n@click.option(\n    \"--output\", type=click.Path(path_type=Path), default=\"data/quantized-dataset-ft\"\n)\n@click.option(\"--num-workers\", type=int, default=16)\n@click.option(\"--text-extension\", type=str, default=[\".txt\"], multiple=True)\n@click.option(\n    \"--shard-size\", type=int, default=10, help=\"The maximum size of each shard in mb\"\n)\ndef main(input, output, num_workers, text_extension, shard_size):\n    generator_fns = []\n\n    for f in input:\n        assert 
f.exists(), f\"{f} not found\"\n\n        if f.is_dir():\n            generator_fn = task_generator_folder(f, text_extension)\n        else:\n            generator_fn = task_generator_filelist(f)\n\n        generator_fns.append(generator_fn)\n\n    generator_fn = itertools.chain(*generator_fns)\n    output.mkdir(parents=True, exist_ok=True)\n\n    dataset_fp = None\n    tar_idx = 0\n    written_size = 0\n\n    with Pool(num_workers) as p:\n        for result in tqdm(p.imap_unordered(run_task, generator_fn)):\n            if dataset_fp is None:\n                dataset_fp = open(Path(output) / f\"{tar_idx:08d}.protos\", \"wb\")\n\n            dataset_fp.write(result)\n            written_size += len(result)\n\n            if written_size > shard_size * 1024 * 1024:\n                logger.info(f\"Finished writing {tar_idx} shards to {output}\")\n                dataset_fp.close()\n                dataset_fp = None\n                written_size = 0\n                tar_idx += 1\n\n    if dataset_fp is not None:\n        dataset_fp.close()\n\n    logger.info(f\"Finished writing {tar_idx + 1} shards to {output}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/llama/eval_in_context.py",
    "content": "import pyrootutils\nimport torch\nimport torch.nn.functional as F\nfrom matplotlib import pyplot as plt\nfrom transformers import AutoTokenizer\n\n# register eval resolver and root\npyrootutils.setup_root(__file__, indicator=\".project-root\", pythonpath=True)\n\nfrom torch.utils.data import DataLoader\n\nfrom fish_speech.datasets.semantic import AutoAugTextDataset, TextDataCollator\nfrom fish_speech.models.text2semantic.inference import load_model\n\n\ndef smooth(\n    scalars: list[float], weight: float\n) -> list[float]:  # Weight between 0 and 1\n    last = scalars[0]  # First value in the plot (first timestep)\n    smoothed = list()\n    for point in scalars:\n        smoothed_val = last * weight + (1 - weight) * point  # Calculate smoothed value\n        smoothed.append(smoothed_val)  # Save it\n        last = smoothed_val  # Anchor the last smoothed value\n\n    return smoothed\n\n\n@torch.inference_mode()\ndef analyze_one_model(loader, config, weight, max_length):\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n    model = load_model(\n        config,\n        weight,\n        device,\n        torch.bfloat16,\n        max_length,\n        compile=False,\n    )[0]\n\n    current_step = 0\n    model.eval()\n\n    semantic_loss_sum = torch.zeros(\n        max_length,\n        dtype=torch.float32,\n        device=device,\n    )\n    counter = torch.zeros(\n        max_length,\n        dtype=torch.long,\n        device=device,\n    )\n\n    for batch in loader:\n        batch = {k: v.to(device) for k, v in batch.items()}\n\n        labels = batch[\"labels\"]\n        outputs = model(\n            inp=batch[\"inputs\"],\n            key_padding_mask=batch[\"attention_masks\"],\n        )\n\n        token_logits = outputs.token_logits\n        codebook_logits = outputs.codebook_logits\n\n        # Generate labels\n        base_loss = F.cross_entropy(\n            token_logits.reshape(-1, token_logits.size(-1)),\n            
labels[:, 0].reshape(-1),\n            ignore_index=-100,\n            reduction=\"none\",\n        )\n\n        codebook_labels = labels[:, 1 : 1 + model.config.num_codebooks].mT\n        semantic_loss = F.cross_entropy(\n            codebook_logits.reshape(-1, codebook_logits.size(-1)),\n            codebook_labels.reshape(-1),\n            ignore_index=-100,\n            reduction=\"none\",\n        )\n\n        base_loss = base_loss.reshape(labels[:, 0].shape)\n        semantic_loss = semantic_loss.reshape(codebook_labels.shape)\n\n        semantic_loss_frame = semantic_loss.mean(-1)\n        pad_pos = codebook_labels.sum(-1) == -100 * model.config.num_codebooks\n\n        for loss_sample, pad in zip(semantic_loss_frame, pad_pos):\n            semantic_loss_sum[~pad] += loss_sample[~pad]\n            counter[~pad] += 1\n\n        current_step += 1\n        if current_step == 10:\n            break\n\n    semantic_loss = semantic_loss.cpu()\n    counter = counter.cpu()\n    xs, ys = [], []\n\n    for i, (loss, count) in enumerate(zip(semantic_loss_sum, counter)):\n        if count > 0:\n            xs.append(i)\n            ys.append((loss / count).item())  # for better loss visualization\n\n    smoothed_ys = smooth(ys, 0.95)\n\n    # Unload model\n    del model\n    torch.cuda.empty_cache()\n\n    return xs, ys, smoothed_ys\n\n\ndef main():\n    tokenizer = AutoTokenizer.from_pretrained(\"fishaudio/fish-speech-1\")\n    max_length = 4096\n\n    ds = AutoAugTextDataset(\n        [\"data/protos/sft/云天河\"],\n        tokenizer=tokenizer,\n        use_speaker=False,\n        interactive_prob=1.0,\n        max_length=max_length,\n    )\n\n    loader = DataLoader(\n        ds,\n        batch_size=8,\n        collate_fn=TextDataCollator(tokenizer, max_length=max_length),\n        num_workers=0,\n        shuffle=False,\n    )\n\n    plt.figure(figsize=(10, 5), dpi=200)\n\n    plt.xlabel(\"Frame\")\n    plt.ylabel(\"Loss\")\n    plt.yscale(\"log\")\n    
plt.title(\"Semantic Loss\")\n    plt.grid(which=\"both\", axis=\"both\")\n    plt.xlim(0, max_length)\n\n    tests = [\n        (\n            \"pertrain-medium\",\n            \"dual_ar_2_codebook_medium\",\n            \"checkpoints/text2semantic-pretrain-medium-2k-v1.pth\",\n        ),\n        (\n            \"sft-medium\",\n            \"dual_ar_2_codebook_medium\",\n            \"checkpoints/text2semantic-sft-medium-v1.1-4k.pth\",\n        ),\n        (\n            \"sft-large\",\n            \"dual_ar_2_codebook_large\",\n            \"checkpoints/text2semantic-sft-large-v1.1-4k.pth\",\n        ),\n    ]\n\n    for name, config, weight in tests:\n        xs, _, smoothed_ys = analyze_one_model(loader, config, weight, max_length)\n        plt.plot(xs, smoothed_ys, label=name)\n\n    plt.legend()\n    plt.savefig(\"semantic_loss.png\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/llama/merge_lora.py",
    "content": "import shutil\nfrom copy import deepcopy\nfrom pathlib import Path\n\nimport click\nimport hydra\nimport torch\nfrom hydra import compose, initialize\nfrom hydra.utils import instantiate\nfrom loguru import logger\n\nfrom fish_speech.models.text2semantic.llama import BaseTransformer\nfrom fish_speech.models.text2semantic.lora import get_merged_state_dict\n\n\n@click.command()\n@click.option(\"--lora-config\", type=str, default=\"r_8_alpha_16\")\n@click.option(\"--base-weight\", type=str, default=\"checkpoints/fish-speech-1.4\")\n@click.option(\"--lora-weight\", type=str, required=True)\n@click.option(\"--output\", type=str, required=True)\ndef merge(lora_config, base_weight, lora_weight, output):\n    output = Path(output)\n    logger.info(\n        f\"Merging {base_weight} and {lora_weight} into {output} with {lora_config}\"\n    )\n\n    with initialize(version_base=\"1.3\", config_path=\"../../fish_speech/configs/lora\"):\n        cfg = compose(config_name=lora_config)\n\n    lora_config = instantiate(cfg)\n    logger.info(f\"Loaded lora model with config {lora_config}\")\n\n    llama_model = BaseTransformer.from_pretrained(\n        path=base_weight,\n        load_weights=True,\n        lora_config=lora_config,\n    )\n    logger.info(f\"Loaded llama model\")\n\n    llama_state_dict = llama_model.state_dict()\n    llama_state_dict = {k: v for k, v in llama_state_dict.items() if \"lora\" not in k}\n    llama_state_dict_copy = deepcopy(llama_state_dict)\n    lora_state_dict = torch.load(lora_weight, map_location=\"cpu\", weights_only=False)\n\n    if \"state_dict\" in llama_state_dict:\n        llama_state_dict = llama_state_dict[\"state_dict\"]\n\n    if \"state_dict\" in lora_state_dict:\n        lora_state_dict = lora_state_dict[\"state_dict\"]\n\n    # remove prefix model.\n    if any(k.startswith(\"model.\") for k in llama_state_dict.keys()):\n        llama_state_dict = {\n            k.replace(\"model.\", \"\"): v\n            for k, v in 
llama_state_dict.items()\n            if k.startswith(\"model.\")\n        }\n    if any(k.startswith(\"model.\") for k in lora_state_dict.keys()):\n        lora_state_dict = {\n            k.replace(\"model.\", \"\"): v\n            for k, v in lora_state_dict.items()\n            if k.startswith(\"model.\")\n        }\n\n    logger.info(f\"Found {len(llama_state_dict)} keys in llama model\")\n    logger.info(f\"Found {len(lora_state_dict)} keys in lora model\")\n\n    merged_state_dict = llama_state_dict | lora_state_dict\n    llama_model.load_state_dict(merged_state_dict, strict=True)\n    logger.info(f\"Merged model loaded\")\n\n    # Trigger eval mode to merge lora\n    llama_model.eval()\n    llama_model.save_pretrained(output, drop_lora=True)\n    logger.info(f\"Saved merged model to {output}, validating\")\n\n    new_state_dict = torch.load(output / \"model.pth\", map_location=\"cpu\")\n    original_keys = set(llama_state_dict_copy.keys())\n\n    tolerance = 1e-5\n    for key in original_keys:\n        diff_l1 = (new_state_dict[key] - llama_state_dict_copy[key]).abs().sum().item()\n        if diff_l1 > tolerance:\n            logger.info(f\"Significant difference found in key: {key}\")\n            break\n\n    if diff_l1 <= tolerance:\n        logger.warning(\n            \"Merged model seems identical to the original model. Further validation might be needed.\"\n        )\n    else:\n        logger.info(\"Merged model is different from the original model, check passed\")\n\n\nif __name__ == \"__main__\":\n    merge()\n"
  },
  {
    "path": "tools/llama/quantize.py",
    "content": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n# All rights reserved.\nimport datetime\nimport shutil\n\n# This source code is licensed under the license found in the\n# LICENSE file in the root directory of this source tree.\nimport time\nfrom pathlib import Path\n\nimport click\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom fish_speech.models.text2semantic.inference import load_model\nfrom fish_speech.models.text2semantic.llama import find_multiple\n\n##### Quantization Primitives ######\n\n\ndef dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):\n    # assumes symmetric quantization\n    # assumes axis == 0\n    # assumes dense memory format\n    # TODO(future): relax ^ as needed\n\n    # default setup for affine quantization of activations\n    eps = torch.finfo(torch.float32).eps\n\n    # get min and max\n    min_val, max_val = torch.aminmax(x, dim=1)\n\n    # calculate scales and zero_points based on min and max\n    # reference: https://fburl.com/code/srbiybme\n    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))\n    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))\n    device = min_val_neg.device\n\n    # reference: https://fburl.com/code/4wll53rk\n    max_val_pos = torch.max(-min_val_neg, max_val_pos)\n    scales = max_val_pos / (float(quant_max - quant_min) / 2)\n    # ensure scales is the same dtype as the original tensor\n    scales = torch.clamp(scales, min=eps).to(x.dtype)\n    zero_points = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)\n\n    # quantize based on qmin/qmax/scales/zp\n    # reference: https://www.internalfb.com/code/fbsource/[8edc275012b1]/fbcode/caffe2/torch/ao/quantization/fx/_decomposed.py?lines=63\n    x_div = x / scales.unsqueeze(-1)\n    x_round = torch.round(x_div)\n    x_zp = x_round + zero_points.unsqueeze(-1)\n    quant = torch.clamp(x_zp, quant_min, quant_max).to(target_dtype)\n\n    return quant, scales, 
zero_points\n\n\ndef get_group_qparams(w, n_bit=4, groupsize=128):\n    # needed for GPTQ with padding\n    if groupsize > w.shape[-1]:\n        groupsize = w.shape[-1]\n    assert groupsize > 1\n    assert w.shape[-1] % groupsize == 0\n    assert w.dim() == 2\n\n    to_quant = w.reshape(-1, groupsize)\n    assert torch.isnan(to_quant).sum() == 0\n\n    max_val = to_quant.amax(dim=1, keepdim=True)\n    min_val = to_quant.amin(dim=1, keepdim=True)\n    max_int = 2**n_bit - 1\n    scales = (max_val - min_val).clamp(min=1e-6) / max_int\n    zeros = min_val + scales * (2 ** (n_bit - 1))\n    return scales.to(torch.bfloat16).reshape(w.shape[0], -1), zeros.to(\n        torch.bfloat16\n    ).reshape(w.shape[0], -1)\n\n\ndef pack_scales_and_zeros(scales, zeros):\n    assert scales.shape == zeros.shape\n    assert scales.dtype == torch.bfloat16\n    assert zeros.dtype == torch.bfloat16\n    return (\n        torch.cat(\n            [\n                scales.reshape(scales.size(0), scales.size(1), 1),\n                zeros.reshape(zeros.size(0), zeros.size(1), 1),\n            ],\n            2,\n        )\n        .transpose(0, 1)\n        .contiguous()\n    )\n\n\ndef unpack_scales_and_zeros(scales_and_zeros):\n    assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2\n    assert scales_and_zeros.dtype == torch.float\n    return torch.split(scales_and_zeros.transpose(0, 1), 1, 2)\n\n\ndef group_quantize_tensor_from_qparams(w, scales, zeros, n_bit=4, groupsize=128):\n    assert groupsize > 1\n    # needed for GPTQ single column quantize\n    if groupsize > w.shape[-1] and scales.shape[-1] == 1:\n        groupsize = w.shape[-1]\n\n    assert w.shape[-1] % groupsize == 0\n    assert w.dim() == 2\n\n    to_quant = w.reshape(-1, groupsize)\n    assert torch.isnan(to_quant).sum() == 0\n\n    scales = scales.reshape(-1, 1)\n    zeros = zeros.reshape(-1, 1)\n    min_val = zeros - scales * (2 ** (n_bit - 1))\n    max_int = 2**n_bit - 1\n    min_int = 0\n    
w_int32 = (\n        to_quant.sub(min_val)\n        .div(scales)\n        .round()\n        .clamp_(min_int, max_int)\n        .to(torch.int32)\n        .reshape_as(w)\n    )\n\n    return w_int32\n\n\ndef group_quantize_tensor(w, n_bit=4, groupsize=128):\n    scales, zeros = get_group_qparams(w, n_bit, groupsize)\n    w_int32 = group_quantize_tensor_from_qparams(w, scales, zeros, n_bit, groupsize)\n    scales_and_zeros = pack_scales_and_zeros(scales, zeros)\n    return w_int32, scales_and_zeros\n\n\ndef group_dequantize_tensor_from_qparams(\n    w_int32, scales, zeros, n_bit=4, groupsize=128\n):\n    assert groupsize > 1\n    # needed for GPTQ single column dequantize\n    if groupsize > w_int32.shape[-1] and scales.shape[-1] == 1:\n        groupsize = w_int32.shape[-1]\n    assert w_int32.shape[-1] % groupsize == 0\n    assert w_int32.dim() == 2\n\n    w_int32_grouped = w_int32.reshape(-1, groupsize)\n    scales = scales.reshape(-1, 1)\n    zeros = zeros.reshape(-1, 1)\n\n    w_dq = (\n        w_int32_grouped.sub(2 ** (n_bit - 1)).mul(scales).add(zeros).reshape_as(w_int32)\n    )\n    return w_dq\n\n\ndef group_dequantize_tensor(w_int32, scales_and_zeros, n_bit=4, groupsize=128):\n    scales, zeros = unpack_scales_and_zeros(scales_and_zeros)\n    return group_dequantize_tensor_from_qparams(\n        w_int32, scales, zeros, n_bit, groupsize\n    )\n\n\nclass QuantHandler:\n    def __init__(self, mod):\n        self.mod = mod\n\n    def create_quantized_state_dict(self) -> \"StateDict\":\n        pass\n\n    def convert_for_runtime(self) -> \"nn.Module\":\n        pass\n\n\n##### Weight-only int8 per-channel quantized code ######\n\n\ndef replace_linear_weight_only_int8_per_channel(module):\n    for name, child in module.named_children():\n        if isinstance(child, nn.Linear):\n            setattr(\n                module,\n                name,\n                WeightOnlyInt8Linear(child.in_features, child.out_features),\n            )\n        else:\n          
  replace_linear_weight_only_int8_per_channel(child)\n\n\nclass WeightOnlyInt8QuantHandler:\n    def __init__(self, mod):\n        self.mod = mod\n\n    @torch.no_grad()\n    def create_quantized_state_dict(self):\n        cur_state_dict = self.mod.state_dict()\n        for fqn, mod in self.mod.named_modules():\n            if isinstance(mod, torch.nn.Linear):\n                int8_weight, scales, _ = dynamically_quantize_per_channel(\n                    mod.weight.float(), -128, 127, torch.int8\n                )\n                cur_state_dict[f\"{fqn}.weight\"] = int8_weight\n                cur_state_dict[f\"{fqn}.scales\"] = scales.to(mod.weight.dtype)\n\n        return cur_state_dict\n\n    def convert_for_runtime(self):\n        replace_linear_weight_only_int8_per_channel(self.mod)\n        return self.mod\n\n\nclass WeightOnlyInt8Linear(torch.nn.Module):\n    __constants__ = [\"in_features\", \"out_features\"]\n    in_features: int\n    out_features: int\n    weight: torch.Tensor\n\n    def __init__(\n        self,\n        in_features: int,\n        out_features: int,\n        bias: bool = True,\n        device=None,\n        dtype=None,\n    ) -> None:\n        factory_kwargs = {\"device\": device, \"dtype\": dtype}\n        super().__init__()\n        self.in_features = in_features\n        self.out_features = out_features\n        self.register_buffer(\n            \"weight\", torch.empty((out_features, in_features), dtype=torch.int8)\n        )\n        self.register_buffer(\"scales\", torch.ones(out_features, dtype=torch.bfloat16))\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        return F.linear(input, self.weight.to(dtype=input.dtype)) * self.scales\n\n\n##### weight only int4 per channel groupwise quantized code ######\n\n\ndef prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_tiles):\n    weight_int32, scales_and_zeros = group_quantize_tensor(\n        weight_bf16, n_bit=4, groupsize=groupsize\n    )\n 
   weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(\n        weight_int32, inner_k_tiles\n    )\n    return weight_int4pack, scales_and_zeros\n\n\ndef linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize):\n    origin_x_size = x.size()\n    x = x.reshape(-1, origin_x_size[-1])\n    c = torch.ops.aten._weight_int4pack_mm(\n        x, weight_int4pack, groupsize, scales_and_zeros\n    )\n    new_shape = origin_x_size[:-1] + (out_features,)\n    c = c.reshape(new_shape)\n    return c\n\n\ndef _check_linear_int4_k(k, groupsize=1, inner_k_tiles=1):\n    return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0\n\n\ndef replace_linear_int4(module, groupsize, inner_k_tiles, padding):\n    for name, child in module.named_children():\n        if isinstance(child, nn.Linear):\n            if _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles):\n                setattr(\n                    module,\n                    name,\n                    WeightOnlyInt4Linear(\n                        child.in_features,\n                        child.out_features,\n                        bias=False,\n                        groupsize=groupsize,\n                        inner_k_tiles=inner_k_tiles,\n                        padding=False,\n                    ),\n                )\n            elif padding:\n                setattr(\n                    module,\n                    name,\n                    WeightOnlyInt4Linear(\n                        child.in_features,\n                        child.out_features,\n                        bias=False,\n                        groupsize=groupsize,\n                        inner_k_tiles=inner_k_tiles,\n                        padding=True,\n                    ),\n                )\n        else:\n            replace_linear_int4(child, groupsize, inner_k_tiles, padding)\n\n\nclass WeightOnlyInt4QuantHandler:\n    def __init__(self, mod, groupsize=128, inner_k_tiles=8, 
padding=True):\n        self.mod = mod\n        self.groupsize = groupsize\n        self.inner_k_tiles = inner_k_tiles\n        self.padding = padding\n        assert groupsize in [32, 64, 128, 256]\n        assert inner_k_tiles in [2, 4, 8]\n\n    @torch.no_grad()\n    def create_quantized_state_dict(self):\n        cur_state_dict = self.mod.state_dict()\n        for fqn, mod in self.mod.named_modules():\n            if isinstance(mod, torch.nn.Linear):\n                assert not mod.bias\n                out_features = mod.out_features\n                in_features = mod.in_features\n                assert out_features % 8 == 0, \"require out_features % 8 == 0\"\n                print(f\"linear: {fqn}, in={in_features}, out={out_features}\")\n\n                weight = mod.weight.data\n                if not _check_linear_int4_k(\n                    in_features, self.groupsize, self.inner_k_tiles\n                ):\n                    if self.padding:\n                        import torch.nn.functional as F\n\n                        print(\n                            f\"warning: {fqn} is padded to satisfy in_features % 1024 == 0\"\n                        )\n                        padded_in_features = find_multiple(in_features, 1024)\n                        weight = F.pad(\n                            weight, pad=(0, padded_in_features - in_features)\n                        )\n                    else:\n                        print(\n                            f\"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, \"\n                            + \"and that groupsize and inner_k_tiles*16 evenly divide into it\"\n                        )\n                        continue\n                (\n                    weight_int4pack,\n                    scales_and_zeros,\n                ) = prepare_int4_weight_and_scales_and_zeros(\n                    weight.to(torch.bfloat16).to(\"cuda\"),\n                    
self.groupsize,\n                    self.inner_k_tiles,\n                )\n                cur_state_dict[f\"{fqn}.weight\"] = weight_int4pack.to(\"cpu\")\n                cur_state_dict[f\"{fqn}.scales_and_zeros\"] = scales_and_zeros.to(\"cpu\")\n\n        return cur_state_dict\n\n    def convert_for_runtime(self):\n        replace_linear_int4(self.mod, self.groupsize, self.inner_k_tiles, self.padding)\n        return self.mod\n\n\nclass WeightOnlyInt4Linear(torch.nn.Module):\n    __constants__ = [\"in_features\", \"out_features\"]\n    in_features: int\n    out_features: int\n    weight: torch.Tensor\n\n    def __init__(\n        self,\n        in_features: int,\n        out_features: int,\n        bias=True,\n        device=None,\n        dtype=None,\n        groupsize: int = 128,\n        inner_k_tiles: int = 8,\n        padding: bool = True,\n    ) -> None:\n        super().__init__()\n        self.padding = padding\n        if padding:\n            self.origin_in_features = in_features\n            in_features = find_multiple(in_features, 1024)\n\n        self.in_features = in_features\n        self.out_features = out_features\n        assert not bias, \"require bias=False\"\n        self.groupsize = groupsize\n        self.inner_k_tiles = inner_k_tiles\n\n        assert out_features % 8 == 0, \"require out_features % 8 == 0\"\n        assert (\n            in_features % (inner_k_tiles * 16) == 0\n        ), \"require in_features % (innerKTiles * 16) == 0\"\n        self.register_buffer(\n            \"weight\",\n            torch.empty(\n                (\n                    out_features // 8,\n                    in_features // (inner_k_tiles * 16),\n                    32,\n                    inner_k_tiles // 2,\n                ),\n                dtype=torch.int32,\n            ),\n        )\n        self.register_buffer(\n            \"scales_and_zeros\",\n            torch.empty(\n                (in_features // groupsize, out_features, 2), 
dtype=torch.bfloat16\n            ),\n        )\n\n    def forward(self, input: torch.Tensor) -> torch.Tensor:\n        input = input.to(torch.bfloat16)\n        if self.padding:\n            import torch.nn.functional as F\n\n            input = F.pad(input, pad=(0, self.in_features - self.origin_in_features))\n        return linear_forward_int4(\n            input, self.weight, self.scales_and_zeros, self.out_features, self.groupsize\n        )\n\n\ndef generate_folder_name():\n    now = datetime.datetime.now()\n    folder_name = now.strftime(\"%Y%m%d_%H%M%S\")\n    return folder_name\n\n\n@click.command()\n@click.option(\n    \"--checkpoint-path\",\n    type=click.Path(path_type=Path, exists=True),\n    default=\"checkpoints/fish-speech-1.4\",\n)\n@click.option(\n    \"--mode\", type=str, default=\"int8\", help=\"type of quantization to perform\"\n)\n@click.option(\n    \"--groupsize\", type=int, default=128, help=\"Group size for int4 quantization.\"\n)\n@click.option(\"--timestamp\", type=str, default=\"None\", help=\"When to do quantization\")\ndef quantize(checkpoint_path: Path, mode: str, groupsize: int, timestamp: str) -> None:\n\n    device = \"cpu\"\n    precision = torch.bfloat16\n\n    print(\"Loading model ...\")\n    t0 = time.time()\n\n    model, _ = load_model(\n        checkpoint_path=checkpoint_path,\n        device=device,\n        precision=precision,\n        compile=False,\n    )\n    vq_model = \"codec.pth\"\n    now = timestamp if timestamp != \"None\" else generate_folder_name()\n\n    if mode == \"int8\":\n        print(\n            \"Quantizing model weights for int8 weight-only symmetric per-channel quantization\"\n        )\n        quant_handler = WeightOnlyInt8QuantHandler(model)\n        quantized_state_dict = quant_handler.create_quantized_state_dict()\n\n        dir_name = checkpoint_path\n        dst_name = Path(f\"checkpoints/fs-1.2-int8-{now}\")\n        shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))\n       
 if (dst_name / vq_model).exists():\n            (dst_name / vq_model).unlink()\n        quantize_path = dst_name / \"model.pth\"\n\n    elif mode == \"int4\":\n        print(\n            \"Quantizing model weights for int4 weight-only affine per-channel groupwise quantization\"\n        )\n        quant_handler = WeightOnlyInt4QuantHandler(model, groupsize)\n        quantized_state_dict = quant_handler.create_quantized_state_dict()\n\n        dir_name = checkpoint_path\n        dst_name = Path(f\"checkpoints/fs-1.2-int4-g{groupsize}-{now}\")\n        shutil.copytree(str(dir_name.resolve()), str(dst_name.resolve()))\n        if (dst_name / vq_model).exists():\n            (dst_name / vq_model).unlink()\n        quantize_path = dst_name / \"model.pth\"\n\n    else:\n        raise ValueError(\n            f\"Invalid quantization mode {mode} needs to be one of [int8, int4, int4-gpptq]\"\n        )\n\n    print(f\"Writing quantized weights to {quantize_path}\")\n    quantize_path.unlink(missing_ok=True)  # remove existing file if one already there\n    torch.save(quantized_state_dict, quantize_path)\n    print(f\"Quantization complete took {time.time() - t0:.02f} seconds\")\n\n\nif __name__ == \"__main__\":\n    quantize()\n"
  },
  {
    "path": "tools/run_webui.py",
    "content": "import os\nfrom argparse import ArgumentParser\nfrom pathlib import Path\n\nimport pyrootutils\nimport torch\nfrom loguru import logger\n\npyrootutils.setup_root(__file__, indicator=\".project-root\", pythonpath=True)\n\nfrom fish_speech.inference_engine import TTSInferenceEngine\nfrom fish_speech.models.dac.inference import load_model as load_decoder_model\nfrom fish_speech.models.text2semantic.inference import launch_thread_safe_queue\nfrom fish_speech.utils.schema import ServeTTSRequest\nfrom tools.webui import build_app\nfrom tools.webui.inference import get_inference_wrapper\n\n# Make einx happy\nos.environ[\"EINX_FILTER_TRACEBACK\"] = \"false\"\n\n\ndef parse_args():\n    parser = ArgumentParser()\n    parser.add_argument(\n        \"--llama-checkpoint-path\",\n        type=Path,\n        default=\"checkpoints/s2-pro\",\n    )\n    parser.add_argument(\n        \"--decoder-checkpoint-path\",\n        type=Path,\n        default=\"checkpoints/s2-pro/codec.pth\",\n    )\n    parser.add_argument(\"--decoder-config-name\", type=str, default=\"modded_dac_vq\")\n    parser.add_argument(\"--device\", type=str, default=\"cuda\")\n    parser.add_argument(\"--half\", action=\"store_true\")\n    parser.add_argument(\"--compile\", action=\"store_true\")\n    parser.add_argument(\"--max-gradio-length\", type=int, default=0)\n    parser.add_argument(\"--theme\", type=str, default=\"light\")\n\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n    args.precision = torch.half if args.half else torch.bfloat16\n\n    # Check if MPS or CUDA is available\n    if torch.backends.mps.is_available():\n        args.device = \"mps\"\n        logger.info(\"mps is available, running on mps.\")\n    elif torch.xpu.is_available():\n        args.device = \"xpu\"\n        logger.info(\"XPU is available, running on XPU.\")\n    elif not torch.cuda.is_available():\n        logger.info(\"CUDA is not available, running on CPU.\")\n        
args.device = \"cpu\"\n\n    logger.info(\"Loading Llama model...\")\n    llama_queue = launch_thread_safe_queue(\n        checkpoint_path=args.llama_checkpoint_path,\n        device=args.device,\n        precision=args.precision,\n        compile=args.compile,\n    )\n\n    logger.info(\"Loading VQ-GAN model...\")\n    decoder_model = load_decoder_model(\n        config_name=args.decoder_config_name,\n        checkpoint_path=args.decoder_checkpoint_path,\n        device=args.device,\n    )\n\n    logger.info(\"Decoder model loaded, warming up...\")\n\n    # Create the inference engine\n    inference_engine = TTSInferenceEngine(\n        llama_queue=llama_queue,\n        decoder_model=decoder_model,\n        compile=args.compile,\n        precision=args.precision,\n    )\n\n    # Dry run to check if the model is loaded correctly and avoid the first-time latency\n    list(\n        inference_engine.inference(\n            ServeTTSRequest(\n                text=\"Hello world.\",\n                references=[],\n                reference_id=None,\n                max_new_tokens=1024,\n                chunk_length=200,\n                top_p=0.7,\n                repetition_penalty=1.5,\n                temperature=0.7,\n                format=\"wav\",\n            )\n        )\n    )\n\n    logger.info(\"Warming up done, launching the web UI...\")\n\n    # Get the inference function with the immutable arguments\n    inference_fct = get_inference_wrapper(inference_engine)\n\n    app = build_app(inference_fct, args.theme)\n    app.launch()\n"
  },
  {
    "path": "tools/server/api_utils.py",
    "content": "from argparse import ArgumentParser\nfrom http import HTTPStatus\nfrom typing import Annotated, Any\n\nimport ormsgpack\nfrom baize.datastructures import ContentType\nfrom kui.asgi import (\n    HTTPException,\n    HttpRequest,\n    JSONResponse,\n    request,\n)\nfrom loguru import logger\nfrom pydantic import BaseModel\n\nfrom fish_speech.inference_engine import TTSInferenceEngine\nfrom fish_speech.utils.schema import ServeTTSRequest\nfrom tools.server.inference import inference_wrapper as inference\n\n\ndef parse_args():\n    parser = ArgumentParser()\n    parser.add_argument(\"--mode\", type=str, choices=[\"tts\"], default=\"tts\")\n    parser.add_argument(\n        \"--llama-checkpoint-path\",\n        type=str,\n        default=\"checkpoints/s2-pro\",\n    )\n    parser.add_argument(\n        \"--decoder-checkpoint-path\",\n        type=str,\n        default=\"checkpoints/s2-pro/codec.pth\",\n    )\n    parser.add_argument(\"--decoder-config-name\", type=str, default=\"modded_dac_vq\")\n    parser.add_argument(\"--device\", type=str, default=\"cuda\")\n    parser.add_argument(\"--half\", action=\"store_true\")\n    parser.add_argument(\"--compile\", action=\"store_true\")\n    parser.add_argument(\"--max-text-length\", type=int, default=0)\n    parser.add_argument(\"--listen\", type=str, default=\"127.0.0.1:8080\")\n    parser.add_argument(\"--workers\", type=int, default=1)\n    parser.add_argument(\"--api-key\", type=str, default=None)\n\n    return parser.parse_args()\n\n\nclass MsgPackRequest(HttpRequest):\n    async def data(\n        self,\n    ) -> Annotated[\n        Any,\n        ContentType(\"application/msgpack\"),\n        ContentType(\"application/json\"),\n        ContentType(\"multipart/form-data\"),\n    ]:\n        if self.content_type == \"application/msgpack\":\n            return ormsgpack.unpackb(await self.body)\n\n        elif self.content_type == \"application/json\":\n            return await self.json\n\n        elif 
self.content_type == \"multipart/form-data\":\n            return await self.form\n\n        raise HTTPException(\n            HTTPStatus.UNSUPPORTED_MEDIA_TYPE,\n            headers={\n                \"Accept\": \"application/msgpack, application/json, multipart/form-data\"\n            },\n        )\n\n\nasync def inference_async(req: ServeTTSRequest, engine: TTSInferenceEngine):\n    for chunk in inference(req, engine):\n        print(\"Got chunk\")\n        if isinstance(chunk, bytes):\n            yield chunk\n\n\nasync def buffer_to_async_generator(buffer):\n    yield buffer\n\n\ndef get_content_type(audio_format):\n    if audio_format == \"wav\":\n        return \"audio/wav\"\n    elif audio_format == \"flac\":\n        return \"audio/flac\"\n    elif audio_format == \"mp3\":\n        return \"audio/mpeg\"\n    elif audio_format == \"opus\":\n        return \"audio/ogg\"\n    else:\n        return \"application/octet-stream\"\n\n\ndef wants_json(req):\n    \"\"\"Helper method to determine if the client wants a JSON response\n\n    Parameters\n    ----------\n    req : Request\n        The request object\n\n    Returns\n    -------\n    bool\n        True if the client wants a JSON response, False otherwise\n    \"\"\"\n    q = req.query_params.get(\"format\", \"\").strip().lower()\n    if q in {\"json\", \"application/json\", \"msgpack\", \"application/msgpack\"}:\n        return q == \"json\"\n    accept = req.headers.get(\"Accept\", \"\").strip().lower()\n    return \"application/json\" in accept and \"application/msgpack\" not in accept\n\n\ndef format_response(response: BaseModel, status_code=200):\n    \"\"\"\n    Helper function to format responses consistently based on client preference.\n\n    Parameters\n    ----------\n    response : BaseModel\n        The response object to format\n    status_code : int\n        HTTP status code (default: 200)\n\n    Returns\n    -------\n    Response\n        Formatted response in the client's preferred format\n 
   \"\"\"\n    try:\n        if wants_json(request):\n            return JSONResponse(\n                response.model_dump(mode=\"json\"), status_code=status_code\n            )\n\n        return (\n            ormsgpack.packb(\n                response,\n                option=ormsgpack.OPT_SERIALIZE_PYDANTIC,\n            ),\n            status_code,\n            {\"Content-Type\": \"application/msgpack\"},\n        )\n    except Exception as e:\n        logger.error(f\"Error formatting response: {e}\", exc_info=True)\n        # Fallback to JSON response if formatting fails\n        return JSONResponse(\n            {\"error\": \"Response formatting failed\", \"details\": str(e)}, status_code=500\n        )\n"
  },
  {
    "path": "tools/server/exception_handler.py",
    "content": "import traceback\nfrom http import HTTPStatus\n\nfrom kui.asgi import HTTPException, JSONResponse\n\n\nclass ExceptionHandler:\n\n    async def http_exception_handler(self, exc: HTTPException):\n        return JSONResponse(\n            dict(\n                statusCode=exc.status_code,\n                message=exc.content,\n                error=HTTPStatus(exc.status_code).phrase,\n            ),\n            exc.status_code,\n            exc.headers,\n        )\n\n    async def other_exception_handler(self, exc: Exception):\n        traceback.print_exc()\n\n        status = HTTPStatus.INTERNAL_SERVER_ERROR\n        return JSONResponse(\n            dict(statusCode=status, message=str(exc), error=status.phrase),\n            status,\n        )\n"
  },
  {
    "path": "tools/server/inference.py",
    "content": "from http import HTTPStatus\n\nimport numpy as np\nfrom kui.asgi import HTTPException\n\nfrom fish_speech.inference_engine import TTSInferenceEngine\nfrom fish_speech.utils.schema import ServeTTSRequest\n\nAMPLITUDE = 32768  # Needs an explaination\n\n\ndef inference_wrapper(req: ServeTTSRequest, engine: TTSInferenceEngine):\n    \"\"\"\n    Wrapper for the inference function.\n    Used in the API server.\n    \"\"\"\n    count = 0\n    for result in engine.inference(req):\n        match result.code:\n            case \"header\":\n                if isinstance(result.audio, tuple):\n                    yield result.audio[1]\n\n            case \"error\":\n                raise HTTPException(\n                    HTTPStatus.INTERNAL_SERVER_ERROR,\n                    content=str(result.error),\n                )\n\n            case \"segment\":\n                count += 1\n                if isinstance(result.audio, tuple):\n                    yield (result.audio[1] * AMPLITUDE).astype(np.int16).tobytes()\n\n            case \"final\":\n                count += 1\n                if isinstance(result.audio, tuple):\n                    yield result.audio[1]\n                return None  # Stop the generator\n\n    if count == 0:\n        raise HTTPException(\n            HTTPStatus.INTERNAL_SERVER_ERROR,\n            content=\"No audio generated, please check the input text.\",\n        )\n"
  },
  {
    "path": "tools/server/model_manager.py",
    "content": "import torch\nfrom loguru import logger\n\nfrom fish_speech.inference_engine import TTSInferenceEngine\nfrom fish_speech.models.dac.inference import load_model as load_decoder_model\nfrom fish_speech.models.text2semantic.inference import launch_thread_safe_queue\nfrom fish_speech.utils.schema import ServeTTSRequest\nfrom tools.server.inference import inference_wrapper as inference\n\n\nclass ModelManager:\n    def __init__(\n        self,\n        mode: str,\n        device: str,\n        half: bool,\n        compile: bool,\n        llama_checkpoint_path: str,\n        decoder_checkpoint_path: str,\n        decoder_config_name: str,\n    ) -> None:\n\n        self.mode = mode\n        self.device = device\n        self.half = half\n        self.compile = compile\n\n        self.precision = torch.half if half else torch.bfloat16\n\n        # Check if MPS or CUDA is available\n        if torch.backends.mps.is_available():\n            self.device = \"mps\"\n            logger.info(\"mps is available, running on mps.\")\n        elif not torch.cuda.is_available():\n            self.device = \"cpu\"\n            logger.info(\"CUDA is not available, running on CPU.\")\n\n        # Load the TTS models\n        self.load_llama_model(\n            llama_checkpoint_path, self.device, self.precision, self.compile, self.mode\n        )\n        self.load_decoder_model(\n            decoder_config_name, decoder_checkpoint_path, self.device\n        )\n        self.tts_inference_engine = TTSInferenceEngine(\n            llama_queue=self.llama_queue,\n            decoder_model=self.decoder_model,\n            precision=self.precision,\n            compile=self.compile,\n        )\n\n        # Warm up the models\n        if self.mode == \"tts\":\n            self.warm_up(self.tts_inference_engine)\n\n    def load_llama_model(\n        self, checkpoint_path, device, precision, compile, mode\n    ) -> None:\n\n        if mode == \"tts\":\n            
self.llama_queue = launch_thread_safe_queue(\n                checkpoint_path=checkpoint_path,\n                device=device,\n                precision=precision,\n                compile=compile,\n            )\n        else:\n            raise ValueError(f\"Invalid mode: {mode}\")\n\n        logger.info(\"LLAMA model loaded.\")\n\n    def load_decoder_model(self, config_name, checkpoint_path, device) -> None:\n        self.decoder_model = load_decoder_model(\n            config_name=config_name,\n            checkpoint_path=checkpoint_path,\n            device=device,\n        )\n        logger.info(\"Decoder model loaded.\")\n\n    def warm_up(self, tts_inference_engine) -> None:\n        request = ServeTTSRequest(\n            text=\"Hello world.\",\n            references=[],\n            reference_id=None,\n            max_new_tokens=1024,\n            chunk_length=200,\n            top_p=0.7,\n            repetition_penalty=1.2,\n            temperature=0.7,\n            format=\"wav\",\n        )\n        list(inference(request, tts_inference_engine))\n        logger.info(\"Models warmed up.\")\n"
  },
  {
    "path": "tools/server/model_utils.py",
    "content": "import io\nimport re\n\nimport librosa\nimport torch\nimport torchaudio\nfrom cachetools import LRUCache, cached\n\nCACHE_MAXSIZE = 10000\nMICRO_BATCH_SIZE = 8\nASR_SAMPLE_RATE = 16000\nHUGE_GAP_THRESHOLD = 4000\n\n\n@torch.no_grad()\n@torch.autocast(device_type=\"cuda\", dtype=torch.half)\ndef batch_encode(model, audios_list: list[bytes]):\n    # Get sample rate from model\n    if hasattr(model, \"spec_transform\"):\n        sample_rate = model.spec_transform.sample_rate\n    else:\n        sample_rate = model.sample_rate\n\n    audios: list[torch.Tensor] = [\n        (\n            torch.from_numpy(librosa.load(io.BytesIO(audio), sr=sample_rate)[0])[None]\n            if isinstance(audio, bytes)\n            else audio\n        )\n        for audio in audios_list\n    ]\n\n    lengths = torch.tensor([audio.shape[-1] for audio in audios], device=model.device)\n    max_length = lengths.max().item()\n\n    print(f\"Encode max length: {max_length / sample_rate:.2f}s\")\n\n    padded = torch.stack(\n        [\n            torch.nn.functional.pad(audio, (0, int(max_length - audio.shape[-1])))\n            for audio in audios\n        ]\n    ).to(model.device)\n\n    features, feature_lengths = model.encode(padded, audio_lengths=lengths)\n    features, feature_lengths = features.cpu(), feature_lengths.cpu()\n\n    return [feature[..., :length] for feature, length in zip(features, feature_lengths)]\n\n\n@cached(\n    cache=LRUCache(maxsize=CACHE_MAXSIZE),\n    key=lambda model, audios: (model.device, tuple(audios)),\n)\ndef cached_vqgan_batch_encode(model, audios: list[bytes]):\n    return batch_encode(model, audios)\n\n\n@torch.no_grad()\n@torch.autocast(device_type=\"cuda\", dtype=torch.half)\ndef batch_vqgan_decode(model, features):\n    lengths = torch.tensor(\n        [feature.shape[-1] for feature in features], device=model.device\n    )\n    max_length = lengths.max().item()\n    padded = torch.stack(\n        [\n            
torch.nn.functional.pad(feature, (0, max_length - feature.shape[-1]))\n            for feature in features\n        ]\n    ).to(model.device)\n\n    # If bs too large, we do micro batch decode\n    audios, audio_lengths = [], []\n    for i in range(0, padded.shape[0], MICRO_BATCH_SIZE):\n        audio, audio_length = model.decode(\n            padded[i : i + MICRO_BATCH_SIZE],\n            feature_lengths=lengths[i : i + MICRO_BATCH_SIZE],\n        )\n        audios.append(audio)\n        audio_lengths.append(audio_length)\n    audios = torch.cat(audios, dim=0)\n    audio_lengths = torch.cat(audio_lengths, dim=0)\n    audios, audio_lengths = audios.cpu(), audio_lengths.cpu()\n\n    return [audio[..., :length].numpy() for audio, length in zip(audios, audio_lengths)]\n"
  },
  {
    "path": "tools/server/views.py",
    "content": "import io\nimport os\nimport re\nimport shutil\nimport tempfile\nimport time\nfrom http import HTTPStatus\nfrom pathlib import Path\n\nimport numpy as np\nimport ormsgpack\nimport soundfile as sf\nimport torch\nfrom kui.asgi import (\n    Body,\n    HTTPException,\n    HttpView,\n    JSONResponse,\n    Routes,\n    StreamResponse,\n    UploadFile,\n    request,\n)\nfrom loguru import logger\nfrom typing_extensions import Annotated\n\nfrom fish_speech.utils.schema import (\n    AddReferenceRequest,\n    AddReferenceResponse,\n    DeleteReferenceResponse,\n    ListReferencesResponse,\n    ServeTTSRequest,\n    ServeVQGANDecodeRequest,\n    ServeVQGANDecodeResponse,\n    ServeVQGANEncodeRequest,\n    ServeVQGANEncodeResponse,\n    UpdateReferenceResponse,\n)\nfrom tools.server.api_utils import (\n    buffer_to_async_generator,\n    format_response,\n    get_content_type,\n    inference_async,\n)\nfrom tools.server.inference import inference_wrapper as inference\nfrom tools.server.model_manager import ModelManager\nfrom tools.server.model_utils import (\n    batch_vqgan_decode,\n    cached_vqgan_batch_encode,\n)\n\nMAX_NUM_SAMPLES = int(os.getenv(\"NUM_SAMPLES\", 1))\n\n_WEBUI_HTML = (\n    Path(__file__).parent.parent.parent / \"awesome_webui\" / \"dist\" / \"index.html\"\n)\n\nroutes = Routes()\n\n\n@routes.http(\"/ui\")\nclass WebUI(HttpView):\n    @classmethod\n    async def get(cls):\n        from kui.asgi import HTMLResponse\n\n        if _WEBUI_HTML.exists():\n            return HTMLResponse(_WEBUI_HTML.read_text(encoding=\"utf-8\"))\n        return JSONResponse(\n            {\"error\": \"WebUI not built. 
Run: cd awesome_webui && npm run build\"},\n            status_code=404,\n        )\n\n\n@routes.http(\"/v1/health\")\nclass Health(HttpView):\n    @classmethod\n    async def get(cls):\n        return JSONResponse({\"status\": \"ok\"})\n\n    @classmethod\n    async def post(cls):\n        return JSONResponse({\"status\": \"ok\"})\n\n\n@routes.http.post(\"/v1/vqgan/encode\")\nasync def vqgan_encode(req: Annotated[ServeVQGANEncodeRequest, Body(exclusive=True)]):\n    \"\"\"\n    Encode audio using VQGAN model.\n    \"\"\"\n    try:\n        # Get the model from the app\n        model_manager: ModelManager = request.app.state.model_manager\n        decoder_model = model_manager.decoder_model\n\n        # Encode the audio\n        start_time = time.time()\n        tokens = cached_vqgan_batch_encode(decoder_model, req.audios)\n        logger.info(\n            f\"[EXEC] VQGAN encode time: {(time.time() - start_time) * 1000:.2f}ms\"\n        )\n\n        # Return the response\n        return ormsgpack.packb(\n            ServeVQGANEncodeResponse(tokens=[i.tolist() for i in tokens]),\n            option=ormsgpack.OPT_SERIALIZE_PYDANTIC,\n        )\n    except Exception as e:\n        logger.error(f\"Error in VQGAN encode: {e}\", exc_info=True)\n        raise HTTPException(\n            HTTPStatus.INTERNAL_SERVER_ERROR, content=\"Failed to encode audio\"\n        )\n\n\n@routes.http.post(\"/v1/vqgan/decode\")\nasync def vqgan_decode(req: Annotated[ServeVQGANDecodeRequest, Body(exclusive=True)]):\n    \"\"\"\n    Decode tokens to audio using VQGAN model.\n    \"\"\"\n    try:\n        # Get the model from the app\n        model_manager: ModelManager = request.app.state.model_manager\n        decoder_model = model_manager.decoder_model\n\n        # Decode the audio\n        tokens = [torch.tensor(token, dtype=torch.int) for token in req.tokens]\n        start_time = time.time()\n        audios = batch_vqgan_decode(decoder_model, tokens)\n        logger.info(\n            
f\"[EXEC] VQGAN decode time: {(time.time() - start_time) * 1000:.2f}ms\"\n        )\n        audios = [audio.astype(np.float16).tobytes() for audio in audios]\n\n        # Return the response\n        return ormsgpack.packb(\n            ServeVQGANDecodeResponse(audios=audios),\n            option=ormsgpack.OPT_SERIALIZE_PYDANTIC,\n        )\n    except Exception as e:\n        logger.error(f\"Error in VQGAN decode: {e}\", exc_info=True)\n        raise HTTPException(\n            HTTPStatus.INTERNAL_SERVER_ERROR, content=\"Failed to decode tokens to audio\"\n        )\n\n\n@routes.http.post(\"/v1/tts\")\nasync def tts(req: Annotated[ServeTTSRequest, Body(exclusive=True)]):\n    \"\"\"\n    Generate speech from text using TTS model.\n    \"\"\"\n    try:\n        # Get the model from the app\n        app_state = request.app.state\n        model_manager: ModelManager = app_state.model_manager\n        engine = model_manager.tts_inference_engine\n        sample_rate = engine.decoder_model.sample_rate\n\n        # Check if the text is too long\n        if app_state.max_text_length > 0 and len(req.text) > app_state.max_text_length:\n            raise HTTPException(\n                HTTPStatus.BAD_REQUEST,\n                content=f\"Text is too long, max length is {app_state.max_text_length}\",\n            )\n\n        # Check if streaming is enabled\n        if req.streaming and req.format != \"wav\":\n            raise HTTPException(\n                HTTPStatus.BAD_REQUEST,\n                content=\"Streaming only supports WAV format\",\n            )\n\n        # Perform TTS\n        if req.streaming:\n            return StreamResponse(\n                iterable=inference_async(req, engine),\n                headers={\n                    \"Content-Disposition\": f\"attachment; filename=audio.{req.format}\",\n                },\n                content_type=get_content_type(req.format),\n            )\n        else:\n            fake_audios = next(inference(req, 
engine))\n            buffer = io.BytesIO()\n            sf.write(\n                buffer,\n                fake_audios,\n                sample_rate,\n                format=req.format,\n            )\n\n            return StreamResponse(\n                iterable=buffer_to_async_generator(buffer.getvalue()),\n                headers={\n                    \"Content-Disposition\": f\"attachment; filename=audio.{req.format}\",\n                },\n                content_type=get_content_type(req.format),\n            )\n    except HTTPException:\n        # Re-raise HTTP exceptions as they are already properly formatted\n        raise\n    except Exception as e:\n        logger.error(f\"Error in TTS generation: {e}\", exc_info=True)\n        raise HTTPException(\n            HTTPStatus.INTERNAL_SERVER_ERROR, content=\"Failed to generate speech\"\n        )\n\n\n@routes.http.post(\"/v1/references/add\")\nasync def add_reference(\n    id: str = Body(...), audio: UploadFile = Body(...), text: str = Body(...)\n):\n    \"\"\"\n    Add a new reference voice with audio file and text.\n    \"\"\"\n    temp_file_path = None\n\n    try:\n        # Validate input parameters\n        if not id or not id.strip():\n            raise ValueError(\"Reference ID cannot be empty\")\n\n        if not text or not text.strip():\n            raise ValueError(\"Reference text cannot be empty\")\n\n        # Get the model manager to access the reference loader\n        app_state = request.app.state\n        model_manager: ModelManager = app_state.model_manager\n        engine = model_manager.tts_inference_engine\n\n        # Read the uploaded audio file\n        audio_content = audio.read()\n        if not audio_content:\n            raise ValueError(\"Audio file is empty or could not be read\")\n\n        # Create a temporary file for the audio data\n        with tempfile.NamedTemporaryFile(delete=False, suffix=\".wav\") as temp_file:\n            temp_file.write(audio_content)\n         
   temp_file_path = temp_file.name\n\n        # Add the reference using the engine's reference loader\n        engine.add_reference(id, temp_file_path, text)\n\n        response = AddReferenceResponse(\n            success=True,\n            message=f\"Reference voice '{id}' added successfully\",\n            reference_id=id,\n        )\n        return format_response(response)\n\n    except FileExistsError as e:\n        logger.warning(f\"Reference ID '{id}' already exists: {e}\")\n        response = AddReferenceResponse(\n            success=False,\n            message=f\"Reference ID '{id}' already exists\",\n            reference_id=id,\n        )\n        return format_response(response, status_code=409)  # Conflict\n\n    except ValueError as e:\n        logger.warning(f\"Invalid input for reference '{id}': {e}\")\n        response = AddReferenceResponse(success=False, message=str(e), reference_id=id)\n        return format_response(response, status_code=400)\n\n    except (FileNotFoundError, OSError) as e:\n        logger.error(f\"File system error for reference '{id}': {e}\")\n        response = AddReferenceResponse(\n            success=False, message=\"File system error occurred\", reference_id=id\n        )\n        return format_response(response, status_code=500)\n\n    except Exception as e:\n        logger.error(f\"Unexpected error adding reference '{id}': {e}\", exc_info=True)\n        response = AddReferenceResponse(\n            success=False, message=\"Internal server error occurred\", reference_id=id\n        )\n        return format_response(response, status_code=500)\n\n    finally:\n        # Clean up temporary file\n        if temp_file_path and os.path.exists(temp_file_path):\n            try:\n                os.unlink(temp_file_path)\n            except OSError as e:\n                logger.warning(\n                    f\"Failed to clean up temporary file {temp_file_path}: {e}\"\n                
)\n\n\n@routes.http.get(\"/v1/references/list\")\nasync def list_references():\n    \"\"\"\n    Get a list of all available reference voice IDs.\n    \"\"\"\n    try:\n        # Get the model manager to access the reference loader\n        app_state = request.app.state\n        model_manager: ModelManager = app_state.model_manager\n        engine = model_manager.tts_inference_engine\n\n        # Get the list of reference IDs\n        reference_ids = engine.list_reference_ids()\n\n        response = ListReferencesResponse(\n            success=True,\n            reference_ids=reference_ids,\n            message=f\"Found {len(reference_ids)} reference voices\",\n        )\n        return format_response(response)\n\n    except Exception as e:\n        logger.error(f\"Unexpected error listing references: {e}\", exc_info=True)\n        response = ListReferencesResponse(\n            success=False, reference_ids=[], message=\"Internal server error occurred\"\n        )\n        return format_response(response, status_code=500)\n\n\n@routes.http.delete(\"/v1/references/delete\")\nasync def delete_reference(reference_id: str = Body(...)):\n    \"\"\"\n    Delete a reference voice by ID.\n    \"\"\"\n    try:\n        # Validate input parameters\n        if not reference_id or not reference_id.strip():\n            raise ValueError(\"Reference ID cannot be empty\")\n\n        # Get the model manager to access the reference loader\n        app_state = request.app.state\n        model_manager: ModelManager = app_state.model_manager\n        engine = model_manager.tts_inference_engine\n\n        # Delete the reference using the engine's reference loader\n        engine.delete_reference(reference_id)\n\n        response = DeleteReferenceResponse(\n            success=True,\n            message=f\"Reference voice '{reference_id}' deleted successfully\",\n            reference_id=reference_id,\n        )\n        return format_response(response)\n\n    except FileNotFoundError 
as e:\n        logger.warning(f\"Reference ID '{reference_id}' not found: {e}\")\n        response = DeleteReferenceResponse(\n            success=False,\n            message=f\"Reference ID '{reference_id}' not found\",\n            reference_id=reference_id,\n        )\n        return format_response(response, status_code=404)  # Not Found\n\n    except ValueError as e:\n        logger.warning(f\"Invalid input for reference '{reference_id}': {e}\")\n        response = DeleteReferenceResponse(\n            success=False, message=str(e), reference_id=reference_id\n        )\n        return format_response(response, status_code=400)\n\n    except OSError as e:\n        logger.error(f\"File system error deleting reference '{reference_id}': {e}\")\n        response = DeleteReferenceResponse(\n            success=False,\n            message=\"File system error occurred\",\n            reference_id=reference_id,\n        )\n        return format_response(response, status_code=500)\n\n    except Exception as e:\n        logger.error(\n            f\"Unexpected error deleting reference '{reference_id}': {e}\", exc_info=True\n        )\n        response = DeleteReferenceResponse(\n            success=False,\n            message=\"Internal server error occurred\",\n            reference_id=reference_id,\n        )\n        return format_response(response, status_code=500)\n\n\n@routes.http.post(\"/v1/references/update\")\nasync def update_reference(\n    old_reference_id: str = Body(...), new_reference_id: str = Body(...)\n):\n    \"\"\"\n    Rename a reference voice directory from old_reference_id to new_reference_id.\n    \"\"\"\n    try:\n        # Validate input parameters\n        if not old_reference_id or not old_reference_id.strip():\n            raise ValueError(\"Old reference ID cannot be empty\")\n        if not new_reference_id or not new_reference_id.strip():\n            raise ValueError(\"New reference ID cannot be empty\")\n        if old_reference_id == 
new_reference_id:\n            raise ValueError(\"New reference ID must be different from old reference ID\")\n\n        # Validate ID format per ReferenceLoader rules\n        id_pattern = r\"^[a-zA-Z0-9\\-_ ]+$\"\n        if not re.match(id_pattern, new_reference_id) or len(new_reference_id) > 255:\n            raise ValueError(\n                \"New reference ID contains invalid characters or is too long\"\n            )\n\n        # Access engine to update caches after renaming\n        app_state = request.app.state\n        model_manager: ModelManager = app_state.model_manager\n        engine = model_manager.tts_inference_engine\n\n        refs_base = Path(\"references\")\n        old_dir = refs_base / old_reference_id\n        new_dir = refs_base / new_reference_id\n\n        # Existence checks\n        if not old_dir.exists() or not old_dir.is_dir():\n            raise FileNotFoundError(f\"Reference ID '{old_reference_id}' not found\")\n        if new_dir.exists():\n            # Conflict: destination already exists\n            response = UpdateReferenceResponse(\n                success=False,\n                message=f\"Reference ID '{new_reference_id}' already exists\",\n                old_reference_id=old_reference_id,\n                new_reference_id=new_reference_id,\n            )\n            return format_response(response, status_code=409)\n\n        # Perform rename\n        old_dir.rename(new_dir)\n\n        # Update in-memory cache key if present\n        if old_reference_id in engine.ref_by_id:\n            engine.ref_by_id[new_reference_id] = engine.ref_by_id.pop(old_reference_id)\n\n        response = UpdateReferenceResponse(\n            success=True,\n            message=(\n                f\"Reference voice renamed from '{old_reference_id}' to '{new_reference_id}' successfully\"\n            ),\n            old_reference_id=old_reference_id,\n            new_reference_id=new_reference_id,\n        )\n        return 
format_response(response)\n\n    except FileNotFoundError as e:\n        logger.warning(str(e))\n        response = UpdateReferenceResponse(\n            success=False,\n            message=str(e),\n            old_reference_id=old_reference_id,\n            new_reference_id=new_reference_id,\n        )\n        return format_response(response, status_code=404)\n\n    except ValueError as e:\n        logger.warning(f\"Invalid input for update reference: {e}\")\n        response = UpdateReferenceResponse(\n            success=False,\n            message=str(e),\n            old_reference_id=old_reference_id if \"old_reference_id\" in locals() else \"\",\n            new_reference_id=new_reference_id if \"new_reference_id\" in locals() else \"\",\n        )\n        return format_response(response, status_code=400)\n\n    except OSError as e:\n        logger.error(f\"File system error renaming reference: {e}\")\n        response = UpdateReferenceResponse(\n            success=False,\n            message=\"File system error occurred\",\n            old_reference_id=old_reference_id,\n            new_reference_id=new_reference_id,\n        )\n        return format_response(response, status_code=500)\n\n    except Exception as e:\n        logger.error(f\"Unexpected error updating reference: {e}\", exc_info=True)\n        response = UpdateReferenceResponse(\n            success=False,\n            message=\"Internal server error occurred\",\n            old_reference_id=old_reference_id if \"old_reference_id\" in locals() else \"\",\n            new_reference_id=new_reference_id if \"new_reference_id\" in locals() else \"\",\n        )\n        return format_response(response, status_code=500)\n"
  },
  {
    "path": "tools/vqgan/create_train_split.py",
    "content": "import math\nfrom pathlib import Path\nfrom random import Random\n\nimport click\nfrom loguru import logger\nfrom pydub import AudioSegment\nfrom tqdm import tqdm\n\nfrom fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist\n\n\n@click.command()\n@click.argument(\"root\", type=click.Path(exists=True, path_type=Path))\n@click.option(\"--val-ratio\", type=float, default=None)\n@click.option(\"--val-count\", type=int, default=None)\n@click.option(\"--filelist\", default=None, type=Path)\n@click.option(\"--min-duration\", default=None, type=float)\n@click.option(\"--max-duration\", default=None, type=float)\ndef main(root, val_ratio, val_count, filelist, min_duration, max_duration):\n    if filelist:\n        files = [i[0] for i in load_filelist(filelist)]\n    else:\n        files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)\n\n    if min_duration is None and max_duration is None:\n        filtered_files = list(map(str, [file.relative_to(root) for file in files]))\n    else:\n        filtered_files = []\n        for file in tqdm(files):\n            try:\n                audio = AudioSegment.from_file(str(file))\n                duration = len(audio) / 1000.0\n\n                if min_duration is not None and duration < min_duration:\n                    logger.info(\n                        f\"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}\"\n                    )\n                    continue\n\n                if max_duration is not None and duration > max_duration:\n                    logger.info(\n                        f\"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}\"\n                    )\n                    continue\n\n                filtered_files.append(str(file.relative_to(root)))\n            except Exception as e:\n                logger.info(f\"Error processing {file}: {e}\")\n\n    logger.info(\n        f\"Found {len(files)} files, remaining 
{len(filtered_files)} files after filtering\"\n    )\n\n    Random(42).shuffle(filtered_files)\n\n    if val_count is None and val_ratio is None:\n        logger.info(\"Validation ratio and count not specified, using min(20%, 100)\")\n        val_size = min(100, math.ceil(len(filtered_files) * 0.2))\n    elif val_count is not None and val_ratio is not None:\n        logger.error(\"Cannot specify both val_count and val_ratio\")\n        return\n    elif val_count is not None:\n        if val_count < 1 or val_count > len(filtered_files):\n            logger.error(\"val_count must be between 1 and number of files\")\n            return\n        val_size = val_count\n    else:\n        val_size = math.ceil(len(filtered_files) * val_ratio)\n\n    logger.info(f\"Using {val_size} files for validation\")\n\n    with open(root / \"vq_train_filelist.txt\", \"w\", encoding=\"utf-8\") as f:\n        f.write(\"\\n\".join(filtered_files[val_size:]))\n\n    with open(root / \"vq_val_filelist.txt\", \"w\", encoding=\"utf-8\") as f:\n        f.write(\"\\n\".join(filtered_files[:val_size]))\n\n    logger.info(\"Done\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/vqgan/extract_vq.py",
    "content": "import os\nimport subprocess as sp\nimport sys\nimport time\nfrom datetime import timedelta\nfrom functools import lru_cache\nfrom pathlib import Path\nfrom random import Random\n\nimport click\nimport numpy as np\nimport torch\nimport torchaudio\nfrom hydra import compose, initialize\nfrom hydra.utils import instantiate\nfrom loguru import logger\nfrom omegaconf import OmegaConf\n\nfrom fish_speech.utils.file import AUDIO_EXTENSIONS, list_files, load_filelist\n\n# register eval resolver\nOmegaConf.register_new_resolver(\"eval\", eval)\n# This file is used to convert the audio files to VQ index files (.npy) using the codec model.\n# It's mainly used to generate the training data for the VQ model.\n\nbackends = torchaudio.list_audio_backends()\n\nif \"ffmpeg\" in backends:\n    backend = \"ffmpeg\"\nelse:\n    backend = \"soundfile\"\n\nRANK = int(os.environ.get(\"SLURM_PROCID\", 0))\nWORLD_SIZE = int(os.environ.get(\"SLURM_NTASKS\", 1))\n\nlogger_format = (\n    \"<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | \"\n    \"<level>{level: <8}</level> | \"\n    \"<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | \"\n    \"{extra[rank]} - <level>{message}</level>\"\n)\nlogger.configure(extra={\"rank\": f\"RANK: {RANK} / {WORLD_SIZE}\"})\nlogger.remove()\nlogger.add(sys.stderr, format=logger_format)\n\n\n@lru_cache(maxsize=1)\ndef get_model(\n    config_name: str = \"modded_dac_vq\",\n    checkpoint_path: str = \"checkpoints/openaudio-s1-mini/codec.pth\",\n    device: str | torch.device = \"cuda\",\n):\n    with initialize(version_base=\"1.3\", config_path=\"../../fish_speech/configs\"):\n        cfg = compose(config_name=config_name)\n\n    model = instantiate(cfg)\n    state_dict = torch.load(\n        checkpoint_path,\n        map_location=device,\n    )\n    if \"state_dict\" in state_dict:\n        state_dict = state_dict[\"state_dict\"]\n\n    if any(\"generator\" in k for k in state_dict):\n        state_dict = {\n            
k.replace(\"generator.\", \"\"): v\n            for k, v in state_dict.items()\n            if \"generator.\" in k\n        }\n\n    model.load_state_dict(state_dict, strict=False)\n    model.eval()\n    model.to(device)\n\n    logger.info(f\"Loaded model\")\n    return model\n\n\n@torch.inference_mode()\ndef process_batch(files: list[Path], model) -> float:\n    wavs = []\n    audio_lengths = []\n    new_files = []\n    max_length = total_time = 0\n\n    for file in files:\n        try:\n            wav, sr = torchaudio.load(\n                str(file), backend=backend\n            )  # Need to install libsox-dev\n        except Exception as e:\n            logger.error(f\"Error reading {file}: {e}\")\n            continue\n\n        if wav.shape[0] > 1:\n            wav = wav.mean(dim=0, keepdim=True)\n\n        wav = torchaudio.functional.resample(wav.cuda(), sr, model.sample_rate)[0]\n        total_time += len(wav) / model.sample_rate\n        max_length = max(max_length, len(wav))\n\n        wavs.append(wav)\n        audio_lengths.append(len(wav))\n        new_files.append(file)\n\n    files = new_files\n\n    # Pad to max length\n    for i, wav in enumerate(wavs):\n        wavs[i] = torch.nn.functional.pad(wav, (0, max_length - len(wav)), \"constant\")\n\n    audios = torch.stack(wavs, dim=0)[:, None]\n    audio_lengths = torch.tensor(audio_lengths, device=model.device, dtype=torch.long)\n\n    # Calculate lengths\n    indices, feature_lengths = model.encode(audios, audio_lengths)\n\n    # Save to disk\n    outputs = indices.cpu().numpy()\n\n    for file, length, feature, audio_length in zip(\n        files, feature_lengths, outputs, audio_lengths\n    ):\n        feature = feature[:, :length]\n\n        # (T,)\n        with open(file.with_suffix(\".npy\"), \"wb\") as f:\n            np.save(f, feature)\n\n    return total_time\n\n\n@click.command()\n@click.argument(\"folder\")\n@click.option(\"--num-workers\", default=1)\n@click.option(\"--config-name\", 
default=\"modded_dac_vq\")\n@click.option(\n    \"--checkpoint-path\",\n    default=\"checkpoints/s2-pro/codec.pth\",\n)\n@click.option(\"--batch-size\", default=64)\n@click.option(\"--filelist\", default=None, type=Path)\ndef main(\n    folder: str,\n    num_workers: int,\n    config_name: str,\n    checkpoint_path: str,\n    batch_size: int,\n    filelist: Path,\n):\n    if num_workers > 1 and WORLD_SIZE != num_workers:\n        assert WORLD_SIZE == 1, \"You should either use SLURM or this launcher, not both\"\n\n        logger.info(f\"Spawning {num_workers} workers\")\n\n        if torch.cuda.is_available():\n            visible_devices = os.environ.get(\"CUDA_VISIBLE_DEVICES\", None)\n            if visible_devices is None:\n                visible_devices = list(range(torch.cuda.device_count()))\n            else:\n                visible_devices = visible_devices.split(\",\")\n        else:\n            # Set to empty string to avoid using GPU\n            visible_devices = [\"\"]\n\n        processes = []\n        for i in range(num_workers):\n            env = os.environ.copy()\n            env[\"CUDA_VISIBLE_DEVICES\"] = str(visible_devices[i % len(visible_devices)])\n            env[\"SLURM_PROCID\"] = str(i)\n            env[\"SLURM_NTASKS\"] = str(num_workers)\n\n            processes.append(\n                sp.Popen(\n                    [sys.executable] + sys.argv.copy(),\n                    env=env,\n                )\n            )\n\n        for p in processes:\n            p.wait()\n\n        logger.info(f\"All workers finished\")\n        return\n\n    # This is a worker\n    logger.info(f\"Starting worker\")\n    if filelist:\n        files = [i[0] for i in load_filelist(filelist)]\n    else:\n        files = list_files(folder, AUDIO_EXTENSIONS, recursive=True, sort=False)\n\n    print(f\"Found {len(files)} files\")\n    files = [Path(f) for f in files if not Path(f).with_suffix(\".npy\").exists()]\n\n    total_files = len(files)\n    files = 
files[RANK::WORLD_SIZE]\n    logger.info(f\"Processing {len(files)}/{total_files} files\")\n\n    # Batch processing\n    total_time = 0\n    begin_time = time.time()\n    processed_files = 0\n    model = get_model(config_name, checkpoint_path)\n\n    for n_batch, idx in enumerate(range(0, len(files), batch_size)):\n        batch = files[idx : idx + batch_size]\n        batch_time = process_batch(batch, model)\n\n        total_time += batch_time\n        processed_files += len(batch)\n\n        if (n_batch + 1) % 10 == 0:\n            eta = (\n                (time.time() - begin_time)\n                / processed_files\n                * (len(files) - processed_files)\n            )\n            logger.info(\n                f\"Processed {processed_files} files, {total_time / 3600:.2f} hours of audio, \"\n                + f\"ETA: {timedelta(seconds=round(eta))}s\"\n            )\n\n    logger.info(\n        f\"Finished processing {len(files)} files, {total_time / 3600:.2f} hours of audio\"\n    )\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "tools/webui/__init__.py",
    "content": "from typing import Callable\n\nimport gradio as gr\n\nfrom fish_speech.i18n import i18n\nfrom tools.webui.variables import HEADER_MD, TEXTBOX_PLACEHOLDER\n\n\ndef build_app(inference_fct: Callable, theme: str = \"light\") -> gr.Blocks:\n    with gr.Blocks(theme=gr.themes.Base()) as app:\n        gr.Markdown(HEADER_MD)\n\n        # Use light theme by default\n        app.load(\n            None,\n            None,\n            js=\"() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}\"\n            % theme,\n        )\n\n        # Inference\n        with gr.Row():\n            with gr.Column(scale=3):\n                text = gr.Textbox(\n                    label=i18n(\"Input Text\"), placeholder=TEXTBOX_PLACEHOLDER, lines=10\n                )\n\n                with gr.Row():\n                    with gr.Column():\n                        with gr.Tab(label=i18n(\"Advanced Config\")):\n                            with gr.Row():\n                                chunk_length = gr.Slider(\n                                    label=i18n(\"Iterative Prompt Length, 0 means off\"),\n                                    minimum=100,\n                                    maximum=400,\n                                    value=300,\n                                    step=8,\n                                )\n\n                                max_new_tokens = gr.Slider(\n                                    label=i18n(\n                                        \"Maximum tokens per batch, 0 means no limit\"\n                                    ),\n                                    minimum=0,\n                                    maximum=2048,\n                                    value=0,\n                                    step=8,\n                                )\n\n                            with gr.Row():\n                                
top_p = gr.Slider(\n                                    label=\"Top-P\",\n                                    minimum=0.7,\n                                    maximum=0.95,\n                                    value=0.8,\n                                    step=0.01,\n                                )\n\n                                repetition_penalty = gr.Slider(\n                                    label=i18n(\"Repetition Penalty\"),\n                                    minimum=1,\n                                    maximum=1.2,\n                                    value=1.1,\n                                    step=0.01,\n                                )\n\n                            with gr.Row():\n                                temperature = gr.Slider(\n                                    label=\"Temperature\",\n                                    minimum=0.7,\n                                    maximum=1.0,\n                                    value=0.8,\n                                    step=0.01,\n                                )\n                                seed = gr.Number(\n                                    label=\"Seed\",\n                                    info=\"0 means randomized inference, otherwise deterministic\",\n                                    value=0,\n                                )\n\n                        with gr.Tab(label=i18n(\"Reference Audio\")):\n                            with gr.Row():\n                                gr.Markdown(\n                                    i18n(\n                                        \"5 to 10 seconds of reference audio, useful for specifying speaker.\"\n                                    )\n                                )\n                            with gr.Row():\n                                reference_id = gr.Textbox(\n                                    label=i18n(\"Reference ID\"),\n                                    placeholder=\"Leave empty to use uploaded 
references\",\n                                )\n\n                            with gr.Row():\n                                use_memory_cache = gr.Radio(\n                                    label=i18n(\"Use Memory Cache\"),\n                                    choices=[\"on\", \"off\"],\n                                    value=\"on\",\n                                )\n\n                            with gr.Row():\n                                reference_audio = gr.Audio(\n                                    label=i18n(\"Reference Audio\"),\n                                    type=\"filepath\",\n                                )\n                            with gr.Row():\n                                reference_text = gr.Textbox(\n                                    label=i18n(\"Reference Text\"),\n                                    lines=1,\n                                    placeholder=\"在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。\",\n                                    value=\"\",\n                                )\n\n            with gr.Column(scale=3):\n                with gr.Row():\n                    error = gr.HTML(\n                        label=i18n(\"Error Message\"),\n                        visible=True,\n                    )\n                with gr.Row():\n                    audio = gr.Audio(\n                        label=i18n(\"Generated Audio\"),\n                        type=\"numpy\",\n                        interactive=False,\n                        visible=True,\n                    )\n\n                with gr.Row():\n                    with gr.Column(scale=3):\n                        generate = gr.Button(\n                            value=\"\\U0001f3a7 \" + i18n(\"Generate\"),\n                            variant=\"primary\",\n                        )\n\n        # Submit\n        generate.click(\n            inference_fct,\n            [\n                text,\n                reference_id,\n                reference_audio,\n      
          reference_text,\n                max_new_tokens,\n                chunk_length,\n                top_p,\n                repetition_penalty,\n                temperature,\n                seed,\n                use_memory_cache,\n            ],\n            [audio, error],\n            concurrency_limit=1,\n        )\n\n    return app\n"
  },
  {
    "path": "tools/webui/inference.py",
    "content": "import html\nfrom functools import partial\nfrom typing import Any, Callable\n\nfrom fish_speech.i18n import i18n\nfrom fish_speech.utils.schema import ServeReferenceAudio, ServeTTSRequest\n\n\ndef inference_wrapper(\n    text,\n    reference_id,\n    reference_audio,\n    reference_text,\n    max_new_tokens,\n    chunk_length,\n    top_p,\n    repetition_penalty,\n    temperature,\n    seed,\n    use_memory_cache,\n    engine,\n):\n    \"\"\"\n    Wrapper for the inference function.\n    Used in the Gradio interface.\n    \"\"\"\n\n    if reference_audio:\n        references = get_reference_audio(reference_audio, reference_text)\n    else:\n        references = []\n\n    req = ServeTTSRequest(\n        text=text,\n        reference_id=reference_id if reference_id else None,\n        references=references,\n        max_new_tokens=max_new_tokens,\n        chunk_length=chunk_length,\n        top_p=top_p,\n        repetition_penalty=repetition_penalty,\n        temperature=temperature,\n        seed=int(seed) if seed else None,\n        use_memory_cache=use_memory_cache,\n    )\n\n    for result in engine.inference(req):\n        match result.code:\n            case \"final\":\n                return result.audio, None\n            case \"error\":\n                return None, build_html_error_message(i18n(result.error))\n            case _:\n                pass\n\n    return None, i18n(\"No audio generated\")\n\n\ndef get_reference_audio(reference_audio: str, reference_text: str) -> list:\n    \"\"\"\n    Get the reference audio bytes.\n    \"\"\"\n\n    with open(reference_audio, \"rb\") as audio_file:\n        audio_bytes = audio_file.read()\n\n    return [ServeReferenceAudio(audio=audio_bytes, text=reference_text)]\n\n\ndef build_html_error_message(error: Any) -> str:\n\n    error = error if isinstance(error, Exception) else Exception(\"Unknown error\")\n\n    return f\"\"\"\n    <div style=\"color: red; \n    font-weight: bold;\">\n        
{html.escape(str(error))}\n    </div>\n    \"\"\"\n\n\ndef get_inference_wrapper(engine) -> Callable:\n    \"\"\"\n    Get the inference function with the immutable arguments.\n    \"\"\"\n\n    return partial(\n        inference_wrapper,\n        engine=engine,\n    )\n"
  },
  {
    "path": "tools/webui/variables.py",
    "content": "from fish_speech.i18n import i18n\n\nHEADER_MD = f\"\"\"# Fish Speech\n\n{i18n(\"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).\")}  \n\n{i18n(\"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1.5).\")}  \n\n{i18n(\"Related code and weights are released under FISH AUDIO RESEARCH LICENSE.\")}  \n\n{i18n(\"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.\")}  \n\"\"\"\n\nTEXTBOX_PLACEHOLDER = i18n(\"Put your text here.\")\n"
  }
]