Repository: fishaudio/fish-speech Branch: main Commit: 49985a34a704 Files: 153 Total size: 676.6 KB Directory structure: gitextract_ft5t8lt3/ ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ └── feature_request.yml │ ├── pull_request_template.md │ └── workflows/ │ ├── build-docker-image.yml │ ├── docs.yml │ └── stale.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .project-root ├── .readthedocs.yaml ├── API_FLAGS.txt ├── LICENSE ├── README.md ├── awesome_webui/ │ ├── .gitignore │ ├── README.md │ ├── eslint.config.js │ ├── index.html │ ├── package.json │ ├── src/ │ │ ├── App.tsx │ │ ├── components/ │ │ │ └── ui/ │ │ │ ├── alert.tsx │ │ │ ├── badge.tsx │ │ │ ├── button.tsx │ │ │ ├── card.tsx │ │ │ ├── collapsible.tsx │ │ │ ├── dialog.tsx │ │ │ ├── label.tsx │ │ │ ├── scroll-area.tsx │ │ │ ├── separator.tsx │ │ │ ├── slider.tsx │ │ │ ├── switch.tsx │ │ │ ├── textarea.tsx │ │ │ └── toggle-group.tsx │ │ ├── index.css │ │ └── main.tsx │ ├── tsconfig.app.json │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts ├── compose.base.yml ├── compose.yml ├── docker/ │ └── Dockerfile ├── dockerfile.dev ├── docs/ │ ├── CNAME │ ├── README.ar.md │ ├── README.ja.md │ ├── README.ko.md │ ├── README.pt-BR.md │ ├── README.zh.md │ ├── ar/ │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ └── install.md │ ├── en/ │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ ├── install.md │ │ └── server.md │ ├── ja/ │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ └── install.md │ ├── ko/ │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ └── install.md │ ├── pt/ │ │ ├── finetune.md │ │ ├── index.md │ │ ├── inference.md │ │ └── install.md │ ├── requirements.txt │ ├── stylesheets/ │ │ └── extra.css │ └── zh/ │ ├── finetune.md │ ├── index.md │ ├── inference.md │ └── install.md ├── entrypoint.sh ├── fish_speech/ │ ├── callbacks/ │ │ ├── __init__.py │ │ └── grad_norm.py │ ├── configs/ │ │ ├── 
base.yaml │ │ ├── lora/ │ │ │ └── r_8_alpha_16.yaml │ │ ├── modded_dac_vq.yaml │ │ └── text2semantic_finetune.yaml │ ├── content_sequence.py │ ├── conversation.py │ ├── datasets/ │ │ ├── concat_repeat.py │ │ ├── protos/ │ │ │ ├── text-data.proto │ │ │ ├── text_data_pb2.py │ │ │ └── text_data_stream.py │ │ ├── semantic.py │ │ └── vqgan.py │ ├── i18n/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── core.py │ │ ├── locale/ │ │ │ ├── en_US.json │ │ │ ├── es_ES.json │ │ │ ├── ja_JP.json │ │ │ ├── ko_KR.json │ │ │ ├── pt_BR.json │ │ │ └── zh_CN.json │ │ └── scan.py │ ├── inference_engine/ │ │ ├── __init__.py │ │ ├── reference_loader.py │ │ ├── utils.py │ │ └── vq_manager.py │ ├── models/ │ │ ├── dac/ │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ ├── modded_dac.py │ │ │ └── rvq.py │ │ └── text2semantic/ │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── lit_module.py │ │ ├── llama.py │ │ └── lora.py │ ├── scheduler.py │ ├── text/ │ │ ├── __init__.py │ │ └── clean.py │ ├── tokenizer.py │ ├── train.py │ └── utils/ │ ├── __init__.py │ ├── braceexpand.py │ ├── context.py │ ├── file.py │ ├── instantiators.py │ ├── logger.py │ ├── logging_utils.py │ ├── rich_utils.py │ ├── schema.py │ ├── spectrogram.py │ └── utils.py ├── inference.ipynb ├── mkdocs.yml ├── pyproject.toml ├── pyrightconfig.json └── tools/ ├── api_client.py ├── api_server.py ├── llama/ │ ├── build_dataset.py │ ├── eval_in_context.py │ ├── merge_lora.py │ └── quantize.py ├── run_webui.py ├── server/ │ ├── api_utils.py │ ├── exception_handler.py │ ├── inference.py │ ├── model_manager.py │ ├── model_utils.py │ └── views.py ├── vqgan/ │ ├── create_train_split.py │ └── extract_vq.py └── webui/ ├── __init__.py ├── inference.py └── variables.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # .dockerignore # Git and version control .git 
.gitignore .gitattributes .gitmodules # IDE and editor files .vscode/ .idea/ *.swp *.swo *~ .DS_Store Thumbs.db # Python cache and build artifacts __pycache__/ *.py[cod] *$py.class *.so .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # Virtual environments venv/ env/ ENV/ .venv/ .env/ # Testing .pytest_cache/ .coverage htmlcov/ .tox/ .nox/ coverage.xml *.cover .hypothesis/ # Jupyter Notebook .ipynb_checkpoints *.ipynb # Logs *.log logs/ # Temporary files tmp/ temp/ *.tmp *.temp # OS generated files .DS_Store .DS_Store? ._* .Spotlight-V100 .Trashes ehthumbs.db Thumbs.db # Docker files (except the one being used) docker/ Dockerfile* docker-compose*.yml .dockerignore # Checkpoints and models (should be mounted) checkpoints/ models/ *.pth *.ckpt *.safetensors *.bin # Reference voices (should be mounted) references/ # Generated audio files *.wav *.mp3 *.flac *.ogg generated_audio.wav fake.wav fake.npy # Cache directories .cache/ cache/ .uv_cache/ # Development files .env .env.local .env.development .env.test .env.production # Test files test_*.py *_test.py tests/ # CI/CD .github/ .gitlab-ci.yml .travis.yml .circleci/ azure-pipelines.yml # Monitoring and profiling .prof *.prof # Backup files *.bak *.backup *.old # Large data files *.csv *.jsonl *.parquet *.h5 *.hdf5 # Audio processing temporary files *.tmp.wav *.temp.wav # OLD: # .github # results # data # *.filelist # /data_server/target # checkpoints # .venv ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: "🕷️ Bug report" description: | Please follow this template carefully to ensure we can address your issue quickly. Make sure to provide as much detail as possible, including logs and screenshots. 
labels: - bug body: - type: checkboxes attributes: label: Self Checks description: "To ensure timely help, please confirm the following:" options: - label: This template is only for bug reports. For questions, please visit [Discussions](https://github.com/fishaudio/fish-speech/discussions). required: true - label: I have thoroughly reviewed the project documentation (installation, training, inference) but couldn't find information to solve my problem. [English](https://speech.fish.audio/) [中文](https://speech.fish.audio/zh/) [日本語](https://speech.fish.audio/ja/) [Portuguese (Brazil)](https://speech.fish.audio/pt/) required: true - label: I have searched for existing issues, including closed ones. [Search issues](https://github.com/fishaudio/fish-speech/issues) required: true - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)). required: true - label: "[FOR CHINESE USERS] 请务必使用英文提交 Issue,否则会被关闭。谢谢!:)" required: true - label: "Please do not modify this template and fill in all required fields." required: true - type: dropdown attributes: label: Cloud or Self Hosted multiple: true options: - Cloud - Self Hosted (Docker) - Self Hosted (Source) validations: required: true - type: textarea attributes: label: Environment Details description: "Provide details such as OS, Python version, and any relevant software or dependencies." placeholder: e.g., macOS 13.5, Python 3.10, torch==2.4.1, Gradio 4.44.0 validations: required: true - type: textarea attributes: label: Steps to Reproduce description: | Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks. placeholder: | 1. Run the command `python -m tools.api_client -t "xxxxx"` 2. 
Observe the console output error: `ModuleNotFoundError: No module named 'pyaudio'` (with screenshots or logs will be better) validations: required: true - type: textarea attributes: label: ✔️ Expected Behavior placeholder: Describe what you expected to happen. validations: required: false - type: textarea attributes: label: ❌ Actual Behavior placeholder: Describe what actually happened. validations: required: false ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: "\U0001F4E7 Discussions" url: https://github.com/fishaudio/fish-speech/discussions about: General discussions and request help from the community ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: "⭐ Feature or enhancement request" description: Propose something new. labels: - enhancement body: - type: checkboxes attributes: label: Self Checks description: "To make sure we get to you in time, please check the following :)" options: - label: I have thoroughly reviewed the project documentation (installation, training, inference) but couldn't find any relevant information that meets my needs. [English](https://speech.fish.audio/) [中文](https://speech.fish.audio/zh/) [日本語](https://speech.fish.audio/ja/) [Portuguese (Brazil)](https://speech.fish.audio/pt/) required: true - label: I have searched for existing issues [search for existing issues](https://github.com/fishaudio/fish-speech/issues), including closed ones. required: true - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)). 
required: true - label: "[FOR CHINESE USERS] 请务必使用英文提交 Issue,否则会被关闭。谢谢!:)" required: true - label: "Please do not modify this template :) and fill in all the required fields." required: true - type: textarea attributes: label: 1. Is this request related to a challenge you're experiencing? Tell us your story. description: | Describe the specific problem or scenario you’re facing in detail. For example: *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."* placeholder: Please describe the situation in as much detail as possible. validations: required: true - type: textarea attributes: label: 2. What is your suggested solution? description: | Provide a clear description of the feature or enhancement you'd like to propose. How would this feature solve your issue or improve the project? placeholder: Describe your idea or proposed solution here. validations: required: true - type: textarea attributes: label: 3. Additional context or comments description: | Any other relevant information, links, documents, or screenshots that provide clarity. Use this section for anything not covered above. placeholder: Add any extra details here. validations: required: false - type: checkboxes attributes: label: 4. Can you help us with this feature? description: | Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration. options: - label: I am interested in contributing to this feature. required: false - type: markdown attributes: value: | **Note:** Please submit only one request per issue to keep discussions focused and manageable. ================================================ FILE: .github/pull_request_template.md ================================================ **Is this PR adding new feature or fix a BUG?** Add feature / Fix BUG. **Is this pull request related to any issue? 
If yes, please link the issue.** #xxx ================================================ FILE: .github/workflows/build-docker-image.yml ================================================ name: Build Docker Images on: push: branches: - main tags: - "v*" jobs: build: runs-on: ubuntu-latest-16c64g strategy: matrix: target: [webui, server] backend: [cuda, cpu] steps: - uses: actions/checkout@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Get Version run: | if [[ $GITHUB_REF == refs/tags/v* ]]; then version=$(basename ${GITHUB_REF}) else version=nightly fi echo "version=${version}" >> $GITHUB_ENV echo "Current version: ${version}" - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USER }} password: ${{ secrets.DOCKER_PAT }} - name: Set platform for CPU builds id: platform run: | if [ "${{ matrix.backend }}" = "cpu" ]; then echo "platforms=linux/amd64,linux/arm64" >> $GITHUB_OUTPUT else echo "platforms=linux/amd64" >> $GITHUB_OUTPUT fi - name: Build and Push ${{ matrix.target }}-${{ matrix.backend }} Image uses: docker/build-push-action@v6 with: context: . 
file: docker/Dockerfile platforms: ${{ steps.platform.outputs.platforms }} push: true target: ${{ matrix.target }} build-args: | BACKEND=${{ matrix.backend }} UV_EXTRA=${{ matrix.backend == 'cuda' && 'cu126' || 'cpu' }} tags: | fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }}-${{ env.version }} fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }} ${{ (matrix.target == 'webui' && matrix.backend == 'cuda') && format('fishaudio/fish-speech:{0}', env.version) || '' }} ${{ (matrix.target == 'webui' && matrix.backend == 'cuda') && 'fishaudio/fish-speech:latest' || '' }} outputs: type=image,oci-mediatypes=true,compression=zstd,compression-level=3,force-compression=true cache-from: type=registry,ref=fishaudio/fish-speech:${{ matrix.target }}-${{ matrix.backend }} cache-to: type=inline update-readme: runs-on: ubuntu-latest needs: build if: github.ref == 'refs/heads/main' steps: - name: Push README to Dockerhub uses: peter-evans/dockerhub-description@v4 with: username: ${{ secrets.DOCKER_USER }} password: ${{ secrets.DOCKER_PAT }} repository: fishaudio/fish-speech ================================================ FILE: .github/workflows/docs.yml ================================================ name: docs on: push: branches: - main paths: - 'docs/**' - 'mkdocs.yml' permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Configure Git Credentials run: | git config user.name github-actions[bot] git config user.email 41898282+github-actions[bot]@users.noreply.github.com - uses: actions/setup-python@v5 with: python-version: 3.x - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache restore-keys: | mkdocs-material- - run: pip install -r docs/requirements.txt - run: mkdocs gh-deploy --force ================================================ FILE: .github/workflows/stale.yml 
================================================ name: Close inactive issues on: schedule: - cron: "0 0 * * *" jobs: close-issues: runs-on: ubuntu-latest permissions: issues: write pull-requests: write steps: - uses: actions/stale@v9 with: days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: 30 days-before-pr-close: 30 stale-pr-label: "stale" stale-pr-message: "This PR is stale because it has been open for 30 days with no activity." close-pr-message: "This PR was closed because it has been inactive for 30 days since being marked as stale." repo-token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ # ============================================================================= # Fish Speech - .gitignore # ============================================================================= # Operating System Files # ----------------------- .DS_Store .DS_Store? 
._* .Spotlight-V100 .Trashes ehthumbs.db Thumbs.db # IDEs and Editors # ---------------- .vscode/ .idea/ *.swp *.swo *~ # Python # ------ __pycache__/ *.py[cod] *$py.class *.so .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # Virtual Environments # -------------------- .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ /fishenv/ # Project Dependencies # -------------------- .pdm-python /fish_speech.egg-info # Data and Model Files # -------------------- data/ results/ checkpoints/ references/ demo-audios/ example/ filelists/ *.filelist # Audio Files # ----------- *.wav *.mp3 *.flac *.ogg *.m4a # Data Files # ---------- *.npy *.npz *.pkl *.pickle *.lab /fish_speech/text/cmudict_cache.pickle # Cache and Temporary Files # -------------------------- /.cache/ /.gradio/ /.locale/ .pgx.* *log *.log site/ # External Tools # -------------- ffmpeg.exe ffprobe.exe /faster_whisper/ # Server Related # -------------- /data_server/target/ # Test Files # ---------- /*.test.sh asr-label* ================================================ FILE: .pre-commit-config.yaml ================================================ ci: autoupdate_schedule: monthly repos: - repo: https://github.com/pycqa/isort rev: 8.0.1 hooks: - id: isort args: [--profile=black] - repo: https://github.com/psf/black-pre-commit-mirror rev: 26.1.0 hooks: - id: black - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: end-of-file-fixer - id: check-yaml - id: check-json - id: mixed-line-ending args: ["--fix=lf"] - id: check-added-large-files args: ["--maxkb=5000"] ================================================ FILE: .project-root ================================================ ================================================ FILE: .readthedocs.yaml ================================================ # Read the Docs configuration file for MkDocs projects # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: python: "3.12" mkdocs: configuration: mkdocs.yml # Optionally declare the Python requirements required to build your docs python: install: - requirements: docs/requirements.txt ================================================ FILE: API_FLAGS.txt ================================================ # --infer --api --listen 0.0.0.0:8080 \ --llama-checkpoint-path "checkpoints/openaudio-s1-mini" \ --decoder-checkpoint-path "checkpoints/openaudio-s1-mini/codec.pth" \ --decoder-config-name modded_dac_vq ================================================ FILE: LICENSE ================================================ # FISH AUDIO RESEARCH LICENSE AGREEMENT **Last Updated: March 7, 2026** ## I. INTRODUCTION This Agreement applies to any individual person or entity ("You", "Your" or "Licensee") that uses or distributes any portion or element of the Fish Audio Materials or Derivative Works thereof for any Research, Non-Commercial, or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below. This Agreement is intended to allow research and non-commercial uses of the Materials free of charge. Any Commercial use of the Materials requires a separate license from Fish Audio. By clicking "I Accept" or by using, distributing, or accessing any portion or element of the Fish Audio Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then "You" includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity's behalf. ## II. 
RESEARCH & NON-COMMERCIAL USE LICENSE Subject to the terms of this Agreement, Fish Audio grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Fish Audio's intellectual property or other rights owned by Fish Audio embodied in the Fish Audio Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Fish Audio Materials for any Research or Non-Commercial Purpose. "Research Purpose" means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. "Non-Commercial Purpose" means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing. ## III. COMMERCIAL USE **Any use of the Fish Audio Materials or Derivative Works for a Commercial Purpose requires a separate written license agreement from Fish Audio.** No commercial rights are granted under this Agreement. "Commercial Purpose" means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for or directed toward commercial advantage or monetary compensation to You or others, including but not limited to: (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, (ii) Your business's or organization's internal operations, and (iii) any use in connection with a product or service for which You charge a fee or generate revenue, whether directly or indirectly. To obtain a commercial license, please contact Fish Audio at: - **Website:** [https://fish.audio](https://fish.audio) - **Email:** business@fish.audio ## IV. GENERAL TERMS Your Research and Non-Commercial License under this Agreement is subject to the following terms. ### a. 
Distribution & Attribution If You distribute or make available the Fish Audio Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This model is licensed under the Fish Audio Research License, Copyright © 39 AI, INC. All Rights Reserved.", and (iii) prominently display "Built with Fish Audio" on a related website, user interface, blogpost, about page, or product documentation. If You create a Derivative Work, You may add your own attribution notice(s) to the "Notice" text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Fish Audio Materials and state in the "Notice" text file that You changed the Fish Audio Materials and how it was modified. ### b. Use Restrictions Your use of the Fish Audio Materials and Derivative Works, including any output or results of the Fish Audio Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to Fish Audio's Acceptable Use Policy, which is hereby incorporated by reference. Furthermore, You will not use the Fish Audio Materials or Derivative Works, or any output or results of the Fish Audio Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works). ### c. Intellectual Property **(i) Trademark License.** No trademark licenses are granted under this Agreement, and in connection with the Fish Audio Materials or Derivative Works, You may not use any name or mark owned by or associated with Fish Audio or any of its Affiliates, except as required under Section IV(a) herein. 
**(ii) Ownership of Derivative Works.** As between You and Fish Audio, You are the owner of Derivative Works You create, subject to Fish Audio's ownership of the Fish Audio Materials and any Derivative Works made by or for Fish Audio. **(iii) Ownership of Outputs.** As between You and Fish Audio, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law. **(iv) Disputes.** If You or Your Affiliate(s) institute litigation or other proceedings against Fish Audio (including a cross-claim or counterclaim in a lawsuit) alleging that the Fish Audio Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Fish Audio from and against any claim by any third party arising out of or related to Your use or distribution of the Fish Audio Materials or Derivative Works in violation of this Agreement. **(v) Feedback.** From time to time, You may provide Fish Audio with verbal and/or written suggestions, comments or other feedback related to Fish Audio's existing or prospective technology, products or services (collectively, "Feedback"). You are not obligated to provide Fish Audio with Feedback, but to the extent that You do, You hereby grant Fish Audio a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided "AS IS" and You make no warranties whatsoever about any Feedback. ### d. 
Disclaimer of Warranty UNLESS REQUIRED BY APPLICABLE LAW, THE FISH AUDIO MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE FISH AUDIO MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE FISH AUDIO MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS. ### e. Limitation of Liability IN NO EVENT WILL FISH AUDIO OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF FISH AUDIO OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. ### f. Term and Termination The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Fish Audio Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Fish Audio may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Fish Audio Materials or Derivative Works. Sections IV(d), (e), and (g) shall survive the termination of this Agreement. ### g. Governing Law This Agreement will be governed by and construed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement. ## V. 
DEFINITIONS **"Affiliate(s)"** means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, "control" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity. **"Agreement"** means this Fish Audio Research License Agreement. **"Derivative Work(s)"** means (a) any derivative work of the Fish Audio Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including "fine tune" and "low-rank adaptation" models derived from a Model or a Model's output, but do not include the output of any Model. **"Documentation"** means any specifications, manuals, documentation, and other written information provided by Fish Audio related to the Software or Models. **"Fish Audio"** or **"we"** means 39 AI, INC. and its Affiliates. **"Model(s)"** means, collectively, Fish Audio's proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing. **"Software"** means Fish Audio's proprietary software made available under this Agreement now or in the future. **"Fish Audio Materials"** means, collectively, Fish Audio's proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement. **"Trade Control Laws"** means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations. ================================================ FILE: README.md ================================================

Fish Speech

**English** | [简体中文](docs/README.zh.md) | [Portuguese](docs/README.pt-BR.md) | [日本語](docs/README.ja.md) | [한국어](docs/README.ko.md) | [العربية](docs/README.ar.md)
Fish Audio S1 - Expressive Voice Cloning and Text-to-Speech | Product Hunt fishaudio%2Ffish-speech | Trendshift



Discord Docker QQ Channel
HuggingFace Model Fish Audio Blog Paper | Technical Report
> [!IMPORTANT] > **License Notice** > This codebase and its associated model weights are released under **[FISH AUDIO RESEARCH LICENSE](LICENSE)**. Please refer to [LICENSE](LICENSE) for more details. We will take action against any violation of the license. > [!WARNING] > **Legal Disclaimer** > We do not hold any responsibility for any illegal usage of the codebase. Please refer to your local laws about DMCA and other related laws. ## Quick Start ### For Humans Here are the official documents for Fish Audio S2. Follow the instructions to get started easily. - [Installation](https://speech.fish.audio/install/) - [Command Line Inference](https://speech.fish.audio/inference/#command-line-inference) - [WebUI Inference](https://speech.fish.audio/inference/#webui-inference) - [Server Inference](https://speech.fish.audio/server/) - [Docker Setup](https://speech.fish.audio/install/#docker-setup) > [!IMPORTANT] > **For the SGLang server, please read the [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md).** ### For LLM Agents ``` Install and configure Fish-Audio S2 by following the instructions here: https://speech.fish.audio/install/ ``` ## Fish Audio S2 Pro **State-of-the-art multilingual text-to-speech (TTS) system, redefining the boundaries of voice generation.** Fish Audio S2 Pro is the most advanced multimodal model developed by [Fish Audio](https://fish.audio/). Trained on over **10 million hours** of audio data covering more than **80 languages**, S2 Pro combines a **Dual-Autoregressive (Dual-AR)** architecture with reinforcement learning (RL) alignment to generate speech that is exceptionally natural, realistic, and emotionally rich, leading the competition among both open-source and closed-source systems. 
The core strength of S2 Pro lies in its support for **sub-word level** fine-grained control of prosody and emotion using natural language tags (e.g., `[whisper]`, `[excited]`, `[angry]`), while natively supporting multi-speaker and multi-turn conversation generation. Visit the [Fish Audio website](https://fish.audio/) for a live playground, or read our [technical report](https://arxiv.org/abs/2603.08823) and [blog post](https://fish.audio/blog/fish-audio-open-sources-s2/) for more details. ### Model Variants | Model | Size | Availability | Description | |------|------|-------------|-------------| | S2-Pro | 4B parameters | [HuggingFace](https://huggingface.co/fishaudio/s2-pro) | Full-featured flagship model with maximum quality and stability | More details of the model can be found in the [technical report](https://arxiv.org/abs/2411.01156). ## Benchmark Results | Benchmark | Fish Audio S2 | |------|------| | Seed-TTS Eval — WER (Chinese) | **0.54%** (best overall) | | Seed-TTS Eval — WER (English) | **0.99%** (best overall) | | Audio Turing Test (with instruction) | **0.515** posterior mean | | EmergentTTS-Eval — Win Rate | **81.88%** (highest overall) | | Fish Instruction Benchmark — TAR | **93.3%** | | Fish Instruction Benchmark — Quality | **4.51 / 5.0** | | Multilingual (MiniMax Testset) — Best WER | **11 of 24** languages | | Multilingual (MiniMax Testset) — Best SIM | **17 of 24** languages | On Seed-TTS Eval, S2 achieves the lowest WER among all evaluated models including closed-source systems: Qwen3-TTS (0.77/1.24), MiniMax Speech-02 (0.99/1.90), Seed-TTS (1.12/2.25). On the Audio Turing Test, 0.515 surpasses Seed-TTS (0.417) by 24% and MiniMax-Speech (0.387) by 33%. On EmergentTTS-Eval, S2 achieves particularly strong results in paralinguistics (91.61% win rate), questions (84.41%), and syntactic complexity (83.39%). ## Highlights ### Fine-Grained Inline Control via Natural Language S2 Pro brings unprecedented "soul" to speech. 
Using a simple `[tag]` syntax, you can precisely embed emotional instructions at any position in the text. - **15,000+ Unique Tags Supported**: Not limited to fixed presets; S2 supports **free-form text descriptions**. Try `[whisper in small voice]`, `[professional broadcast tone]`, or `[pitch up]`. - **Rich Emotion Library**: `[pause]` `[emphasis]` `[laughing]` `[inhale]` `[chuckle]` `[tsk]` `[singing]` `[excited]` `[laughing tone]` `[interrupting]` `[chuckling]` `[excited tone]` `[volume up]` `[echo]` `[angry]` `[low volume]` `[sigh]` `[low voice]` `[whisper]` `[screaming]` `[shouting]` `[loud]` `[surprised]` `[short pause]` `[exhale]` `[delight]` `[panting]` `[audience laughter]` `[with strong accent]` `[volume down]` `[clearing throat]` `[sad]` `[moaning]` `[shocked]` ### Innovative Dual-Autoregressive (Dual-AR) Architecture S2 Pro adopts a master-slave Dual-AR architecture consisting of a decoder-only transformer and an RVQ audio codec (10 codebooks, ~21 Hz): - **Slow AR (4B parameters)**: Operates along the time axis, predicting the primary semantic codebook. - **Fast AR (400M parameters)**: Generates the remaining 9 residual codebooks at each time step, reconstructing exquisite acoustic details. This asymmetric design achieves peak audio fidelity while significantly boosting inference speed. ### Reinforcement Learning (RL) Alignment S2 Pro utilizes **Group Relative Policy Optimization (GRPO)** for post-training alignment. We directly reuse the model suite used for data cleaning and annotation as Reward Models, perfectly resolving the distribution mismatch between pre-training data and post-training objectives. - **Multi-Dimensional Reward Signals**: Comprehensively evaluates semantic accuracy, instruction adherence, acoustic preference scoring, and timbre similarity to ensure every second of generated speech feels intuitive to humans. 
### Extreme Streaming Performance (Powered by SGLang) As the Dual-AR architecture is structurally isomorphic to standard LLMs, S2 Pro natively supports all SGLang inference acceleration features, including Continuous Batching, Paged KV Cache, CUDA Graph, and RadixAttention-based Prefix Caching. **Performance on a single NVIDIA H200 GPU:** - **Real-Time Factor (RTF)**: 0.195 - **Time-to-First-Audio (TTFA)**: ~100 ms - **Extreme Throughput**: 3,000+ acoustic tokens/s while maintaining RTF < 0.5 ### Robust Multilingual Support S2 Pro supports over 80 languages without requiring phonemes or language-specific preprocessing: - **Tier 1**: Japanese (ja), English (en), Chinese (zh) - **Tier 2**: Korean (ko), Spanish (es), Portuguese (pt), Arabic (ar), Russian (ru), French (fr), German (de) - **Global Coverage**: sv, it, tr, no, nl, cy, eu, ca, da, gl, ta, hu, fi, pl, et, hi, la, ur, th, vi, jw, bn, yo, xsl, cs, sw, nn, he, ms, uk, id, kk, bg, lv, my, tl, sk, ne, fa, af, el, bo, hr, ro, sn, mi, yi, am, be, km, is, az, sd, br, sq, ps, mn, ht, ml, sr, sa, te, ka, bs, pa, lt, kn, si, hy, mr, as, gu, fo, etc. ### Native Multi-Speaker Generation Fish Audio S2 allows users to upload reference audio containing multiple speakers, and the model processes each speaker's features via the `<|speaker:i|>` token. You can then control the model's performance via speaker ID tokens, enabling a single generation to include multiple speakers. There is no longer a need to upload separate reference audio for each individual speaker. ### Multi-Turn Generation Thanks to the expansion of the model context, our model can now leverage previous information to improve the expressiveness of subsequent generated content, thereby increasing the naturalness of the dialogue. ### Rapid Voice Cloning Fish Audio S2 supports accurate voice cloning using short reference samples (typically 10-30 seconds). 
The model captures timbre, speaking style, and emotional tendencies, producing realistic and consistent cloned voices without additional fine-tuning. For SGLang Server usage, please refer to the [SGLang-Omni README](https://github.com/sgl-project/sglang-omni/blob/main/sglang_omni/models/fishaudio_s2_pro/README.md). --- ## Credits - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2) - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) - [GPT VITS](https://github.com/innnky/gpt-vits) - [MQTTS](https://github.com/b04901014/MQTTS) - [GPT Fast](https://github.com/pytorch-labs/gpt-fast) - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS) - [Qwen3](https://github.com/QwenLM/Qwen3) ## Tech Report ```bibtex @misc{fish-speech-v1.4, title={Fish-Speech: Leveraging Large Language Models for Advanced Multilingual Text-to-Speech Synthesis}, author={Shijia Liao and Yuxuan Wang and Tianyu Li and Yifan Cheng and Ruoyi Zhang and Rongzhi Zhou and Yijin Xing}, year={2024}, eprint={2411.01156}, archivePrefix={arXiv}, primaryClass={cs.SD}, url={https://arxiv.org/abs/2411.01156}, } @misc{liao2026fishaudios2technical, title={Fish Audio S2 Technical Report}, author={Shijia Liao and Yuxuan Wang and Songting Liu and Yifan Cheng and Ruoyi Zhang and Tianyu Li and Shidong Li and Yisheng Zheng and Xingwei Liu and Qingzheng Wang and Zhizhuo Zhou and Jiahua Liu and Xin Chen and Dawei Han}, year={2026}, eprint={2603.08823}, archivePrefix={arXiv}, primaryClass={cs.SD}, url={https://arxiv.org/abs/2603.08823}, } ``` ================================================ FILE: awesome_webui/.gitignore ================================================ # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* pnpm-debug.log* lerna-debug.log* node_modules dist dist-ssr *.local # Editor directories and files .vscode/* !.vscode/extensions.json .idea .DS_Store *.suo *.ntvs* *.njsproj *.sln *.sw? 
================================================ FILE: awesome_webui/README.md ================================================ # React + TypeScript + Vite This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules. Currently, two official plugins are available: - [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Babel](https://babeljs.io/) (or [oxc](https://oxc.rs) when used in [rolldown-vite](https://vite.dev/guide/rolldown)) for Fast Refresh - [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/) for Fast Refresh ## React Compiler The React Compiler is currently not compatible with SWC. See [this issue](https://github.com/vitejs/vite-plugin-react/issues/428) for tracking the progress. ## Expanding the ESLint configuration If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules: ```js export default defineConfig([ globalIgnores(['dist']), { files: ['**/*.{ts,tsx}'], extends: [ // Other configs... // Remove tseslint.configs.recommended and replace with this tseslint.configs.recommendedTypeChecked, // Alternatively, use this for stricter rules tseslint.configs.strictTypeChecked, // Optionally, add this for stylistic rules tseslint.configs.stylisticTypeChecked, // Other configs... ], languageOptions: { parserOptions: { project: ['./tsconfig.node.json', './tsconfig.app.json'], tsconfigRootDir: import.meta.dirname, }, // other options... 
}, }, ]) ``` You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules: ```js // eslint.config.js import reactX from 'eslint-plugin-react-x' import reactDom from 'eslint-plugin-react-dom' export default defineConfig([ globalIgnores(['dist']), { files: ['**/*.{ts,tsx}'], extends: [ // Other configs... // Enable lint rules for React reactX.configs['recommended-typescript'], // Enable lint rules for React DOM reactDom.configs.recommended, ], languageOptions: { parserOptions: { project: ['./tsconfig.node.json', './tsconfig.app.json'], tsconfigRootDir: import.meta.dirname, }, // other options... }, }, ]) ``` ================================================ FILE: awesome_webui/eslint.config.js ================================================ import js from '@eslint/js' import globals from 'globals' import reactHooks from 'eslint-plugin-react-hooks' import reactRefresh from 'eslint-plugin-react-refresh' import tseslint from 'typescript-eslint' import { defineConfig, globalIgnores } from 'eslint/config' export default defineConfig([ globalIgnores(['dist']), { files: ['**/*.{ts,tsx}'], extends: [ js.configs.recommended, tseslint.configs.recommended, reactHooks.configs.flat.recommended, reactRefresh.configs.vite, ], languageOptions: { ecmaVersion: 2020, globals: globals.browser, }, }, ]) ================================================ FILE: awesome_webui/index.html ================================================ Awesome WebUI
================================================ FILE: awesome_webui/package.json ================================================ { "name": "awesome_webui", "private": true, "version": "0.0.0", "type": "module", "scripts": { "dev": "vite", "build": "tsc -b && vite build", "lint": "eslint .", "preview": "vite preview" }, "dependencies": { "@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-dialog": "^1.1.15", "@radix-ui/react-label": "^2.1.8", "@radix-ui/react-scroll-area": "^1.2.10", "@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-slider": "^1.3.6", "@radix-ui/react-slot": "^1.2.4", "@radix-ui/react-switch": "^1.2.6", "@radix-ui/react-toggle-group": "^1.1.11", "@tailwindcss/vite": "^4.2.1", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^0.577.0", "react": "^19.2.0", "react-dom": "^19.2.0", "tailwind-merge": "^3.5.0", "tailwindcss": "^4.2.1" }, "devDependencies": { "@eslint/js": "^9.39.1", "@types/node": "^24.10.1", "@types/react": "^19.2.7", "@types/react-dom": "^19.2.3", "@vitejs/plugin-react-swc": "^4.2.2", "eslint": "^9.39.1", "eslint-plugin-react-hooks": "^7.0.1", "eslint-plugin-react-refresh": "^0.4.24", "globals": "^16.5.0", "typescript": "~5.9.3", "typescript-eslint": "^8.48.0", "vite": "^7.3.1" } } ================================================ FILE: awesome_webui/src/App.tsx ================================================ import { useEffect, useRef, useState } from 'react' import { AudioLines, ChevronDown, CircleAlert, Copy, Download, FileText, Info, LoaderCircle, Plus, Settings2, Upload, } from 'lucide-react' import { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert' import { Badge } from '@/components/ui/badge' import { Button } from '@/components/ui/button' import { Card, CardContent, CardDescription, CardHeader, CardTitle, } from '@/components/ui/card' import { Collapsible, CollapsibleContent, CollapsibleTrigger, } from '@/components/ui/collapsible' import { Dialog, DialogContent, 
DialogDescription, DialogFooter, DialogHeader, DialogTitle, } from '@/components/ui/dialog'
import { Label } from '@/components/ui/label'
import { ScrollArea } from '@/components/ui/scroll-area'
import { Separator } from '@/components/ui/separator'
import { Slider } from '@/components/ui/slider'
import { Switch } from '@/components/ui/switch'
import { Textarea } from '@/components/ui/textarea'
import { ToggleGroup, ToggleGroupItem } from '@/components/ui/toggle-group'

/** Audio formats selectable through `ControlsState.format`; `formatMimeMap` holds a MIME type for each. */
type AudioFormat = 'mp3' | 'wav' | 'pcm' | 'opus'

/** Values allowed for `ControlsState.latency`. */
type LatencyMode = 'normal' | 'balanced'

/**
 * Sample text containing inline bracketed tags such as `[excited, joyful tone]`.
 * NOTE(review): presumably pre-fills the text input — confirm at the usage site.
 */
const defaultInputText = `[excited, joyful tone] We're going to DISNEY WORLD! [squeal of delight] I've been saving for [emphasis] three years [breathless] and finally, FINALLY we can go! The look on your face right now is worth every extra shift I worked! [angry] After everything we've been through [break] I can't believe you would [emphasize] betray me like this. I gave you EVERYTHING! And now I'm left with nothing but memories and broken promises!`

/**
 * Generation settings; the starting values live in `initialControls`.
 * NOTE(review): field meanings are not visible in this part of the file —
 * confirm units and ranges against the code that consumes them.
 */
type ControlsState = {
  chunkLength: number
  maxNewTokens: number
  temperature: number
  topP: number
  repetitionPenalty: number
  normalize: boolean
  format: AudioFormat
  latency: LatencyMode
}

/**
 * Numeric figures about one request.
 * NOTE(review): `ttftMs` is presumably a time-to-first-output in milliseconds and
 * `receivedKb` presumably the received size in kilobytes — confirm where they are set.
 */
type Metrics = {
  textLength: number
  ttftMs: number
  receivedKb: number
}

/** A message paired with the tone (`'error'` or `'info'`) it should be shown with. */
type StatusState = {
  tone: 'error' | 'info'
  message: string
}

/** One reference sample: raw audio bytes, its text, a display name and a preview URL. */
type ReferenceItem = {
  id: number
  name: string
  audio: ArrayBuffer
  text: string
  previewUrl: string
}

/** A speaker identified by `id` together with its reference samples. */
type SpeakerGroup = {
  id: number
  references: ReferenceItem[]
}

/**
 * Draft of a reference being created or edited for the speaker `speakerId`.
 * `referenceId` and `audio` are optional.
 * NOTE(review): presumably `referenceId` is only set in `'edit'` mode — confirm.
 */
type PendingReference = {
  mode: 'create' | 'edit'
  speakerId: number
  referenceId?: number
  name: string
  audio?: ArrayBuffer
  text: string
}

/** Starting values for every field of `ControlsState`. */
const initialControls: ControlsState = {
  chunkLength: 1000,
  maxNewTokens: 2048,
  temperature: 0.9,
  topP: 0.9,
  repetitionPenalty: 1.05,
  normalize: false,
  format: 'mp3',
  latency: 'normal',
}

/**
 * MIME type for each `AudioFormat`.
 * Typed as `Record<AudioFormat, string>` so a missing or misspelled format key
 * is a compile error (a bare `Record` without type arguments does not compile).
 */
const formatMimeMap: Record<AudioFormat, string> = {
  mp3: 'audio/mpeg',
  wav: 'audio/wav',
  pcm: 'audio/pcm',
  opus: 'audio/opus',
}

/**
 * Returns a numeric id: the current timestamp plus a random integer below 100000.
 * Ids are not guaranteed to be unique, only unlikely to collide.
 */
function createId() {
  return Date.now() +
Math.floor(Math.random() * 100000)
}

/**
 * Encodes the bytes of `buffer` as a base64 string.
 *
 * @param buffer - Raw bytes to encode.
 * @returns The base64 representation produced by `btoa`.
 */
function arrayBufferToBase64(buffer: ArrayBuffer): string {
  // `btoa` takes a string in which every character stands for one byte,
  // so each byte is first mapped to the character with that code.
  let binaryString = ''
  for (const byte of new Uint8Array(buffer)) {
    binaryString += String.fromCharCode(byte)
  }
  return btoa(binaryString)
}

/** Creates a speaker group with a freshly generated id and no references. */
function createSpeakerGroup(): SpeakerGroup {
  const emptyGroup: SpeakerGroup = { id: createId(), references: [] }
  return emptyGroup
}

// Built once, when the module is evaluated.
const initialSpeakerGroup = createSpeakerGroup()

function buildReferencesPayload(
  speakerGroups: SpeakerGroup[],
  includeBinaryAudio: boolean,
) {
  return speakerGroups.flatMap((speakerGroup) =>
    speakerGroup.references.map((reference) => ({
      text: reference.text,
      audio: includeBinaryAudio
        ? arrayBufferToBase64(reference.audio)
        : '