Repository: PierrunoYT/Kokoro-TTS-Local Branch: master Commit: d6565f3a71ff Files: 22 Total size: 194.3 KB Directory structure: gitextract_tx6aasjj/ ├── .dockerignore ├── .github/ │ └── workflows/ │ └── claude.yml ├── .gitignore ├── .gradio/ │ └── certificate.pem ├── CHINESE_TTS_GUIDE.md ├── Dockerfile ├── IMPROVEMENTS.md ├── LICENSE ├── README.md ├── README_CHINESE_TTS.md ├── chinese_config.py ├── chinese_tts_demo.py ├── config.py ├── dependency_checker.py ├── docker-compose.yml ├── gradio_interface.py ├── models.py ├── requirements.txt ├── setup_chinese_tts.py ├── speed_dial.py ├── test_offline.py └── tts_demo.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # Git and local metadata .git .github .gitignore # Python caches and virtual environments __pycache__/ *.py[cod] *.so venv/ .venv/ # Runtime/generated artifacts outputs/ .cache/ .gradio/ *.log # Local model artifacts (download at runtime in container) voices/ *.pth # Editor/OS files .vscode/ .idea/ .DS_Store ================================================ FILE: .github/workflows/claude.yml ================================================ name: Claude PR Assistant on: issue_comment: types: [created] pull_request_review_comment: types: [created] issues: types: [opened, assigned] pull_request_review: types: [submitted] jobs: claude-code-action: if: | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || (github.event_name == 'issues' && contains(github.event.issue.body, '@claude')) runs-on: ubuntu-latest permissions: contents: read pull-requests: read issues: read id-token: write steps: - name: 
Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 - name: Run Claude PR Action uses: anthropics/claude-code-action@beta with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} # Or use OAuth token instead: # claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} timeout_minutes: "60" # Optional: Restrict network access to specific domains only # experimental_allowed_domains: | # .anthropic.com # .github.com # api.github.com # .githubusercontent.com # bun.sh # registry.npmjs.org # .blob.core.windows.net ================================================ FILE: .gitignore ================================================ # Python __pycache__/ *.py[cod] *$py.class *.so .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # Virtual Environment venv/ ENV/ # IDE .idea/ .vscode/ *.swp *.swo # Project specific output*.wav *.pth *.onnx voices/ voices/*.pt voices/**/*.pt config.json ================================================ FILE: .gradio/certificate.pem ================================================ -----BEGIN CERTIFICATE----- MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn 
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= -----END CERTIFICATE----- ================================================ FILE: CHINESE_TTS_GUIDE.md ================================================ # Kokoro Chinese TTS Guide ## 科克罗中文文本转语音指南 Complete guide for setting up and using the Kokoro-82M-v1.1_zh Chinese TTS model locally. ## Table of Contents 1. [Overview](#overview) 2. [Installation](#installation) 3. [Quick Start](#quick-start) 4. [Available Chinese Voices](#available-chinese-voices) 5. [Advanced Usage](#advanced-usage) 6. [Troubleshooting](#troubleshooting) --- ## Overview The **Kokoro-82M-v1.1_zh** is a fine-tuned Mandarin Chinese TTS model for high-quality speech synthesis. 
### Key Features - 8 Chinese voices (4 female + 4 male) - Natural Mandarin pronunciation - Adjustable speech speed (0.5x - 2.0x) - Automatic text normalization - Offline operation after setup - Cross-platform support --- ## Installation ### Prerequisites - Python 3.8+ - ~1GB free disk space - Internet connection (for initial download) ### Automated Setup (Recommended) ```bash python setup_chinese_tts.py ``` This script automatically downloads the model and all voice files. ### Manual Setup 1. **Download the model:** ```bash # From Hugging Face git clone https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh # Place kokoro-v1_1-zh.pth in project root ``` 2. **Download voice files** to `voices/` directory: - Female: `zf_xiaobei.pt`, `zf_xiaoni.pt`, `zf_xiaoxiao.pt`, `zf_xiaoyi.pt` - Male: `zm_yunjian.pt`, `zm_yunxi.pt`, `zm_yunxia.pt`, `zm_yunyang.pt` 3. **Install dependencies:** ```bash pip install -r requirements.txt ``` --- ## Quick Start ### Interactive CLI ```bash python chinese_tts_demo.py ``` The interactive menu provides: 1. List available voices 2. Generate speech from custom text 3. Generate from sample texts 4. Help information 5. 
Exit ### Python API ```python from chinese_tts_demo import load_chinese_model, generate_chinese_speech, save_audio import torch # Load model device = 'cuda' if torch.cuda.is_available() else 'cpu' model = load_chinese_model('kokoro-v1_1-zh.pth', device) # Generate speech text = "你好,世界!这是一个测试。" audio, _ = generate_chinese_speech(model, text, 'zf_xiaobei', device, speed=1.0) # Save audio if audio is not None: save_audio(audio, 'output.wav') ``` --- ## Available Chinese Voices ### Female Voices (女性声音) | Voice ID | Name | Description | Quality | |----------|------|-------------|---------| | `zf_xiaobei` | 晓蓓 | Young, energetic | B | | `zf_xiaoni` | 晓妮 | Clear, friendly | B+ | | `zf_xiaoxiao` | 晓晓 | Soft, gentle | B | | `zf_xiaoyi` | 晓艺 | Professional, articulate | A- | ### Male Voices (男性声音) | Voice ID | Name | Description | Quality | |----------|------|-------------|---------| | `zm_yunjian` | 云健 | Strong, confident | B- | | `zm_yunxi` | 云析 | Warm, professional | B+ | | `zm_yunxia` | 云夏 | Calm, steady | B | | `zm_yunyang` | 云阳 | Resonant, deep | B- | ### Recommendations - **Natural speech**: `zf_xiaoyi` (female) or `zm_yunxi` (male) - **Energetic content**: `zf_xiaobei` (female) or `zm_yunjian` (male) - **Gentle/soft content**: `zf_xiaoxiao` (female) or `zm_yunxia` (male) --- ## Troubleshooting ### "WARNING - words count mismatch" **Cause**: Wrong phonemizer language configuration. **Solution**: Use `chinese_tts_demo.py` (not `tts_demo.py`). The code automatically initializes the Chinese phonemizer. ### "Model file not found" **Solution**: Run `python setup_chinese_tts.py` or download manually: ```bash python -c "from huggingface_hub import hf_hub_download; hf_hub_download('hexgrad/Kokoro-82M-v1.1-zh', 'kokoro-v1_1-zh.pth', local_dir='.')" ``` ### "Voice file not found" **Solution**: Run `python setup_chinese_tts.py` to download all voice files automatically. ### "No Chinese phonemizer support" **Solution**: TTS works without phonemizer (no phoneme visualization). 
To install: ```bash pip install phonemizer espeakng-loader # Then install espeak-ng for your platform ``` ### Out of memory errors **Solution**: - System auto-falls back to CPU - Use shorter text segments - Close other applications - Use already-loaded voice files --- ## Advanced Usage ### Text Processing The system automatically handles Chinese character validation, normalization, punctuation, and text segmentation. Use utilities for manual processing: ```python from chinese_config import ChineseTextProcessor # Check if text is Chinese is_chinese = ChineseTextProcessor.is_chinese("你好") # Normalize text normalized = ChineseTextProcessor.normalize_chinese_text("你好 , 世界 !") # Split long text segments = ChineseTextProcessor.split_chinese_text("长文本...", max_length=100) ``` ### Batch Processing ```python texts = ["你好,世界", "欢迎使用中文文本转语音", "这是一个测试"] for i, text in enumerate(texts): audio, _ = generate_chinese_speech(model, text, 'zf_xiaobei', device) if audio is not None: save_audio(audio, f'output_{i}.wav') ``` ### Performance Tips - **First run**: Slower due to model loading - **Voice caching**: Faster subsequent generations - **GPU**: ~3x faster with CUDA - **Memory**: ~400MB when loaded **Typical generation times (with GPU):** - Short text (< 30 chars): ~0.5s - Medium text (30-100 chars): ~1-2s - Long text (100+ chars): ~2-5s ### Offline Usage After initial setup, run offline: ```bash # Linux/macOS export HF_HUB_OFFLINE=1 # Windows PowerShell $env:HF_HUB_OFFLINE="1" # Windows CMD set HF_HUB_OFFLINE=1 python chinese_tts_demo.py ``` ## FAQ **Q: Can I use this with English TTS?** A: Yes, but use different scripts: `tts_demo.py` for English, `chinese_tts_demo.py` for Chinese. **Q: Can I mix Chinese and English text?** A: The system is optimized for pure Chinese text. Mixed text may have lower quality. **Q: How do I improve audio quality?** A: Try different voices, adjust speed, ensure sufficient disk space, and use GPU if available. 
**Q: Is there a REST API?** A: Not yet, but you can modify `gradio_interface.py` to support Chinese. ## Additional Resources - **Kokoro Project**: https://github.com/hexgrad/kokoro - **Model Repository**: https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh - **Main README**: See [README.md](README.md) for general project information --- **Version**: 1.0 | **Last Updated**: 2025 ================================================ FILE: Dockerfile ================================================ FROM python:3.11-slim ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ HF_HOME=/app/.cache/huggingface WORKDIR /app RUN apt-get update \ && apt-get install -y --no-install-recommends \ build-essential \ cmake \ curl \ ffmpeg \ espeak-ng \ libsndfile1 \ && rm -rf /var/lib/apt/lists/* COPY requirements.txt ./ # NOTE: requirements.txt uses unpinned dependencies for flexibility. # For fully reproducible builds, generate a lock file: # pip install -r requirements.txt && pip freeze > requirements-lock.txt # Then replace "requirements.txt" below with "requirements-lock.txt". RUN pip install --no-cache-dir --upgrade pip setuptools wheel \ && pip install --no-cache-dir -r requirements.txt \ && python -m spacy download en_core_web_sm \ && python -c "import spacy; spacy.load('en_core_web_sm'); print('spaCy model OK')" COPY . . 
RUN useradd --create-home --uid 10001 appuser \ && mkdir -p /app/outputs /app/voices /app/.cache \ && chown -R appuser:appuser /app USER appuser EXPOSE 7860 # Allow extra time on first start for model/voice downloads from Hugging Face HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ CMD curl -f http://localhost:7860/ || exit 1 CMD ["python", "gradio_interface.py", "--host", "0.0.0.0", "--port", "7860"] ================================================ FILE: IMPROVEMENTS.md ================================================ # Kokoro TTS Local - Code Improvements Summary This document summarizes all the improvements made to fix the issues identified in the codebase analysis. ## ✅ Completed Improvements ### 1. Replace Monkey Patching with Proper Subclassing **Files Modified:** `models.py`, `gradio_interface.py` - **Issue:** The code was monkey patching `KPipeline.load_voice` and `json.load` functions, which could lead to unexpected behavior. - **Solution:** Created `EnhancedKPipeline` class that properly inherits from `KPipeline` and overrides the `load_voice` method. - **Benefits:** - More maintainable and predictable code - Better error handling and logging - Eliminates potential conflicts with library updates ### 2. Standardize File Path Handling **Files Modified:** `models.py`, `gradio_interface.py`, `tts_demo.py` - **Issue:** Inconsistent use of `os.path` vs `pathlib.Path` across the codebase. - **Solution:** Standardized on using `pathlib.Path` throughout with `.resolve()` for consistent path handling. - **Benefits:** - Better cross-platform compatibility - More readable and maintainable code - Consistent path resolution ### 3. Create Centralized Configuration System **Files Created:** `config.py` - **Issue:** Hardcoded constants scattered across multiple files with inconsistent values. - **Solution:** Created `TTSConfig` class with centralized configuration management. 
- **Features:** - JSON-based configuration with defaults - Dot notation access (e.g., `config.get("audio.sample_rate")`) - Validation methods for common settings - Easy configuration persistence - **Benefits:** - Single source of truth for all settings - Easy customization without code changes - Consistent validation across components ### 4. Fix Format Discrepancy **Files Modified:** `speed_dial.py` - **Issue:** `speed_dial.py` supported "ogg" format while `gradio_interface.py` supported "aac" format. - **Solution:** Standardized on supporting "wav", "mp3", and "aac" formats across all components. - **Benefits:** Consistent format support throughout the application ### 5. Improve Error Handling and Logging **Files Modified:** `models.py`, `gradio_interface.py`, `tts_demo.py` - **Issue:** Inconsistent error messages and reliance on print statements. - **Solution:** - Implemented proper logging with the `logging` module - Added structured error handling with context - Improved user-friendly error messages - **Benefits:** - Better debugging capabilities - Consistent error reporting - Configurable logging levels ### 6. Enhance Voice Download Mechanism **Files Modified:** `models.py` - **Issue:** Sequential downloads with basic retry logic and no progress indication. - **Solution:** - Implemented parallel downloads with `ThreadPoolExecutor` - Added progress bars with `tqdm` - Enhanced retry logic with exponential backoff - Better file integrity checking - **Benefits:** - Faster download times - Better user experience with progress indication - More robust download handling ### 7. Add Dependency Version Checks **Files Created:** `dependency_checker.py` **Files Modified:** `requirements.txt` - **Issue:** No validation of dependency versions or availability. 
- **Solution:** - Created comprehensive dependency checker - Added version validation for all dependencies - CUDA availability detection - Clear installation instructions for missing dependencies - **Benefits:** - Early detection of compatibility issues - Better user guidance for setup - Proactive problem prevention ### 8. Improve Thread Safety **Files Modified:** `models.py` - **Issue:** Potential race conditions in multi-threaded environments (Gradio web interface). - **Solution:** - Added separate locks for different operations (`_voice_cache_lock`, `_download_lock`) - Enhanced thread-safe resource management - Better synchronization for shared resources - **Benefits:** - Safer concurrent operations - Reduced risk of race conditions - Better stability in multi-user scenarios ### 9. Enhance Memory Management **Files Modified:** `gradio_interface.py`, `tts_demo.py` - **Issue:** No memory monitoring or management for large inputs. - **Solution:** - Added memory monitoring with `psutil` - Dynamic text length limits based on available memory - Proactive garbage collection and CUDA cache clearing - Memory warnings for low-memory situations - **Benefits:** - Better handling of resource-constrained environments - Reduced risk of out-of-memory errors - Improved user experience with appropriate warnings ## 📁 New Files Created 1. **`config.py`** - Centralized configuration management system 2. **`dependency_checker.py`** - Comprehensive dependency validation 3. **`IMPROVEMENTS.md`** - This summary document ## 🔧 Files Modified 1. **`models.py`** - Core improvements to pipeline handling, logging, thread safety 2. **`gradio_interface.py`** - Memory management, path standardization, enhanced pipeline usage 3. **`tts_demo.py`** - Memory management, path standardization, improved error handling 4. **`speed_dial.py`** - Format consistency fix 5. 
**`requirements.txt`** - Added version constraints and new dependencies ## 🚀 Key Benefits - **Maintainability:** Cleaner, more organized code structure - **Reliability:** Better error handling and resource management - **Performance:** Parallel downloads, memory optimization, thread safety - **User Experience:** Progress indicators, better error messages, memory warnings - **Compatibility:** Standardized paths, dependency validation, version checking - **Configurability:** Centralized settings management ### 10. Security and Code Quality Improvements **Files Modified:** `models.py`, `gradio_interface.py`, `speed_dial.py`, `tts_demo.py` - **Issue:** Security vulnerabilities and code quality issues including unsafe torch.load usage, public Gradio exposure, and insufficient input validation. - **Solution:** - **Security Fixes:** - Fixed critical `torch.load` security vulnerability by using `weights_only=True` - Removed public exposure of Gradio interface (`share=False`) - Added comprehensive input validation for speed dial presets with regex patterns - Enhanced resource management and cleanup with proper warnings - **Code Quality Improvements:** - Replaced hardcoded values with named constants (`MAX_TEXT_LENGTH`, `DEFAULT_SAMPLE_RATE`, etc.) 
- Added missing type hints for better code safety and IDE support - Enhanced race condition protection with proper locking mechanisms - Improved error handling consistency with specific error types - Added proper warning suppression for model-related deprecation warnings - **Benefits:** - **Security:** Protection against arbitrary code execution via malicious model files - **Privacy:** Prevents accidental public exposure of the interface - **Reliability:** Better input validation prevents crashes and unexpected behavior - **Maintainability:** Named constants and type hints improve code readability - **Stability:** Enhanced thread safety and error handling ## 📋 Usage Notes ### Running Dependency Check ```bash python dependency_checker.py ``` ### Using Configuration System ```python from config import config sample_rate = config.get("audio.sample_rate") config.set("audio.sample_rate", 48000) config.save() ``` ### Memory Monitoring The system now automatically monitors memory usage and adjusts behavior accordingly: - Reduces text limits on low memory systems - Provides warnings when memory is low - Automatically triggers garbage collection when needed All improvements maintain backward compatibility while significantly enhancing the robustness and maintainability of the codebase. 
## 📈 Recent Updates (July 2025) ### Latest Commits Summary #### v1.0.3 - Enhanced Audio Processing Support **Commit:** `ca106b3` - feat(deps): add torchaudio for enhanced audio processing **Date:** July 19, 2025 - **Added:** `torchaudio` dependency to requirements.txt - **Purpose:** Provides comprehensive PyTorch audio processing capabilities - **Benefits:** Enhanced audio handling, better format support, improved compatibility with PyTorch ecosystem #### v1.0.2 - Comprehensive System Improvements **Commit:** `14fc956` - feat: add comprehensive system improvements and documentation **Date:** July 19, 2025 Major improvements including all the fixes documented above: - Centralized configuration management system (`config.py`) - Dependency validation and system checks (`dependency_checker.py`) - Enhanced security with proper torch.load usage and input validation - Improved code quality with type hints and named constants - Memory management and monitoring capabilities - Enhanced pipeline with better error handling - Parallel downloads with progress tracking - Standardized path handling across all components #### v1.0.1 - Dependency Flexibility **Commit:** `41c8da8` - remove version constraints from requirements.txt **Date:** July 19, 2025 - **Changed:** Removed strict version constraints from all dependencies - **Benefits:** Better compatibility with different Python environments, reduced conflicts, easier installation ### Windows Host Resolution Fix (Current Session) **Issue:** Empty UI on Windows due to `0.0.0.0` host resolution problems **Solution:** Added flexible command-line argument parsing - **Added:** `argparse` support for `--host` and `--port` arguments - **Changed:** Default host from `0.0.0.0` to `127.0.0.1` - **Usage:** ```bash python gradio_interface.py --port 8000 # Custom port python gradio_interface.py --host 0.0.0.0 # Custom host ``` - **Benefits:** Resolves Windows issues, provides deployment flexibility, enables multiple instances 
================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS Copyright 2025 PierrunoYT (Kokoro TTS Local) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Kokoro TTS Local A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading, automatic dependency management, and a web interface. ## Features - Local text-to-speech synthesis using the Kokoro-82M model - Multiple voice support with easy voice selection (54 voices available across 8 languages) - Automatic model and voice downloading from Hugging Face - **Offline mode support** - Run completely offline after initial setup - Phoneme output support and visualization - Interactive CLI and web interface - Voice listing functionality - Cross-platform support (Windows, Linux, macOS) - Real-time generation progress display - Multiple output formats (WAV, MP3, AAC) - Enhanced security and code quality features - Centralized configuration management - Comprehensive dependency validation - Memory management and optimization - Thread-safe operations for multi-user scenarios ## Recent Improvements This project has been significantly enhanced with security and code quality improvements: ### 🔒 Security Enhancements - **Fixed critical security vulnerability** in model loading by using `weights_only=True` for `torch.load` - **Removed public exposure** of Gradio interface (`share=False`) to prevent accidental public access - **Added comprehensive input 
validation** for all user inputs with regex pattern matching - **Enhanced resource management** with proper cleanup and warning systems ### 🛠️ Code Quality Improvements - **Replaced hardcoded values** with named constants for better maintainability - **Added comprehensive type hints** throughout the codebase for better IDE support and safety - **Enhanced thread safety** with proper locking mechanisms for concurrent operations - **Improved error handling** with specific error types and consistent messaging - **Added proper warning suppression** for model-related deprecation warnings ### 📁 New Components - **`config.py`** - Centralized configuration management system - **`dependency_checker.py`** - Comprehensive dependency validation and CUDA detection - **`IMPROVEMENTS.md`** - Detailed documentation of all enhancements For complete details, see [`IMPROVEMENTS.md`](IMPROVEMENTS.md). ## Prerequisites - Python 3.8 or higher - FFmpeg (optional, for MP3/AAC conversion) - CUDA-compatible GPU (optional, for faster generation) - Git (for version control and package management) ## Installation 1. Clone the repository and create a Python virtual environment: ```bash # Windows python -m venv venv .\venv\Scripts\activate # Linux/macOS python3 -m venv venv source venv/bin/activate ``` 2. Install dependencies: ```bash pip install -r requirements.txt ``` **Alternative Installation (Simplified):** For a simpler setup, you can also install the official Kokoro package directly: ```bash pip install kokoro soundfile apt-get install espeak-ng # On Linux # or brew install espeak # On macOS ``` 3. 
(Optional) For GPU acceleration, install PyTorch with CUDA support: ```bash # For CUDA 11.8 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 # For CUDA 12.1 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 # For CUDA 12.6 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 # For CUDA 12.8 (for RTX 50-series cards) pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 ``` You can verify CUDA support is enabled with: ```python import torch print(torch.cuda.is_available()) # Should print True if CUDA is available ``` The system will automatically download required models and voice files on first run. ## Docker Quick Start This project can be run in a CPU-first Docker setup with runtime model and voice downloads. ### Build and Run with Docker **Linux/macOS (bash/zsh):** ```bash docker build -t kokoro-tts-local:cpu . docker run --rm -it \ -p 7860:7860 \ -v "$(pwd)/outputs:/app/outputs" \ -v "$(pwd)/voices:/app/voices" \ -v "$(pwd)/.cache:/app/.cache" \ kokoro-tts-local:cpu ``` **Windows (PowerShell):** ```powershell docker build -t kokoro-tts-local:cpu . docker run --rm -it ` -p 7860:7860 ` -v "${PWD}/outputs:/app/outputs" ` -v "${PWD}/voices:/app/voices" ` -v "${PWD}/.cache:/app/.cache" ` kokoro-tts-local:cpu ``` Open `http://localhost:7860` in your browser. ### Run with Docker Compose ```bash docker compose up --build ``` ### Docker Notes - First startup can take longer because model and voice files are downloaded from Hugging Face. - Volumes for `outputs`, `voices`, and `.cache` are recommended so downloads and generated audio persist across restarts. - The Docker image pre-installs `en_core_web_sm` during build to avoid non-root runtime initialization errors. - This initial Docker support is CPU-first. GPU and pre-baked model image variants are intentionally out of scope for this first implementation. 
- To force offline mode after assets are downloaded, set `HF_HUB_OFFLINE=1` in your Docker environment. ## Offline Mode After the initial setup, you can run Kokoro-TTS-Local completely offline without an internet connection. ### Quick Start - Offline Mode **Linux/macOS:** ```bash export HF_HUB_OFFLINE=1 python tts_demo.py ``` **Windows (PowerShell):** ```powershell $env:HF_HUB_OFFLINE="1" python tts_demo.py ``` **Windows (Command Prompt):** ```cmd set HF_HUB_OFFLINE=1 python tts_demo.py ``` ### Requirements for Offline Mode Before enabling offline mode, ensure you have: 1. Run the application at least once with internet connection 2. Downloaded the model file (`kokoro-v1_0.pth`) 3. Downloaded the config file (`config.json`) 4. Downloaded at least one voice file in the `voices/` directory ### Testing Offline Mode Use the provided test script to verify your offline setup: ```bash export HF_HUB_OFFLINE=1 # Enable offline mode python test_offline.py # Run the test ``` The script checks: - Offline mode environment variables are set - Required files exist (`kokoro-v1_0.pth`, `config.json`, `voices/`) - All required Python packages are installed - Model initializes correctly - Voices can be listed - Speech can be generated and saved For detailed offline usage instructions, set `HF_HUB_OFFLINE=1` before running and use `test_offline.py` to verify your setup. ## Usage You can use either the command-line interface or the web interface: ### Command Line Interface Run the interactive CLI: ```bash python tts_demo.py ``` The CLI provides an interactive menu with the following options: 1. List available voices - Shows all available voice options 2. Generate speech - Interactive process to: - Select a voice from the numbered list - Enter text to convert to speech - Adjust speech speed (0.5-2.0) 3. Exit - Quit the program Example session: ``` === Kokoro TTS Menu === 1. List available voices 2. Generate speech 3. Exit Select an option (1-3): 2 Available voices: 1. af_alloy 2. 
af_aoede 3. af_bella ... Select a voice number (or press Enter for default 'af_bella'): 3 Enter the text you want to convert to speech (or press Enter for default text) > Hello, world! Enter speech speed (0.1-3.0, default 1.0): 1.2 Generating speech for: 'Hello, world!' Using voice: af_bella Speed: 1.2x ... ``` ### Web Interface For a more user-friendly experience, launch the web interface: ```bash python gradio_interface.py ``` Then open your browser to the URL shown in the console (typically http://localhost:7860). The web interface provides: - Easy voice selection from a dropdown menu - Text input field with examples - Speed control slider (0.5–2.0x) - Output format selection (WAV, MP3, AAC) - Real-time generation progress - Audio playback in the browser - Download options for generated audio - **Speed Dial presets** — save, load, and delete frequently used voice/text/speed combinations ### Dependency Validation Before running the application, you can validate your system setup: ```bash python dependency_checker.py ``` This will check: - Python version compatibility - All required dependencies and their versions - CUDA availability and GPU detection - System memory and disk space - Audio system functionality ### Configuration Management The system now includes centralized configuration management: ```python from config import config # Get configuration values sample_rate = config.get("audio.sample_rate") max_text_length = config.get("limits.max_text_length") # Set configuration values config.set("audio.sample_rate", 48000) config.set("interface.auto_play", True) # Save configuration config.save() ``` Configuration files are automatically created with sensible defaults. 
## Available Voices The system includes 54 different voices across 8 languages: ### 🇺🇸 American English (20 voices) **Language code: 'a'** **Female voices (af_*):** - af_heart: ❤️ Premium quality voice (Grade A) - af_alloy: Clear and professional (Grade C) - af_aoede: Smooth and melodic (Grade C+) - af_bella: 🔥 Warm and friendly (Grade A-) - af_jessica: Natural and engaging (Grade D) - af_kore: Bright and energetic (Grade C+) - af_nicole: 🎧 Professional and articulate (Grade B-) - af_nova: Modern and dynamic (Grade C) - af_river: Soft and flowing (Grade D) - af_sarah: Casual and approachable (Grade C+) - af_sky: Light and airy (Grade C-) **Male voices (am_*):** - am_adam: Strong and confident (Grade F+) - am_echo: Resonant and clear (Grade D) - am_eric: Professional and authoritative (Grade D) - am_fenrir: Deep and powerful (Grade C+) - am_liam: Friendly and conversational (Grade D) - am_michael: Warm and trustworthy (Grade C+) - am_onyx: Rich and sophisticated (Grade D) - am_puck: Playful and energetic (Grade C+) - am_santa: Holiday-themed voice (Grade D-) ### 🇬🇧 British English (8 voices) **Language code: 'b'** **Female voices (bf_*):** - bf_alice: Refined and elegant (Grade D) - bf_emma: Warm and professional (Grade B-) - bf_isabella: Sophisticated and clear (Grade C) - bf_lily: Sweet and gentle (Grade D) **Male voices (bm_*):** - bm_daniel: Polished and professional (Grade D) - bm_fable: Storytelling and engaging (Grade C) - bm_george: Classic British accent (Grade C) - bm_lewis: Modern British accent (Grade D+) ### 🇯🇵 Japanese (5 voices) **Language code: 'j'** **Female voices (jf_*):** - jf_alpha: Standard Japanese female (Grade C+) - jf_gongitsune: Based on classic tale (Grade C) - jf_nezumi: Mouse bride tale voice (Grade C-) - jf_tebukuro: Glove story voice (Grade C) **Male voices (jm_*):** - jm_kumo: Spider thread tale voice (Grade C-) ### 🇨🇳 Mandarin Chinese (8 voices) **Language code: 'z'** **Female voices (zf_*):** - zf_xiaobei: Chinese female voice 
(Grade D) - zf_xiaoni: Chinese female voice (Grade D) - zf_xiaoxiao: Chinese female voice (Grade D) - zf_xiaoyi: Chinese female voice (Grade D) **Male voices (zm_*):** - zm_yunjian: Chinese male voice (Grade D) - zm_yunxi: Chinese male voice (Grade D) - zm_yunxia: Chinese male voice (Grade D) - zm_yunyang: Chinese male voice (Grade D) **Note:** Run `python setup_chinese_tts.py` to download the Chinese model and voice files automatically. For full usage details see [CHINESE_TTS_GUIDE.md](CHINESE_TTS_GUIDE.md) or [README_CHINESE_TTS.md](README_CHINESE_TTS.md). ### 🇪🇸 Spanish (3 voices) **Language code: 'e'** **Female voices (ef_*):** - ef_dora: Spanish female voice **Male voices (em_*):** - em_alex: Spanish male voice - em_santa: Spanish holiday voice ### 🇫🇷 French (1 voice) **Language code: 'f'** **Female voices (ff_*):** - ff_siwis: French female voice (Grade B-) ### 🇮🇳 Hindi (4 voices) **Language code: 'h'** **Female voices (hf_*):** - hf_alpha: Hindi female voice (Grade C) - hf_beta: Hindi female voice (Grade C) **Male voices (hm_*):** - hm_omega: Hindi male voice (Grade C) - hm_psi: Hindi male voice (Grade C) ### 🇮🇹 Italian (2 voices) **Language code: 'i'** **Female voices (if_*):** - if_sara: Italian female voice (Grade C) **Male voices (im_*):** - im_nicola: Italian male voice (Grade C) ### 🇧🇷 Brazilian Portuguese (3 voices) **Language code: 'p'** **Female voices (pf_*):** - pf_dora: Portuguese female voice **Male voices (pm_*):** - pm_alex: Portuguese male voice - pm_santa: Portuguese holiday voice **Note:** Quality grades (A to F) indicate the overall quality based on training data quality and duration. Higher grades generally produce better speech quality. ## Project Structure ``` . 
├── .cache/ # Cache directory for downloaded models │ └── huggingface/ # Hugging Face model cache ├── .git/ # Git repository data ├── .gitignore # Git ignore rules ├── __pycache__/ # Python cache files ├── voices/ # Voice model files (downloaded on demand) │ └── *.pt # Individual voice files ├── venv/ # Python virtual environment ├── outputs/ # Generated audio files directory ├── LICENSE # Apache 2.0 License file ├── README.md # Project documentation ├── README_CHINESE_TTS.md # Chinese TTS quick reference ├── CHINESE_TTS_GUIDE.md # Complete Chinese TTS guide ├── IMPROVEMENTS.md # Detailed improvement documentation ├── models.py # Core TTS model implementation ├── gradio_interface.py # Web interface implementation ├── tts_demo.py # CLI implementation (English) ├── chinese_tts_demo.py # CLI implementation (Chinese, 5-option menu) ├── chinese_config.py # Chinese text processing and voice configuration ├── setup_chinese_tts.py # Downloads Chinese model and voice files ├── config.py # Centralized configuration management ├── dependency_checker.py # Dependency validation and system checks ├── speed_dial.py # Speed Dial preset management (save/load/delete) ├── test_offline.py # Offline mode verification script └── requirements.txt # Python dependencies (no version constraints) ``` ## Model Information The project uses the latest Kokoro model from Hugging Face: - Repository: [hexgrad/Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) - Model file: `kokoro-v1_0.pth` (downloaded automatically) - Sample rate: 24kHz - Voice files: Located in the `voices/` directory (downloaded automatically) - Available voices: 54 voices across 8 languages - Languages: American English ('a'), British English ('b'), Japanese ('j'), Mandarin Chinese ('z'), Spanish ('e'), French ('f'), Hindi ('h'), Italian ('i'), Brazilian Portuguese ('p') - Model size: 82M parameters ## Troubleshooting Common issues and solutions: ### Quick System Check First, run the dependency checker to identify potential 
issues: ```bash python dependency_checker.py ``` This will automatically detect and report: - Missing or incompatible dependencies - CUDA/GPU configuration issues - System resource problems - Audio system issues ### Common Issues 1. **Offline Mode / Network Connection Issues** - **Problem:** Getting "Failed to resolve 'huggingface.co'" errors even with cached files - **Solution:** Enable offline mode with `export HF_HUB_OFFLINE=1` (Linux/macOS) or `$env:HF_HUB_OFFLINE="1"` (Windows) - **Verify:** Run `python test_offline.py` to confirm your offline setup is working 2. **Model Download Issues** - Ensure stable internet connection - Check Hugging Face is accessible - Verify sufficient disk space - Try clearing the `.cache/huggingface` directory 3. **CUDA/GPU Issues** - Verify CUDA installation with `nvidia-smi` - Update GPU drivers - Install PyTorch with CUDA support using the appropriate command: ```bash # For CUDA 11.8 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 # For CUDA 12.1 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 # For CUDA 12.6 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 # For CUDA 12.8 (for RTX 50-series cards) pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 ``` - Verify CUDA is available in PyTorch: ```python import torch print(torch.cuda.is_available()) # Should print True ``` - Fall back to CPU if needed 4. **Audio Output Issues** - Check system audio settings - Verify output directory permissions - Install FFmpeg for MP3/AAC support - Try different output formats 5. **Voice File Issues** - Delete and let system redownload voice files - Check `voices/` directory permissions - Verify voice file integrity - Try using a different voice 6. 
**Web Interface Issues** - Check port 7860 availability - Try different browser - Clear browser cache - Check network firewall settings For any other issues: 1. Check the console output for error messages 2. Verify all prerequisites are installed 3. Ensure virtual environment is activated 4. Check system resource usage 5. Try reinstalling dependencies ## Contributing Feel free to contribute by: 1. Opening issues for bugs or feature requests 2. Submitting pull requests with improvements 3. Helping with documentation 4. Testing different voices and reporting issues 5. Suggesting new features or optimizations 6. Testing on different platforms and reporting results ## License Apache 2.0 - See LICENSE file for details ================================================ FILE: README_CHINESE_TTS.md ================================================ # Kokoro Chinese TTS - Quick Reference Quick start guide for Chinese TTS. For complete documentation, see [CHINESE_TTS_GUIDE.md](CHINESE_TTS_GUIDE.md). ## Quick Start ```bash # 1. Setup (downloads model and voices) python setup_chinese_tts.py # 2. Run interactive demo python chinese_tts_demo.py ``` ## Python API ```python from chinese_tts_demo import load_chinese_model, generate_chinese_speech, save_audio import torch # Load model device = 'cuda' if torch.cuda.is_available() else 'cpu' model = load_chinese_model('kokoro-v1_1-zh.pth', device) # Generate speech audio, _ = generate_chinese_speech( model, "你好,世界!", # Your Chinese text 'zf_xiaobei', # Voice ID device, speed=1.0 ) # Save audio if audio is not None: save_audio(audio, 'output.wav') ``` ## Available Voices **Female (女性)**: `zf_xiaobei`, `zf_xiaoni`, `zf_xiaoxiao`, `zf_xiaoyi` **Male (男性)**: `zm_yunjian`, `zm_yunxi`, `zm_yunxia`, `zm_yunyang` **Recommended**: `zf_xiaoyi` (female) or `zm_yunxi` (male) for natural speech. 
## Key Features - ✅ 8 Chinese voices (4 female + 4 male) - ✅ Natural Mandarin pronunciation - ✅ Adjustable speed (0.5x - 2.0x) - ✅ Offline operation after setup - ✅ Cross-platform support ## Troubleshooting | Issue | Solution | |-------|----------| | Model not found | Run `python setup_chinese_tts.py` | | Voice files missing | Run `python setup_chinese_tts.py` | | "words count mismatch" warning | Use `chinese_tts_demo.py` (not `tts_demo.py`) | | Out of memory | System auto-falls back to CPU | ## Documentation - **Complete Guide**: [CHINESE_TTS_GUIDE.md](CHINESE_TTS_GUIDE.md) - **Main README**: [README.md](README.md) --- **Version**: 1.0 | **Model**: Kokoro-82M-v1.1_zh ================================================ FILE: chinese_config.py ================================================ """ Chinese TTS Configuration Module for Kokoro-v1.1-zh ==================================================== This module provides specialized configuration and utilities for the Kokoro Chinese TTS model. It handles Chinese-specific phonemization, text processing, and voice management. 
""" import os import json from pathlib import Path from typing import Dict, Any, Optional, List import logging logger = logging.getLogger(__name__) # Chinese language code CHINESE_LANG_CODE = 'z' # Chinese Model Configuration CHINESE_MODEL_CONFIG = { "model_name": "Kokoro-v1.1-zh", "model_file": "kokoro-v1_1-zh.pth", "repo_id": "hexgrad/Kokoro-82M-v1.1-zh", "language_code": 'z', "description": "Kokoro 82M Chinese (Mandarin) TTS Model v1.1", "phonemizer": "espeak-zh", # Specialized Chinese phonemizer "sample_rate": 24000, "voice_prefix": ["zf_", "zm_"], # Chinese female (zf_) and male (zm_) voices } # Chinese Voice Files - 8 voices total (4 female + 4 male) CHINESE_VOICES = { # Female voices "zf_xiaobei": { "name": "晓蓓", "gender": "Female", "description": "Young, energetic female voice", "language": "Mandarin Chinese", "file": "zf_xiaobei.pt" }, "zf_xiaoni": { "name": "晓妮", "gender": "Female", "description": "Clear, friendly female voice", "language": "Mandarin Chinese", "file": "zf_xiaoni.pt" }, "zf_xiaoxiao": { "name": "晓晓", "gender": "Female", "description": "Soft, gentle female voice", "language": "Mandarin Chinese", "file": "zf_xiaoxiao.pt" }, "zf_xiaoyi": { "name": "晓艺", "gender": "Female", "description": "Professional, articulate female voice", "language": "Mandarin Chinese", "file": "zf_xiaoyi.pt" }, # Male voices "zm_yunjian": { "name": "云健", "gender": "Male", "description": "Strong, confident male voice", "language": "Mandarin Chinese", "file": "zm_yunjian.pt" }, "zm_yunxi": { "name": "云析", "gender": "Male", "description": "Warm, professional male voice", "language": "Mandarin Chinese", "file": "zm_yunxi.pt" }, "zm_yunxia": { "name": "云夏", "gender": "Male", "description": "Calm, steady male voice", "language": "Mandarin Chinese", "file": "zm_yunxia.pt" }, "zm_yunyang": { "name": "云阳", "gender": "Male", "description": "Resonant, deep male voice", "language": "Mandarin Chinese", "file": "zm_yunyang.pt" } } class ChineseTextProcessor: """Handle 
Chinese-specific text processing and normalization""" @staticmethod def is_chinese(text: str) -> bool: """Check if text contains Chinese characters""" for char in text: if '\u4e00' <= char <= '\u9fff': # Unicode range for CJK unified ideographs return True return False @staticmethod def normalize_chinese_text(text: str) -> str: """Normalize Chinese text for TTS processing""" # Remove extra whitespace text = ' '.join(text.split()) # Ensure proper spacing around punctuation import re # Add space after sentence punctuation, removing any existing spaces first. # Keep brackets/quotes untouched to avoid introducing awkward spaces. text = re.sub(r"\s*([。,!?;:])\s*", r"\1 ", text) # Clean up any double spaces that may have been created text = ' '.join(text.split()) return text.strip() @staticmethod def split_chinese_text(text: str, max_length: int = 100) -> List[str]: """Split Chinese text into proper segments for TTS processing Args: text: Chinese text to split max_length: Maximum characters per segment Returns: List of text segments """ segments = [] current_segment = "" for char in text: current_segment += char # Split on punctuation or max length if char in '。!?;,\n' or len(current_segment) >= max_length: if current_segment.strip(): segments.append(current_segment.strip()) current_segment = "" # Add remaining text if current_segment.strip(): segments.append(current_segment.strip()) return segments class ChineseTTSConfig: """Specialized configuration manager for Chinese TTS""" def __init__(self, config_file: Optional[str] = None): self.config_file = Path(config_file or "chinese_tts_config.json").resolve() self.chinese_voices_dir = Path("voices").resolve() self._config = self._load_default_config() self._load_config_file() def _load_default_config(self) -> Dict[str, Any]: """Load default configuration values for Chinese TTS""" return { "model": CHINESE_MODEL_CONFIG, "voices": CHINESE_VOICES, "phonemizer": { "backend": "espeak-ng", "language": "zh", # Chinese language 
code for espeak "preserve_punctuation": True, "strip": False }, "text_processing": { "normalize": True, "split_long_text": True, "max_segment_length": 100, "min_segment_length": 10 }, "audio": { "sample_rate": 24000, "default_speed": 1.0, "min_speed": 0.5, "max_speed": 2.0 }, "paths": { "voices_dir": "voices", "models_dir": ".", "output_dir": "outputs" } } def _load_config_file(self): """Load configuration from file if it exists""" if self.config_file.exists(): try: with open(self.config_file, 'r', encoding='utf-8') as f: file_config = json.load(f) self._merge_config(file_config) logger.info(f"Loaded Chinese TTS configuration from {self.config_file}") except (json.JSONDecodeError, IOError) as e: logger.warning(f"Failed to load Chinese config file {self.config_file}: {e}") def _merge_config(self, file_config: Dict[str, Any]): """Merge file configuration with default configuration""" def merge_dict(default: Dict, override: Dict): for key, value in override.items(): if key in default and isinstance(default[key], dict) and isinstance(value, dict): merge_dict(default[key], value) else: default[key] = value merge_dict(self._config, file_config) def get(self, key: str, default: Any = None) -> Any: """Get configuration value using dot notation""" keys = key.split('.') value = self._config for k in keys: if isinstance(value, dict) and k in value: value = value[k] else: return default return value def set(self, key: str, value: Any): """Set configuration value using dot notation""" keys = key.split('.') config = self._config for k in keys[:-1]: if k not in config: config[k] = {} config = config[k] config[keys[-1]] = value def save(self): """Save current configuration to file""" try: self.config_file.parent.mkdir(parents=True, exist_ok=True) with open(self.config_file, 'w', encoding='utf-8') as f: json.dump(self._config, f, indent=2, ensure_ascii=False) logger.info(f"Chinese TTS configuration saved to {self.config_file}") except IOError as e: logger.error(f"Failed to save 
Chinese TTS configuration: {e}") def get_voices_list(self) -> List[str]: """Get list of available Chinese voices""" return list(CHINESE_VOICES.keys()) def get_voice_info(self, voice_name: str) -> Optional[Dict[str, Any]]: """Get information about a specific voice""" return CHINESE_VOICES.get(voice_name) def ensure_voices_directory(self): """Ensure Chinese voices directory exists""" self.chinese_voices_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Chinese voices directory ready: {self.chinese_voices_dir}") def validate_chinese_model(self, model_path: str) -> bool: """Validate Chinese model file""" model_file = Path(model_path).resolve() if not model_file.exists(): logger.error(f"Chinese model file not found: {model_file}") return False # Basic file size check (should be > 100MB) if model_file.stat().st_size < 100 * 1024 * 1024: logger.warning(f"Model file size seems too small: {model_file.stat().st_size}") return True # Global configuration instance for Chinese TTS chinese_config = ChineseTTSConfig() # Convenience functions def get_chinese_config(key: str, default: Any = None) -> Any: """Get Chinese TTS configuration value""" return chinese_config.get(key, default) def get_chinese_voices() -> List[str]: """Get list of available Chinese voices""" voices_dir = chinese_config.chinese_voices_dir if not voices_dir.exists(): return [] available_voice_names = {voice_path.stem for voice_path in voices_dir.glob("*.pt")} return [voice_name for voice_name in CHINESE_VOICES if voice_name in available_voice_names] def get_chinese_voice_info(voice_name: str) -> Optional[Dict[str, Any]]: """Get information about a specific Chinese voice""" return CHINESE_VOICES.get(voice_name) def is_chinese_text(text: str) -> bool: """Check if text is in Chinese""" return ChineseTextProcessor.is_chinese(text) def normalize_chinese(text: str) -> str: """Normalize Chinese text""" return ChineseTextProcessor.normalize_chinese_text(text) def split_chinese_text(text: str, max_length: int = 100) 
-> List[str]: """Split Chinese text into segments""" return ChineseTextProcessor.split_chinese_text(text, max_length) ================================================ FILE: chinese_tts_demo.py ================================================ """ Chinese TTS Demo - Interactive CLI for Kokoro Chinese TTS Model ================================================================ This script provides an interactive command-line interface for the Kokoro-v1.1-zh Chinese TTS model. It handles Chinese-specific text processing and voice selection. Usage: python chinese_tts_demo.py Requirements: - kokoro-v1_1-zh.pth model file - Chinese voice files in voices/ directory - All dependencies from requirements.txt """ import torch import os import sys import time import logging from pathlib import Path from typing import Optional, List, Tuple, Union import soundfile as sf import numpy as np # Set up logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Import from local modules from models import build_model, generate_speech, EnhancedKPipeline from chinese_config import ( ChineseTextProcessor, ChineseTTSConfig, CHINESE_VOICES, get_chinese_voices, get_chinese_voice_info ) from config import TTSConfig # Constants DEFAULT_CHINESE_MODEL = "kokoro-v1_1-zh.pth" DEFAULT_CHINESE_OUTPUT = "output_chinese.wav" SAMPLE_RATE = 24000 MIN_SPEED = 0.5 MAX_SPEED = 2.0 DEFAULT_SPEED = 1.0 # Sample Chinese texts for testing SAMPLE_CHINESE_TEXTS = { "1": { "title": "北风与太阳 (The North Wind and the Sun)", "text": "你当旅行者裹着温暖的斗篷走来时,北风和太阳之争更强。他们同意,一个首先成功使旅行者脱下斗篷的人应该被认为比另一个更强大。然后,北风吹得尽力而为,但吹得越厉害,旅行者就越披上斗篷。最后,北风放弃了这一尝试。然后,太阳温暖地照耀着,旅行者立刻脱下了斗篷。因此,北风不得不承认太阳是两者中最强的一个。" }, "2": { "title": "简短测试 (Short Test)", "text": "你好,这是一个中文文本转语音测试。" }, "3": { "title": "自定义输入 (Custom Input)", "text": None # Will be entered by user } } def print_chinese_header(): """Print application header""" print("\n" + "="*60) print(" Kokoro-82M-v1.1 Chinese 
TTS Demo") print(" 科克罗中文文本转语音演示") print("="*60 + "\n") def print_menu(): """Print the main menu options in Chinese""" print("\n" + "-"*40) print(" 主菜单 (Main Menu)") print("-"*40) print("1. 列出可用声音 (List available voices)") print("2. 生成语音 (Generate speech)") print("3. 从样本文本生成 (Generate from sample text)") print("4. 帮助 (Help)") print("5. 退出 (Exit)") print("-"*40) return input("请选择一个选项 (Select an option) (1-5): ").strip() def print_help(): """Print help information""" print("\n" + "="*60) print("帮助信息 (Help Information)") print("="*60) print(""" 关于本程序 (About this program): 这是一个中文TTS演示程序,使用Kokoro-82M-v1.1中文模型。 This is a Chinese TTS demo using the Kokoro-82M-v1.1 Chinese model. 功能 (Features): - 支持8个中文女性和男性声音 (Supports 8 Chinese female and male voices) - 可调节语速 (Adjustable speech speed) - 支持自定义和预设文本 (Supports custom and preset texts) - 自动文本处理和分割 (Automatic text processing and segmentation) 声音列表 (Voice List): 女性声音 (Female voices): zf_xiaobei - 晓蓓 (Young, energetic) zf_xiaoni - 晓妮 (Clear, friendly) zf_xiaoxiao - 晓晓 (Soft, gentle) zf_xiaoyi - 晓艺 (Professional, articulate) 男性声音 (Male voices): zm_yunjian - 云健 (Strong, confident) zm_yunxi - 云析 (Warm, professional) zm_yunxia - 云夏 (Calm, steady) zm_yunyang - 云阳 (Resonant, deep) 常见问题 (FAQ): Q: 提示"字数不匹配" (Word count mismatch warning)? A: 这通常是因为英文音素化器被用于中文文本。 请确保使用正确的中文模型和配置。 Q: 生成的音频质量不好? A: 尝试调整语速,使用不同的声音。 确保模型和声音文件完整。 """) print("="*60 + "\n") def list_chinese_voices(): """List all available Chinese voices with details""" print("\n" + "-"*60) print("可用声音 (Available Chinese Voices)") print("-"*60) voices = get_chinese_voices() # Organize by gender female_voices = [v for v in voices if v.startswith('zf_')] male_voices = [v for v in voices if v.startswith('zm_')] print("\n女性声音 (Female Voices):") for i, voice in enumerate(female_voices, 1): info = get_chinese_voice_info(voice) print(f" {i}. 
{voice} - {info['name']} ({info['description']})") print("\n男性声音 (Male Voices):") for i, voice in enumerate(male_voices, 1): info = get_chinese_voice_info(voice) print(f" {i+len(female_voices)}. {voice} - {info['name']} ({info['description']})") print("-"*60 + "\n") def select_voice(voices: List[str]) -> str: """Interactive voice selection""" print("\n可用声音 (Available voices):") for i, voice in enumerate(voices, 1): info = get_chinese_voice_info(voice) print(f"{i}. {voice} - {info['name']} ({info['description']})") while True: try: choice = input("\n请选择一个声音编号 (Select a voice number) (or press Enter for 'zf_xiaobei'): ").strip() if not choice: return "zf_xiaobei" choice = int(choice) if 1 <= choice <= len(voices): return voices[choice - 1] print(f"无效选择。请输入1到{len(voices)}之间的数字。(Invalid choice. Please try again.)") except ValueError: print("请输入有效的数字。(Please enter a valid number.)") def get_chinese_text_input() -> str: """Get Chinese text input from user""" print("\n请输入要转换为语音的中文文本") print("(Enter the Chinese text you want to convert to speech)") print("(or press Enter to exit)") text = input("> ").strip() return text def get_speech_speed() -> float: """Get speech speed from user""" while True: try: speed = input(f"\n请输入语速 (Enter speech speed) ({MIN_SPEED}-{MAX_SPEED}, default {DEFAULT_SPEED}): ").strip() if not speed: return DEFAULT_SPEED speed = float(speed) if MIN_SPEED <= speed <= MAX_SPEED: return speed print(f"语速必须在 {MIN_SPEED} 和 {MAX_SPEED} 之间。(Speed must be between {MIN_SPEED} and {MAX_SPEED})") except ValueError: print("请输入有效的数字。(Please enter a valid number.)") def select_sample_text() -> Optional[str]: """Select from predefined sample texts""" print("\n选择样本文本 (Select sample text):") for key, sample in SAMPLE_CHINESE_TEXTS.items(): print(f"{key}. 
{sample['title']}") if sample["text"]: print(f" {sample['text'][:50]}...") choice = input("\n请选择 (Select): ").strip() if choice in SAMPLE_CHINESE_TEXTS: if SAMPLE_CHINESE_TEXTS[choice]["text"]: return SAMPLE_CHINESE_TEXTS[choice]["text"] else: # Custom input option return get_chinese_text_input() return None def load_chinese_model(model_path: str, device: str) -> EnhancedKPipeline: """Load the Chinese TTS model Args: model_path: Path to the Chinese model file device: Device to use ('cuda' or 'cpu') Returns: EnhancedKPipeline instance configured for Chinese """ try: # Check if model file exists model_file = Path(model_path).resolve() if not model_file.exists(): print(f"错误: 找不到模型文件 (Error: Model file not found): {model_file}") print(f"请确保您已下载 {DEFAULT_CHINESE_MODEL}") raise FileNotFoundError(f"Chinese model not found: {model_file}") # Build model with Chinese language code logger.info(f"加载中文模型 (Loading Chinese model): {model_path}") # Import build_model to use with Chinese config from models import build_model # We'll use language code 'z' for Chinese (Mandarin) # Create a custom pipeline for Chinese pipeline = build_model(model_path, device, repo_version="main", lang_code='z') logger.info("中文模型加载成功 (Chinese model loaded successfully)") return pipeline except Exception as e: logger.error(f"加载中文模型时出错 (Error loading Chinese model): {e}") raise def generate_chinese_speech( model: EnhancedKPipeline, text: str, voice: str, device: str = 'cpu', speed: float = 1.0 ) -> Tuple[Optional[np.ndarray], Optional[str]]: """Generate speech for Chinese text Args: model: EnhancedKPipeline instance text: Chinese text to synthesize voice: Voice name (e.g., 'zf_xiaobei') device: Device to use speed: Speech speed multiplier Returns: Tuple of (audio_data, phonemes) or (None, None) on error """ try: # Check if text contains Chinese characters if not ChineseTextProcessor.is_chinese(text): print("警告: 文本可能不是中文 (Warning: Text may not be Chinese)") # Normalize Chinese text text = 
ChineseTextProcessor.normalize_chinese_text(text) logger.info(f"已规范化文本 (Normalized text): {text[:50]}...") # Generate speech logger.info(f"生成语音... (Generating speech...)") print(f" 文本: {text[:100]}{'...' if len(text) > 100 else ''}") print(f" 声音: {voice}") print(f" 语速: {speed}x") # Load voice file voice_path = Path("voices").resolve() / f"{voice}.pt" if not voice_path.exists(): print(f"错误: 找不到声音文件 (Error: Voice file not found): {voice_path}") return None, None # Generate using the model audio_segments = [] all_phonemes = [] try: generator = model( text, voice=str(voice_path), speed=speed, split_pattern=r'\n+' ) for gs, ps, audio in generator: if audio is not None: # Convert to numpy if needed if isinstance(audio, torch.Tensor): audio = audio.detach().cpu().numpy() audio_segments.append(audio) all_phonemes.append(ps) logger.info(f"生成了句段: {gs} (Generated segment: {gs})") # Concatenate all audio segments if audio_segments: if len(audio_segments) == 1: final_audio = audio_segments[0] else: final_audio = np.concatenate(audio_segments, axis=0) all_phonemes_str = " ".join(all_phonemes) if all_phonemes else "" return final_audio, all_phonemes_str else: print("错误: 没有生成音频 (Error: No audio was generated)") return None, None except Exception as e: logger.error(f"生成过程中出错 (Error during generation): {e}") import traceback traceback.print_exc() return None, None except Exception as e: logger.error(f"生成语音时出错 (Error generating speech): {e}") import traceback traceback.print_exc() return None, None def save_audio(audio_data: np.ndarray, output_path: str = DEFAULT_CHINESE_OUTPUT) -> bool: """Save generated audio to file Args: audio_data: Audio data as numpy array output_path: Path to save the audio file Returns: True if successful, False otherwise """ try: output_path = Path(output_path).resolve() output_path.parent.mkdir(parents=True, exist_ok=True) # Remove existing file if it exists if output_path.exists(): output_path.unlink() logger.info(f"保存音频到 (Saving audio to): {output_path}") 
sf.write(str(output_path), audio_data, SAMPLE_RATE) print(f"✓ 音频已保存 (Audio saved to): {output_path}") return True except Exception as e: logger.error(f"保存音频时出错 (Error saving audio): {e}") print(f"✗ 无法保存音频 (Failed to save audio): {e}") return False def main(): """Main application loop""" print_chinese_header() try: # Set up device device = 'cuda' if torch.cuda.is_available() else 'cpu' print(f"使用设备 (Using device): {device}\n") # Load model print("初始化模型 (Initializing model)...") model = load_chinese_model(DEFAULT_CHINESE_MODEL, device) print("✓ 模型已加载 (Model loaded)\n") # Get available voices voices = get_chinese_voices() if not voices: print("错误: 找不到中文声音文件 (Error: No Chinese voices found)") print(f"请确保中文声音文件在 voices/ 目录中") return # Main loop while True: choice = print_menu() if choice == "1": # List voices list_chinese_voices() elif choice == "2": # Generate speech from user input voice = select_voice(voices) text = get_chinese_text_input() if not text: print("已取消 (Cancelled)") continue speed = get_speech_speed() print("\n生成中... (Generating...)") audio, phonemes = generate_chinese_speech(model, text, voice, device, speed) if audio is not None: if save_audio(audio): print("✓ 完成 (Done)") else: print("✗ 保存失败 (Save failed)") else: print("✗ 生成失败 (Generation failed)") elif choice == "3": # Generate from sample text text = select_sample_text() if text: voice = select_voice(voices) speed = get_speech_speed() print("\n生成中... (Generating...)") audio, phonemes = generate_chinese_speech(model, text, voice, device, speed) if audio is not None: if save_audio(audio): print("✓ 完成 (Done)") else: print("✗ 保存失败 (Save failed)") else: print("✗ 生成失败 (Generation failed)") elif choice == "4": # Help print_help() elif choice == "5": # Exit print("\n再见!(Goodbye!)") break else: print("无效选择。请重试。(Invalid choice. 
Please try again.)") except KeyboardInterrupt: print("\n\n用户中断 (User interrupted)") except Exception as e: logger.error(f"应用程序错误 (Application error): {e}") import traceback traceback.print_exc() finally: print("\n程序结束 (Program ended)") if torch.cuda.is_available(): torch.cuda.empty_cache() if __name__ == "__main__": main() ================================================ FILE: config.py ================================================ """ Centralized Configuration System for Kokoro TTS Local ---------------------------------------------------- This module provides centralized configuration management for all components of the Kokoro TTS Local application. """ import os import json from pathlib import Path from typing import Dict, Any, Optional import logging logger = logging.getLogger(__name__) class TTSConfig: """Centralized configuration manager for TTS application""" def __init__(self, config_file: Optional[str] = None): self.config_file = Path(config_file or "tts_config.json").resolve() self._config = self._load_default_config() self._load_config_file() def _load_default_config(self) -> Dict[str, Any]: """Load default configuration values""" return { # Audio settings "audio": { "sample_rate": 24000, "max_text_length_cli": 10000, "max_text_length_web": 5000, "min_speed": 0.1, "max_speed": 3.0, "default_speed": 1.0, "supported_formats": ["wav", "mp3", "aac"] }, # Model settings "model": { "default_model_path": "kokoro-v1_0.pth", "repo_id": "hexgrad/Kokoro-82M", "repo_version": "main", "default_language": "a", "max_generation_time": 300, "min_generation_time": 60, "max_retries": 3, "retry_delay": 2 }, # Path settings "paths": { "voices_dir": "voices", "outputs_dir": "outputs", "cache_dir": ".cache", "config_file": "config.json", "speed_dial_file": "speed_dial.json" }, # Web interface settings "web": { "server_name": "0.0.0.0", "server_port": 7860, "share": False }, # CLI settings "cli": { "default_output_file": "output.wav" }, # Language codes mapping 
"language_codes": { 'a': 'American English', 'b': 'British English', 'j': 'Japanese', 'z': 'Mandarin Chinese', 'e': 'Spanish', 'f': 'French', 'h': 'Hindi', 'i': 'Italian', 'p': 'Brazilian Portuguese' }, # Voice files list "voice_files": [ # American English Female voices (11 voices) "af_heart.pt", "af_alloy.pt", "af_aoede.pt", "af_bella.pt", "af_jessica.pt", "af_kore.pt", "af_nicole.pt", "af_nova.pt", "af_river.pt", "af_sarah.pt", "af_sky.pt", # American English Male voices (9 voices) "am_adam.pt", "am_echo.pt", "am_eric.pt", "am_fenrir.pt", "am_liam.pt", "am_michael.pt", "am_onyx.pt", "am_puck.pt", "am_santa.pt", # British English Female voices (4 voices) "bf_alice.pt", "bf_emma.pt", "bf_isabella.pt", "bf_lily.pt", # British English Male voices (4 voices) "bm_daniel.pt", "bm_fable.pt", "bm_george.pt", "bm_lewis.pt", # Japanese voices (5 voices) "jf_alpha.pt", "jf_gongitsune.pt", "jf_nezumi.pt", "jf_tebukuro.pt", "jm_kumo.pt", # Mandarin Chinese voices (8 voices) "zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt", "zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt", # Spanish voices (3 voices) "ef_dora.pt", "em_alex.pt", "em_santa.pt", # French voices (1 voice) "ff_siwis.pt", # Hindi voices (4 voices) "hf_alpha.pt", "hf_beta.pt", "hm_omega.pt", "hm_psi.pt", # Italian voices (2 voices) "if_sara.pt", "im_nicola.pt", # Brazilian Portuguese voices (3 voices) "pf_dora.pt", "pm_alex.pt", "pm_santa.pt" ] } def _load_config_file(self): """Load configuration from file if it exists""" if self.config_file.exists(): try: with open(self.config_file, 'r', encoding='utf-8') as f: file_config = json.load(f) self._merge_config(file_config) logger.info(f"Loaded configuration from {self.config_file}") except (json.JSONDecodeError, IOError) as e: logger.warning(f"Failed to load config file {self.config_file}: {e}") def _merge_config(self, file_config: Dict[str, Any]): """Merge file configuration with default configuration""" def merge_dict(default: Dict, 
override: Dict): for key, value in override.items(): if key in default and isinstance(default[key], dict) and isinstance(value, dict): merge_dict(default[key], value) else: default[key] = value merge_dict(self._config, file_config) def get(self, key: str, default: Any = None) -> Any: """Get configuration value using dot notation (e.g., 'audio.sample_rate')""" keys = key.split('.') value = self._config for k in keys: if isinstance(value, dict) and k in value: value = value[k] else: return default return value def set(self, key: str, value: Any): """Set configuration value using dot notation""" keys = key.split('.') config = self._config for k in keys[:-1]: if k not in config: config[k] = {} config = config[k] config[keys[-1]] = value def save(self): """Save current configuration to file""" try: self.config_file.parent.mkdir(parents=True, exist_ok=True) with open(self.config_file, 'w', encoding='utf-8') as f: json.dump(self._config, f, indent=2, ensure_ascii=False) logger.info(f"Configuration saved to {self.config_file}") except IOError as e: logger.error(f"Failed to save configuration: {e}") def get_path(self, path_key: str) -> Path: """Get a path from configuration and return as resolved Path object""" path_str = self.get(f"paths.{path_key}") if path_str: return Path(path_str).resolve() raise ValueError(f"Path key '{path_key}' not found in configuration") def validate_sample_rate(self, rate: int) -> int: """Validate and normalize sample rate to acceptable values Returns the rate if valid, otherwise returns the default sample rate. """ valid_rates = [16000, 22050, 24000, 44100, 48000] if rate not in valid_rates: default_rate = self.get("audio.sample_rate", 24000) logger.warning( f"Invalid sample rate {rate}. Valid rates are {valid_rates}. 
" f"Using default rate: {default_rate}" ) return default_rate return rate def validate_language(self, lang: str) -> str: """Validate language code""" valid_langs = list(self.get("language_codes", {}).keys()) if lang not in valid_langs: logger.warning(f"Invalid language code '{lang}'. Using default.") logger.info(f"Supported language codes: {', '.join(valid_langs)}") return self.get("model.default_language", "a") return lang def validate_speed(self, speed: float) -> float: """Validate speech speed is within acceptable range""" min_speed = self.get("audio.min_speed", 0.1) max_speed = self.get("audio.max_speed", 3.0) if speed < min_speed: logger.warning(f"Speed {speed} too low, using minimum {min_speed}") return min_speed elif speed > max_speed: logger.warning(f"Speed {speed} too high, using maximum {max_speed}") return max_speed return speed # Global configuration instance config = TTSConfig() # Convenience functions for backward compatibility def get_config(key: str, default: Any = None) -> Any: """Get configuration value""" return config.get(key, default) def set_config(key: str, value: Any): """Set configuration value""" config.set(key, value) def save_config(): """Save configuration to file""" config.save() def get_path(path_key: str) -> Path: """Get a path from configuration""" return config.get_path(path_key) ================================================ FILE: dependency_checker.py ================================================ """ Dependency Version Checker for Kokoro TTS Local ---------------------------------------------- This module checks if all required dependencies are installed and compatible. 
""" import sys import importlib import subprocess from typing import Any, Dict, List, Tuple, Optional from packaging import version import logging logger = logging.getLogger(__name__) # Required dependencies with minimum versions REQUIRED_DEPENDENCIES = { 'torch': '1.9.0', 'kokoro': '0.9.2', 'gradio': '3.0.0', 'soundfile': '0.10.0', 'huggingface_hub': '0.10.0', 'pydub': '0.25.0', 'numpy': '1.19.0', 'pathlib': None, # Built-in module 'tqdm': '4.60.0' } # Optional dependencies OPTIONAL_DEPENDENCIES = { 'espeakng_loader': '0.1.0', 'phonemizer': '3.0.0', 'misaki': '0.1.0', 'spacy': '3.0.0', 'num2words': '0.5.0' } class DependencyChecker: """Check and validate dependencies""" def __init__(self): self.missing_required = [] self.missing_optional = [] self.version_conflicts = [] self.warnings = [] def check_python_version(self) -> bool: """Check if Python version is compatible""" min_python = (3, 8) current_python = sys.version_info[:2] if current_python < min_python: logger.error(f"Python {min_python[0]}.{min_python[1]}+ required, but {current_python[0]}.{current_python[1]} found") return False logger.info(f"Python version {current_python[0]}.{current_python[1]} is compatible") return True def get_package_version(self, package_name: str) -> Optional[str]: """Get installed version of a package""" try: module = importlib.import_module(package_name) # Try different version attributes for attr in ['__version__', 'version', 'VERSION']: if hasattr(module, attr): return getattr(module, attr) # For some packages, try getting version via pip try: result = subprocess.run( [sys.executable, '-m', 'pip', 'show', package_name], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: for line in result.stdout.split('\n'): if line.startswith('Version:'): return line.split(':', 1)[1].strip() except (subprocess.TimeoutExpired, subprocess.SubprocessError): pass return "unknown" except ImportError: return None def check_dependency(self, package_name: str, min_version: 
Optional[str]) -> Tuple[bool, str]: """Check if a dependency is installed and meets version requirements""" installed_version = self.get_package_version(package_name) if installed_version is None: return False, f"{package_name} is not installed" if min_version is None: return True, f"{package_name} is installed (version: {installed_version})" try: if installed_version == "unknown": self.warnings.append(f"Could not determine version of {package_name}") return True, f"{package_name} is installed (version: unknown)" if version.parse(installed_version) >= version.parse(min_version): return True, f"{package_name} {installed_version} meets requirement (>= {min_version})" else: return False, f"{package_name} {installed_version} is too old (>= {min_version} required)" except Exception as e: self.warnings.append(f"Error checking version of {package_name}: {e}") return True, f"{package_name} is installed but version check failed" def check_all_dependencies(self) -> bool: """Check all required and optional dependencies""" logger.info("Checking dependencies...") # Check Python version first if not self.check_python_version(): return False all_good = True # Check required dependencies logger.info("Checking required dependencies...") for package, min_ver in REQUIRED_DEPENDENCIES.items(): is_ok, message = self.check_dependency(package, min_ver) if is_ok: logger.info(f"✓ {message}") else: logger.error(f"✗ {message}") self.missing_required.append(package) all_good = False # Check optional dependencies logger.info("Checking optional dependencies...") for package, min_ver in OPTIONAL_DEPENDENCIES.items(): is_ok, message = self.check_dependency(package, min_ver) if is_ok: logger.info(f"✓ {message}") else: logger.warning(f"○ {message} (optional)") self.missing_optional.append(package) # Report warnings for warning in self.warnings: logger.warning(warning) return all_good def get_installation_commands(self) -> List[str]: """Get pip install commands for missing dependencies""" commands = 
[] if self.missing_required: required_packages = [] for package in self.missing_required: min_ver = REQUIRED_DEPENDENCIES.get(package) if min_ver: required_packages.append(f"{package}>={min_ver}") else: required_packages.append(package) if required_packages: commands.append(f"pip install {' '.join(required_packages)}") if self.missing_optional: optional_packages = [] for package in self.missing_optional: min_ver = OPTIONAL_DEPENDENCIES.get(package) if min_ver: optional_packages.append(f"{package}>={min_ver}") else: optional_packages.append(package) if optional_packages: commands.append(f"pip install {' '.join(optional_packages)} # Optional") return commands def check_cuda_availability(self) -> Dict[str, Any]: """Check CUDA availability and provide information""" cuda_info = { 'available': False, 'version': None, 'device_count': 0, 'devices': [] } try: import torch cuda_info['available'] = torch.cuda.is_available() if cuda_info['available']: cuda_info['version'] = torch.version.cuda cuda_info['device_count'] = torch.cuda.device_count() for i in range(cuda_info['device_count']): device_props = torch.cuda.get_device_properties(i) cuda_info['devices'].append({ 'id': i, 'name': device_props.name, 'memory': device_props.total_memory // (1024**3) # GB }) logger.info(f"CUDA {cuda_info['version']} available with {cuda_info['device_count']} device(s)") for device in cuda_info['devices']: logger.info(f" Device {device['id']}: {device['name']} ({device['memory']}GB)") else: logger.info("CUDA not available, will use CPU") except Exception as e: logger.warning(f"Error checking CUDA availability: {e}") return cuda_info def check_dependencies() -> bool: """Main function to check all dependencies""" checker = DependencyChecker() # Check dependencies all_good = checker.check_all_dependencies() # Check CUDA cuda_info = checker.check_cuda_availability() # Print summary if not all_good: logger.error("Some required dependencies are missing or incompatible!") logger.info("To install 
missing dependencies, run:") for cmd in checker.get_installation_commands(): logger.info(f" {cmd}") return False if checker.missing_optional: logger.info("Some optional dependencies are missing. The application will work but some features may be disabled.") logger.info("To install optional dependencies, run:") for cmd in checker.get_installation_commands(): if "Optional" in cmd: logger.info(f" {cmd}") logger.info("All required dependencies are satisfied!") return True if __name__ == "__main__": # Configure logging for standalone execution logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') success = check_dependencies() sys.exit(0 if success else 1) ================================================ FILE: docker-compose.yml ================================================ services: kokoro-tts: build: context: . dockerfile: Dockerfile container_name: kokoro-tts-local ports: - "7860:7860" environment: # Uncomment the line below to run fully offline after initial model download # - HF_HUB_OFFLINE=1 volumes: - ./outputs:/app/outputs - ./voices:/app/voices - ./.cache:/app/.cache restart: unless-stopped ================================================ FILE: gradio_interface.py ================================================ """ Kokoro-TTS Local Generator ------------------------- A Gradio interface for the Kokoro-TTS-Local text-to-speech system. Supports multiple voices and audio formats, with cross-platform compatibility. 
Key Features: - Multiple voice models support (54 voices across 8 languages) - Real-time generation with progress logging - WAV, MP3, and AAC output formats - Network sharing capabilities - Cross-platform compatibility (Windows, macOS, Linux) Dependencies: - kokoro: Official Kokoro TTS library - gradio: Web interface framework - soundfile: Audio file handling - pydub: Audio format conversion """ import gradio as gr import os import sys import platform from datetime import datetime import shutil from pathlib import Path import soundfile as sf from pydub import AudioSegment import torch import numpy as np import argparse from typing import Union, List, Optional, Tuple, Dict, Any from models import ( list_available_voices, build_model, generate_speech, download_voice_files, EnhancedKPipeline ) import speed_dial # Constants MAX_TEXT_LENGTH = 5000 DEFAULT_SAMPLE_RATE = 24000 MIN_SPEED = 0.1 MAX_SPEED = 3.0 DEFAULT_SPEED = 1.0 # Define path type for consistent handling PathLike = Union[str, Path] # Configuration validation def validate_sample_rate(rate: int) -> int: """Validate sample rate is within acceptable range""" valid_rates = [16000, 22050, 24000, 44100, 48000] if rate not in valid_rates: print(f"Warning: Unusual sample rate {rate}. 
Valid rates are {valid_rates}") return 24000 # Default to safe value return rate # Global configuration CONFIG_FILE = Path("tts_config.json") # Stores user preferences and paths DEFAULT_OUTPUT_DIR = Path("outputs") # Directory for generated audio files SAMPLE_RATE = validate_sample_rate(24000) # Validated sample rate # Initialize model globally device = 'cuda' if torch.cuda.is_available() else 'cpu' model = None LANG_MAP = { "af_": "a", "am_": "a", "bf_": "b", "bm_": "b", "jf_": "j", "jm_": "j", "zf_": "z", "zm_": "z", "ef_": "e", "em_": "e", "ff_": "f", "hf_": "h", "hm_": "h", "if_": "i", "im_": "i", "pf_": "p", "pm_": "p", } pipelines = {} def get_available_voices(): """Get list of available voice models.""" try: # Initialize model to trigger voice downloads global model if model is None: print("Initializing model and downloading voices...") model = build_model(None, device) voices = list_available_voices() if not voices: print("No voices found after initialization. Attempting to download...") download_voice_files() # Try downloading again voices = list_available_voices() print("Available voices:", voices) return voices except Exception as e: print(f"Error getting voices: {e}") return [] def get_pipeline_for_voice(voice_name: str) -> EnhancedKPipeline: """ Determine the language code from the voice prefix and return the associated pipeline. """ prefix = voice_name[:3].lower() lang_code = LANG_MAP.get(prefix, "a") if lang_code not in pipelines: print(f"[INFO] Creating pipeline for lang_code='{lang_code}'") pipelines[lang_code] = build_model(None, device, lang_code=lang_code) pipelines[lang_code].device = device return pipelines[lang_code] def convert_audio(input_path: PathLike, output_path: PathLike, format: str) -> Optional[PathLike]: """Convert audio to specified format. 
Args: input_path: Path to input audio file output_path: Path to output audio file format: Output format ('wav', 'mp3', or 'aac') Returns: Path to output file or None on error """ try: # Normalize paths input_path = Path(input_path).resolve() output_path = Path(output_path).resolve() # Validate input file if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {input_path}") # For WAV format, just return the input path if format.lower() == "wav": return input_path # Create output directory if it doesn't exist output_path.parent.mkdir(parents=True, exist_ok=True) # Convert format audio = AudioSegment.from_wav(str(input_path)) # Select proper format and options if format.lower() == "mp3": audio.export(str(output_path), format="mp3", bitrate="192k") elif format.lower() == "aac": audio.export(str(output_path), format="aac", bitrate="192k") else: raise ValueError(f"Unsupported format: {format}") # Verify file was created if not output_path.exists() or output_path.stat().st_size == 0: raise IOError(f"Failed to create {format} file") return output_path except (IOError, FileNotFoundError, ValueError) as e: print(f"Error converting audio: {type(e).__name__}: {e}") return None except Exception as e: print(f"Unexpected error converting audio: {type(e).__name__}: {e}") import traceback traceback.print_exc() return None def generate_tts_with_logs(voice_name: str, text: str, format: str, speed: float = 1.0) -> Optional[PathLike]: """Generate TTS audio with progress logging and memory management. Args: voice_name: Name of the voice to use text: Text to convert to speech format: Output format ('wav', 'mp3', 'aac') Returns: Path to generated audio file or None on error """ global model import psutil import gc try: # Check available memory before processing memory = psutil.virtual_memory() available_gb = memory.available / (1024**3) if available_gb < 1.0: # Less than 1GB available print(f"Warning: Low memory available ({available_gb:.1f}GB). 
Consider closing other applications.") # Force garbage collection gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() # Initialize model if needed if model is None: print("Initializing model...") model = build_model(None, device) # Create output directory DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Validate input text if not text or not text.strip(): raise ValueError("Text input cannot be empty") # Dynamic text length limit based on available memory MAX_CHARS = MAX_TEXT_LENGTH if available_gb < 2.0: # Less than 2GB available MAX_CHARS = min(MAX_CHARS, 2000) # Reduce limit for low memory print(f"Reduced text limit to {MAX_CHARS} characters due to low memory") if len(text) > MAX_CHARS: print(f"Warning: Text exceeds {MAX_CHARS} characters. Truncating to prevent memory issues.") text = text[:MAX_CHARS] + "..." # Generate base filename from text timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") base_name = f"tts_{timestamp}" wav_path = DEFAULT_OUTPUT_DIR / f"{base_name}.wav" # Generate speech print(f"\nGenerating speech for: '{text}'") print(f"Using voice: {voice_name}") # Validate voice path using Path for consistent handling voice_path = Path("voices").resolve() / f"{voice_name}.pt" if not voice_path.exists(): raise FileNotFoundError(f"Voice file not found: {voice_path}") try: if voice_name.startswith(tuple(LANG_MAP.keys())): pipeline = get_pipeline_for_voice(voice_name) generator = pipeline(text, voice=str(voice_path), speed=speed, split_pattern=r'\n+') else: generator = model(text, voice=str(voice_path), speed=speed, split_pattern=r'\n+') all_audio = [] max_segments = 100 # Safety limit for very long texts segment_count = 0 for gs, ps, audio in generator: segment_count += 1 if segment_count > max_segments: print(f"Warning: Reached maximum segment limit ({max_segments})") break if audio is not None: if isinstance(audio, np.ndarray): audio = torch.from_numpy(audio).float() all_audio.append(audio) print(f"Generated segment: {gs}") if ps: 
# Only print phonemes if available print(f"Phonemes: {ps}") if not all_audio: raise Exception("No audio generated") except Exception as e: raise Exception(f"Error in speech generation: {e}") # Combine audio segments and save if not all_audio: raise Exception("No audio segments were generated") # Handle single segment case without concatenation if len(all_audio) == 1: final_audio = all_audio[0] else: try: final_audio = torch.cat(all_audio, dim=0) except RuntimeError as e: raise Exception(f"Failed to concatenate audio segments: {e}") # Save audio file try: if isinstance(final_audio, torch.Tensor): final_audio = final_audio.detach().cpu().numpy() sf.write(wav_path, final_audio, SAMPLE_RATE) except Exception as e: raise Exception(f"Failed to save audio file: {e}") # Convert to requested format if needed if format.lower() != "wav": output_path = DEFAULT_OUTPUT_DIR / f"{base_name}.{format.lower()}" return convert_audio(wav_path, output_path, format.lower()) return wav_path except Exception as e: print(f"Error generating speech: {e}") import traceback traceback.print_exc() return None def create_interface(server_name="127.0.0.1", server_port=7860): """Create and launch the Gradio interface.""" # Get available voices voices = get_available_voices() if not voices: print("No voices found! 
Please check the voices directory.") return # Get speed dial presets preset_names = speed_dial.get_preset_names() # Create interface with gr.Blocks(title="Kokoro TTS Generator", fill_height=True) as interface: gr.Markdown("# Kokoro TTS Generator") with gr.Row(): with gr.Column(scale=2): gr.Markdown("## TTS Controls") with gr.Column(scale=1): gr.Markdown("## Speed Dial") with gr.Row(equal_height=True): with gr.Column(scale=2): # Main TTS controls voice = gr.Dropdown( choices=voices, value=voices[0] if voices else None, label="Voice" ) text = gr.Textbox( lines=3, placeholder="Enter text to convert to speech...", label="Text" ) with gr.Column(scale=1): # Speed dial section preset_dropdown = gr.Dropdown( choices=preset_names, value=preset_names[0] if preset_names else None, label="Saved Presets", interactive=True ) preset_name = gr.Textbox( placeholder="Enter preset name...", label="New Preset Name" ) with gr.Row(equal_height=True): with gr.Column(scale=2): with gr.Row(): format = gr.Radio( choices=["wav", "mp3", "aac"], value="wav", label="Output Format" ) speed = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed" ) with gr.Column(scale=1): load_preset = gr.Button("Load") save_preset = gr.Button("Save Current") with gr.Row(): with gr.Column(scale=2): generate = gr.Button("Generate Speech") with gr.Column(scale=1): delete_preset = gr.Button("Delete") with gr.Row(): # Output section output = gr.Audio(label="Generated Audio") # Function to load a preset def load_preset_fn(preset_name): if not preset_name: return None, None, None, None preset = speed_dial.get_preset(preset_name) if not preset: return None, None, None, None return preset["voice"], preset["text"], preset["format"], preset["speed"] # Function to save a preset def save_preset_fn(name, voice, text, format, speed): if not name or not voice or not text: return gr.update(value="Please provide a name, voice, and text") success = speed_dial.save_preset(name, voice, text, format, speed) # Update 
the dropdown with the new preset list preset_names = speed_dial.get_preset_names() if success: return gr.update(choices=preset_names, value=name) else: return gr.update(choices=preset_names) # Function to delete a preset def delete_preset_fn(name): if not name: return gr.update(value="Please select a preset to delete") success = speed_dial.delete_preset(name) # Update the dropdown with the new preset list preset_names = speed_dial.get_preset_names() if success: return gr.update(choices=preset_names, value=None) else: return gr.update(choices=preset_names) # Connect the buttons to their functions load_preset.click( fn=load_preset_fn, inputs=preset_dropdown, outputs=[voice, text, format, speed] ) save_preset.click( fn=save_preset_fn, inputs=[preset_name, voice, text, format, speed], outputs=preset_dropdown ) delete_preset.click( fn=delete_preset_fn, inputs=preset_dropdown, outputs=preset_dropdown ) # Connect the generate button generate.click( fn=generate_tts_with_logs, inputs=[voice, text, format, speed], outputs=output ) # Launch interface interface.launch( server_name=server_name, server_port=server_port, share=False ) def cleanup_resources(): """Properly clean up resources when the application exits""" global model try: print("Cleaning up resources...") # Clean up model resources if model is not None: print("Releasing model resources...") # Clear voice dictionary to release memory if hasattr(model, 'voices') and model.voices is not None: try: voice_count = len(model.voices) for voice_name in list(model.voices.keys()): try: # Release each voice explicitly model.voices[voice_name] = None except: pass model.voices.clear() print(f"Cleared {voice_count} voice references") except Exception as ve: print(f"Error clearing voices: {type(ve).__name__}: {ve}") # Clear model attributes that might hold tensors for attr_name in dir(model): if not attr_name.startswith('__') and hasattr(model, attr_name): try: attr = getattr(model, attr_name) # Handle specific tensor attributes 
if isinstance(attr, torch.Tensor): if attr.is_cuda: print(f"Releasing CUDA tensor: {attr_name}") setattr(model, attr_name, None) elif hasattr(attr, 'to'): # Module or Tensor-like object setattr(model, attr_name, None) except: pass # Delete model reference try: del model model = None print("Model reference deleted") except Exception as me: print(f"Error deleting model: {type(me).__name__}: {me}") # Clear CUDA memory explicitly if torch.cuda.is_available(): try: # Get initial memory usage try: initial = torch.cuda.memory_allocated() initial_mb = initial / (1024 * 1024) print(f"CUDA memory before cleanup: {initial_mb:.2f} MB") except: pass # Free memory print("Clearing CUDA cache...") torch.cuda.empty_cache() # Force synchronization try: torch.cuda.synchronize() except: pass # Get final memory usage try: final = torch.cuda.memory_allocated() final_mb = final / (1024 * 1024) freed_mb = (initial - final) / (1024 * 1024) print(f"CUDA memory after cleanup: {final_mb:.2f} MB (freed {freed_mb:.2f} MB)") except: pass except Exception as ce: print(f"Error clearing CUDA memory: {type(ce).__name__}: {ce}") # Final garbage collection try: import gc collected = gc.collect() print(f"Garbage collection completed: {collected} objects collected") except Exception as gce: print(f"Error during garbage collection: {type(gce).__name__}: {gce}") print("Cleanup completed") except Exception as e: print(f"Error during cleanup: {type(e).__name__}: {e}") import traceback traceback.print_exc() # Register cleanup for normal exit import atexit atexit.register(cleanup_resources) # Register cleanup for signals import signal import sys def signal_handler(signum, frame): print(f"\nReceived signal {signum}, shutting down...") cleanup_resources() sys.exit(0) # Register for common signals for sig in [signal.SIGINT, signal.SIGTERM]: try: signal.signal(sig, signal_handler) except (ValueError, AttributeError): # Some signals might not be available on all platforms pass def parse_arguments(): """Parse 
command line arguments for host and port configuration."""
    parser = argparse.ArgumentParser(
        description="Kokoro TTS Local Generator - Gradio Web Interface",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--host",
        type=str,
        default="127.0.0.1",
        help="Host address to bind the server to"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port number to run the server on"
    )
    return parser.parse_args()


if __name__ == "__main__":
    try:
        args = parse_arguments()
        create_interface(server_name=args.host, server_port=args.port)
    finally:
        # Ensure cleanup even if Gradio encounters an error
        cleanup_resources()



================================================
FILE: models.py
================================================
"""Models module for Kokoro TTS Local"""
from typing import Optional, Tuple, List
import torch
from kokoro import KPipeline
import os
import json
import codecs
from pathlib import Path
import numpy as np
import shutil
import threading
import warnings
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress warnings from pre-trained model
warnings.filterwarnings("ignore", message="dropout option adds dropout after all but last recurrent layer")
warnings.filterwarnings("ignore", message="`torch.nn.utils.weight_norm` is deprecated")

# Set environment variables for proper encoding
os.environ["PYTHONIOENCODING"] = "utf-8"

# Disable symlinks warning
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Check if offline mode is enabled via environment variable
OFFLINE_MODE = os.environ.get("HF_HUB_OFFLINE", "0") == "1" or os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"

if OFFLINE_MODE:
    logger.info("Running in OFFLINE mode - will only use locally cached files")
    # Ensure the environment variable is set for the kokoro library as well
    os.environ["HF_HUB_OFFLINE"] = "1"
    os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Setup for safer cleanup
import atexit
import signal
import sys

# Track whether patches have been applied (currently only the json.load patch)
_patches_applied = {
    'json_load': False
}


class EnhancedKPipeline(KPipeline):
    """Enhanced KPipeline with improved voice loading and error handling"""

    def __init__(self, lang_code: str = 'a', model: bool = True):
        super().__init__(lang_code=lang_code, model=model)
        self.device = 'cpu'  # Default device
        if not hasattr(self, 'voices'):
            self.voices = {}

    def load_voice(self, voice_path: str) -> torch.Tensor:
        """Load voice model with improved error handling and path validation.

        Args:
            voice_path: Path to the .pt voice file.

        Returns:
            The loaded voice tensor, moved to ``self.device`` and cached in
            ``self.voices`` keyed by the file stem.

        Raises:
            FileNotFoundError: If the voice file does not exist.
            ValueError: If torch.load returns None.
        """
        voice_path = Path(voice_path).resolve()
        if not voice_path.exists():
            raise FileNotFoundError(f"Voice file not found: {voice_path}")

        voice_name = voice_path.stem
        try:
            logger.info(f"Loading voice: {voice_name} from {voice_path}")
            # weights_only=True refuses arbitrary pickled objects in voice files.
            voice_model = torch.load(str(voice_path), weights_only=True, map_location='cpu')
            if voice_model is None:
                raise ValueError(f"Failed to load voice model from {voice_path}")
            # Move model to device and store in voices dictionary
            self.voices[voice_name] = voice_model.to(self.device)
            logger.info(f"Successfully loaded voice: {voice_name}")
            return self.voices[voice_name]
        except Exception as e:
            logger.error(f"Error loading voice {voice_name}: {e}")
            raise


def _cleanup_patches() -> None:
    """Restore original functions that were patched"""
    try:
        if _patches_applied['json_load'] and _original_json_load is not None:
            restore_json_load()
            _patches_applied['json_load'] = False
            logger.info("Restored original json.load function")
    except Exception as e:
        logger.warning(f"Error restoring json.load: {e}")


# Register cleanup for normal exit
atexit.register(_cleanup_patches)


def register_cleanup_signal_handlers() -> None:
    """Install process signal handlers for patch cleanup.

    This is opt-in to avoid import-time global signal side effects
    when models.py is imported by other applications.
    """
    for sig in [signal.SIGINT, signal.SIGTERM]:
        try:
            # Tuple expression runs all three actions; sys.exit(1) terminates.
            signal.signal(sig, lambda signum, frame: (
                logger.info(f"Received signal {signum}, cleaning up..."),
                _cleanup_patches(),
                sys.exit(1)
            ))
        except (ValueError, AttributeError) as e:
            # Some signals might not be available on all platforms
            logger.warning(f"Could not register signal handler: {e}")


# List of available voice files (54 voices across 8 languages)
VOICE_FILES = [
    # American English Female voices (11 voices)
    "af_heart.pt", "af_alloy.pt", "af_aoede.pt", "af_bella.pt",
    "af_jessica.pt", "af_kore.pt", "af_nicole.pt", "af_nova.pt",
    "af_river.pt", "af_sarah.pt", "af_sky.pt",
    # American English Male voices (9 voices)
    "am_adam.pt", "am_echo.pt", "am_eric.pt", "am_fenrir.pt",
    "am_liam.pt", "am_michael.pt", "am_onyx.pt", "am_puck.pt",
    "am_santa.pt",
    # British English Female voices (4 voices)
    "bf_alice.pt", "bf_emma.pt", "bf_isabella.pt", "bf_lily.pt",
    # British English Male voices (4 voices)
    "bm_daniel.pt", "bm_fable.pt", "bm_george.pt", "bm_lewis.pt",
    # Japanese voices (5 voices)
    "jf_alpha.pt", "jf_gongitsune.pt", "jf_nezumi.pt", "jf_tebukuro.pt",
    "jm_kumo.pt",
    # Mandarin Chinese voices (8 voices)
    "zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt",
    "zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt",
    # Spanish voices (3 voices)
    "ef_dora.pt", "em_alex.pt", "em_santa.pt",
    # French voices (1 voice)
    "ff_siwis.pt",
    # Hindi voices (4 voices)
    "hf_alpha.pt", "hf_beta.pt", "hm_omega.pt", "hm_psi.pt",
    # Italian voices (2 voices)
    "if_sara.pt", "im_nicola.pt",
    # Brazilian Portuguese voices (3 voices)
    "pf_dora.pt", "pm_alex.pt", "pm_santa.pt"
]

# Language code mapping for different languages
LANGUAGE_CODES = {
    'a': 'American English',
    'b': 'British English',
    'j': 'Japanese',
    'z': 'Mandarin Chinese',
    'e': 'Spanish',
    'f': 'French',
    'h': 'Hindi',
    'i': 'Italian',
    'p': 'Brazilian Portuguese'
}

# Two-letter voice-name prefix (e.g. 'af' in 'af_bella') -> language code above.
VOICE_PREFIX_TO_LANGUAGE_CODE = {
    'af': 'a', 'am': 'a',
    'bf': 'b', 'bm': 'b',
    'jf': 'j', 'jm': 'j',
    'zf': 'z', 'zm':
'z', 'ef': 'e', 'em': 'e', 'ff': 'f', 'fm': 'f', 'hf': 'h', 'hm': 'h', 'if': 'i', 'im': 'i', 'pf': 'p', 'pm': 'p', } def patch_json_load() -> None: """Patch json.load to handle UTF-8 encoded files with special characters""" global _patches_applied, _original_json_load if _patches_applied['json_load']: return _original_json_load = json.load # Store for restoration def read_json_content(fp, encoding: str) -> str: if hasattr(fp, 'seek'): fp.seek(0) if hasattr(fp, 'buffer'): raw_content = fp.buffer.read() return raw_content.decode( encoding, errors='replace' if encoding == 'utf-8-sig' else 'strict' ).lstrip('\ufeff') content = fp.read() if isinstance(content, bytes): return content.decode( encoding, errors='replace' if encoding == 'utf-8-sig' else 'strict' ).lstrip('\ufeff') return content.lstrip('\ufeff') def custom_load(fp, *args, **kwargs): try: content = read_json_content(fp, 'utf-8') except UnicodeDecodeError: content = read_json_content(fp, 'utf-8-sig') try: return json.loads(content, *args, **kwargs) except json.JSONDecodeError as e: logger.error(f"JSON parsing error: {e}") raise json.load = custom_load _patches_applied['json_load'] = True # Store the original load function for potential restoration _original_json_load = None def restore_json_load() -> None: """Restore the original json.load function""" global _original_json_load, _patches_applied if _original_json_load is not None and _patches_applied['json_load']: json.load = _original_json_load _original_json_load = None _patches_applied['json_load'] = False def load_config(config_path: str) -> dict: """Load configuration file with proper encoding handling""" config_path = Path(config_path).resolve() try: with codecs.open(str(config_path), 'r', encoding='utf-8') as f: return json.load(f) except UnicodeDecodeError: # Fallback to utf-8-sig if regular utf-8 fails with codecs.open(str(config_path), 'r', encoding='utf-8-sig') as f: return json.load(f) # Initialize espeak-ng phonemizer_available = False # Global 
flag to track if phonemizer is working current_phonemizer_lang = None # Track current phonemizer language def initialize_phonemizer(language: str = 'en-us') -> bool: """Initialize phonemizer for a specific language Args: language: Language code for phonemizer (e.g., 'en-us', 'zh') Returns: True if initialization successful, False otherwise """ global phonemizer_available, current_phonemizer_lang try: from phonemizer.backend.espeak.wrapper import EspeakWrapper from phonemizer import phonemize import espeakng_loader # Make library available first library_path = espeakng_loader.get_library_path() data_path = espeakng_loader.get_data_path() espeakng_loader.make_library_available() # Set up espeak-ng paths EspeakWrapper.library_path = library_path EspeakWrapper.data_path = data_path # Verify espeak-ng is working with specified language try: test_text = 'test' if language in ['en-us', 'en-gb'] else '测试' test_phonemes = phonemize(test_text, language=language) if test_phonemes: phonemizer_available = True current_phonemizer_lang = language logger.info(f"Phonemizer successfully initialized for language: {language}") return True else: logger.warning("Phonemization returned empty result") return False except Exception as e: # Continue without espeak functionality - be more specific about error types if "espeak" in str(e).lower(): logger.warning(f"eSpeak not found: {e}") else: logger.warning(f"Phonemizer initialization error: {e}") return False except ImportError as e: logger.warning(f"Phonemizer packages not installed: {e}") logger.info("If you want phoneme visualization, manually install required packages:") logger.info("pip install espeakng-loader phonemizer-fork") return False # Initialize default English phonemizer try: initialize_phonemizer('en-us') except Exception as e: logger.warning(f"Could not initialize default phonemizer: {e}") # Initialize pipeline globally with thread safety _pipeline = None _pipeline_lock = threading.RLock() # Reentrant lock for thread safety 
_voice_cache_lock = threading.RLock()  # Separate lock for voice cache operations
_download_lock = threading.Lock()  # Lock for download operations


def download_voice_files(voice_files: Optional[List[str]] = None,
                         repo_version: str = "main",
                         required_count: int = 1) -> List[str]:
    """Download voice files from Hugging Face with enhanced progress tracking.

    Args:
        voice_files: Optional list of voice files to download. If None, download all VOICE_FILES.
        repo_version: Version/tag of the repository to use (default: "main")
        required_count: Minimum number of voices required (default: 1)

    Returns:
        List of successfully downloaded voice files

    Raises:
        ValueError: If fewer than required_count voices could be downloaded
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm
    import hashlib
    import time

    # Use absolute path for voices directory
    voices_dir = Path("voices").resolve()
    voices_dir.mkdir(exist_ok=True)

    # Import here to avoid startup dependency
    from huggingface_hub import hf_hub_download

    downloaded_voices = []
    failed_voices = []

    # If specific voice files are requested, use those. Otherwise use all.
    files_to_download = voice_files if voice_files is not None else VOICE_FILES
    total_files = len(files_to_download)

    logger.info(f"Downloading voice files... ({total_files} total files)")

    # Check for existing voice files first.
    # A zero-byte file counts as missing (likely a previously failed download).
    existing_files = []
    for voice_file in files_to_download:
        voice_path = voices_dir / voice_file
        if voice_path.exists() and voice_path.stat().st_size > 0:
            logger.info(f"Voice file {voice_file} already exists")
            downloaded_voices.append(voice_file)
            existing_files.append(voice_file)

    # Remove existing files from the download list
    files_to_download = [f for f in files_to_download if f not in existing_files]

    if not files_to_download and downloaded_voices:
        logger.info(f"All required voice files already exist ({len(downloaded_voices)} files)")
        return downloaded_voices

    # In offline mode, only use existing files
    if OFFLINE_MODE:
        if not downloaded_voices:
            error_msg = "No voice files found locally and running in OFFLINE mode. Please download voice files first with network connection."
            logger.error(error_msg)
            raise ValueError(error_msg)
        elif len(downloaded_voices) < required_count:
            error_msg = f"Only {len(downloaded_voices)} voice files found locally, but {required_count} were required. Running in OFFLINE mode."
            logger.error(error_msg)
            raise ValueError(error_msg)
        else:
            logger.info(f"Using {len(downloaded_voices)} locally cached voice files (OFFLINE mode)")
            return downloaded_voices

    def download_single_voice(voice_file: str) -> Tuple[str, bool, str]:
        """Download a single voice file with retry logic.

        Returns a (voice_file, success, message) tuple instead of raising,
        so the thread-pool caller can aggregate results.
        """
        retry_count = 3
        retry_delay = 2

        for attempt in range(retry_count):
            try:
                # Download with exponential backoff
                if attempt > 0:
                    delay = retry_delay * (2 ** (attempt - 1))
                    time.sleep(delay)

                # Download directly to voices directory
                import tempfile
                temp_dir = tempfile.mkdtemp()
                try:
                    downloaded_path = hf_hub_download(
                        repo_id="hexgrad/Kokoro-82M",
                        filename=f"voices/{voice_file}",
                        local_dir=temp_dir,
                        force_download=False,
                        revision=repo_version,
                        local_files_only=OFFLINE_MODE
                    )

                    # Verify file integrity with basic size check
                    if Path(downloaded_path).stat().st_size == 0:
                        raise ValueError(f"Downloaded file {voice_file} has zero size")

                    # Move to final location
                    voice_path = voices_dir / voice_file
                    shutil.move(downloaded_path, str(voice_path))
                    return voice_file, True, f"Successfully downloaded {voice_file}"
                finally:
                    # Clean up temporary directory
                    try:
                        shutil.rmtree(temp_dir)
                    except:  # noqa: E722 - temp-dir cleanup is best effort
                        pass
            except Exception as e:
                error_msg = f"Failed to download {voice_file} (attempt {attempt+1}/{retry_count}): {e}"
                if attempt == retry_count - 1:
                    return voice_file, False, error_msg
                logger.warning(error_msg)

        return voice_file, False, f"Failed all {retry_count} attempts to download {voice_file}"

    # Download files with progress bar and parallel processing
    if files_to_download:
        logger.info(f"Downloading {len(files_to_download)} missing voice files...")

        with ThreadPoolExecutor(max_workers=3) as executor:  # Limit concurrent downloads
            # Submit all download tasks
            future_to_voice = {
                executor.submit(download_single_voice, voice_file): voice_file
                for voice_file in files_to_download
            }

            # Process completed downloads with progress bar
            with tqdm(total=len(files_to_download), desc="Downloading voices") as pbar:
                for future in as_completed(future_to_voice):
                    voice_file, success, message = future.result()
                    if success:
                        downloaded_voices.append(voice_file)
                        logger.info(message)
                    else:
                        failed_voices.append(voice_file)
                        logger.error(message)
                    pbar.update(1)

    # Report results
    if failed_voices:
        logger.warning(f"Failed to download {len(failed_voices)} voice files: {', '.join(failed_voices)}")

    if not downloaded_voices:
        error_msg = "No voice files could be downloaded. Please check your internet connection."
        logger.error(error_msg)
        raise ValueError(error_msg)
    elif len(downloaded_voices) < required_count:
        error_msg = f"Only {len(downloaded_voices)} voice files could be downloaded, but {required_count} were required."
        logger.error(error_msg)
        raise ValueError(error_msg)
    else:
        logger.info(f"Successfully processed {len(downloaded_voices)} voice files")

    return downloaded_voices


def build_model(
    model_path: Optional[str],
    device: str,
    repo_version: str = "main",
    lang_code: str = 'a'
) -> EnhancedKPipeline:
    """Build and return the Enhanced Kokoro pipeline with proper encoding configuration

    Args:
        model_path: Path to the model file or None to use default
        device: Device to use ('cuda' or 'cpu')
        repo_version: Version/tag of the repository to use (default: "main")
        lang_code: Language code for the model (default: 'a' for American English, 'z' for Chinese)

    Returns:
        Initialized EnhancedKPipeline instance
    """
    global _pipeline, _pipeline_lock

    # Use a lock for thread safety
    with _pipeline_lock:
        # Don't reuse pipeline if language code is different
        # (each language may need different configuration)
        if _pipeline is not None and hasattr(_pipeline, 'lang_code') and _pipeline.lang_code == lang_code:
            _pipeline.device = device
            return _pipeline

        try:
            # Determine if this is a Chinese model
            is_chinese_model = lang_code == 'z' or (model_path and 'zh' in str(model_path).lower())

            # Download model if it doesn't exist
            if model_path is None:
                model_path = 'kokoro-v1_1-zh.pth' if is_chinese_model else 'kokoro-v1_0.pth'
            model_path = os.path.abspath(model_path)
            if not os.path.exists(model_path):
                if OFFLINE_MODE:
                    error_msg = f"Model file {model_path} not found and running in OFFLINE mode. Please download the model first with network connection."
                    logger.error(error_msg)
                    raise ValueError(error_msg)
                logger.info(f"Downloading model file {model_path}...")
                try:
                    from huggingface_hub import hf_hub_download
                    # Determine filename and repo for download
                    filename = 'kokoro-v1_1-zh.pth' if is_chinese_model else 'kokoro-v1_0.pth'
                    model_repo_id = "hexgrad/Kokoro-82M-v1.1-zh" if is_chinese_model else "hexgrad/Kokoro-82M"
                    model_path = hf_hub_download(
                        repo_id=model_repo_id,
                        filename=filename,
                        local_dir=".",
                        force_download=False,
                        revision=repo_version,
                        local_files_only=OFFLINE_MODE
                    )
                    logger.info(f"Model downloaded to {model_path}")
                except Exception as e:
                    logger.error(f"Error downloading model: {e}")
                    raise ValueError(f"Could not download model: {e}") from e

            # Download config if it doesn't exist
            config_path = os.path.abspath("config.json")
            if not os.path.exists(config_path):
                if OFFLINE_MODE:
                    error_msg = f"Config file {config_path} not found and running in OFFLINE mode. Please download the config first with network connection."
                    logger.error(error_msg)
                    raise ValueError(error_msg)
                logger.info("Downloading config file...")
                try:
                    from huggingface_hub import hf_hub_download
                    config_path = hf_hub_download(
                        repo_id="hexgrad/Kokoro-82M",
                        filename="config.json",
                        local_dir=".",
                        force_download=False,
                        revision=repo_version,
                        local_files_only=OFFLINE_MODE
                    )
                    logger.info(f"Config downloaded to {config_path}")
                except Exception as e:
                    logger.error(f"Error downloading config: {e}")
                    raise ValueError(f"Could not download config: {e}") from e

            # Initialize phonemizer for the appropriate language
            if is_chinese_model:
                logger.info("Initializing phonemizer for Chinese...")
                try:
                    initialize_phonemizer('zh')
                except Exception as e:
                    logger.warning(f"Could not initialize Chinese phonemizer: {e}")
            else:
                logger.info("Initializing phonemizer for English...")
                try:
                    initialize_phonemizer('en-us')
                except Exception as e:
                    logger.warning(f"Could not initialize English phonemizer: {e}")

            # Download voice files - require at least one voice
            try:
                downloaded_voices = download_voice_files(repo_version=repo_version, required_count=1)
            except ValueError as e:
                logger.error(f"Error: Voice files download failed: {e}")
                raise ValueError("Voice files download failed") from e

            # Validate language code
            supported_codes = list(LANGUAGE_CODES.keys())
            if lang_code not in supported_codes:
                logger.warning(f"Unsupported language code '{lang_code}'. Using 'a' (American English).")
                logger.info(f"Supported language codes: {', '.join(supported_codes)}")
                lang_code = 'a'

            # Initialize pipeline with validated language code.
            # Patch json.load only for the duration of pipeline construction,
            # and only if nobody patched it already.
            patch_applied_here = not _patches_applied['json_load']
            if patch_applied_here:
                patch_json_load()
            try:
                pipeline_instance = EnhancedKPipeline(lang_code=lang_code)
            finally:
                if patch_applied_here:
                    restore_json_load()

            if pipeline_instance is None:
                raise ValueError("Failed to initialize EnhancedKPipeline - pipeline is None")

            # Store language code and device
            pipeline_instance.lang_code = lang_code
            pipeline_instance.device = device

            # Try to load the first available voice with improved error handling
            voice_loaded = False
            matching_voice_files = [
                voice_file for voice_file in downloaded_voices
                if get_language_code_from_voice(Path(voice_file).stem) == lang_code
            ]
            if not matching_voice_files:
                logger.warning(
                    "No voice files matched language code '%s'; falling back to any downloaded voice",
                    lang_code
                )
            for voice_file in matching_voice_files or downloaded_voices:
                voice_path = os.path.abspath(os.path.join("voices", voice_file))
                if os.path.exists(voice_path):
                    try:
                        pipeline_instance.load_voice(voice_path)
                        logger.info(f"Successfully loaded voice: {voice_file}")
                        voice_loaded = True
                        break  # Successfully loaded a voice
                    except Exception as e:
                        logger.warning(f"Warning: Failed to load voice {voice_file}: {e}")
                        continue

            if not voice_loaded:
                logger.warning("Warning: Could not load any voice models")

            # Set the global _pipeline only after successful initialization
            _pipeline = pipeline_instance
        except Exception as e:
            logger.error(f"Error initializing pipeline: {e}")
            raise

    return _pipeline


def list_available_voices() -> List[str]:
    """List all available voice models"""
    # Always use absolute path for consistency
    voices_dir = Path(os.path.abspath("voices"))

    # Create voices directory if it doesn't exist
    if not voices_dir.exists():
        print(f"Creating voices directory at {voices_dir}")
        voices_dir.mkdir(exist_ok=True)
        return []

    # Get all .pt files in the voices directory
    voice_files = list(voices_dir.glob("*.pt"))

    # If we found voice files, return them
    if voice_files:
        return [f.stem for f in sorted(voice_files, key=lambda f: f.stem.lower())]

    # If no voice files in standard location, check if we need to do a one-time migration
    # This is legacy support for older installations
    alt_voices_path = Path(".") / "voices"
    if alt_voices_path.exists() and alt_voices_path.is_dir() and alt_voices_path != voices_dir:
        print(f"Checking alternative voice location: {alt_voices_path.absolute()}")
        alt_voice_files = list(alt_voices_path.glob("*.pt"))
        if alt_voice_files:
            print(f"Found {len(alt_voice_files)} voice files in alternate location")
            print("Moving files to the standard voices directory...")

            # Process files in a batch for efficiency
            files_moved = 0
            for voice_file in alt_voice_files:
                target_path = voices_dir / voice_file.name
                if not target_path.exists():
                    try:
                        # Use copy2 to preserve metadata, then remove original if successful
                        shutil.copy2(str(voice_file), str(target_path))
                        files_moved += 1
                    except (OSError, IOError) as e:
                        print(f"Error copying {voice_file.name}: {e}")

            if files_moved > 0:
                print(f"Successfully moved {files_moved} voice files")
                return [f.stem for f in sorted(voices_dir.glob("*.pt"), key=lambda f: f.stem.lower())]

    print("No voice files found. Please run the application again to download voices.")
    return []


def get_language_code_from_voice(voice_name: str) -> str:
    """Get the appropriate language code from a voice name

    Args:
        voice_name: Name of the voice (e.g., 'af_bella', 'jf_alpha')

    Returns:
        Language code for the voice
    """
    prefix = voice_name[:2].lower() if len(voice_name) >= 2 else 'af'
    return VOICE_PREFIX_TO_LANGUAGE_CODE.get(prefix, 'a')  # Default to American English


def load_voice(voice_name: str, device: str) -> torch.Tensor:
    """Load a voice model in a thread-safe manner

    Args:
        voice_name: Name of the voice to load (with or without .pt extension)
        device: Device to use ('cuda' or 'cpu')

    Returns:
        Loaded voice model tensor

    Raises:
        ValueError: If voice file not found or loading fails
    """
    # Format voice path correctly - strip .pt if it was included
    voice_name = voice_name.replace('.pt', '')
    pipeline = build_model(None, device, lang_code=get_language_code_from_voice(voice_name))

    voice_path = Path("voices").resolve() / f"{voice_name}.pt"
    if not voice_path.exists():
        raise ValueError(f"Voice file not found: {voice_path}")

    # Use a lock to ensure thread safety when loading voices
    with _pipeline_lock:
        # Check if voice is already loaded
        if voice_name in pipeline.voices:
            return pipeline.voices[voice_name]
        # Load voice if not already loaded
        return pipeline.load_voice(str(voice_path))


def generate_speech(
    model: EnhancedKPipeline,
    text: str,
    voice: str,
    lang: str = 'a',
    device: str = 'cpu',
    speed: float = 1.0
) -> Tuple[Optional[torch.Tensor], Optional[str]]:
    """Generate speech using the Kokoro pipeline in a thread-safe manner

    Args:
        model: EnhancedKPipeline instance
        text: Text to synthesize
        voice: Voice name (e.g. 
'af_bella')
        lang: Language code ('a' for American English, 'b' for British English)
        device: Device to use ('cuda' or 'cpu')
        speed: Speech speed multiplier (default: 1.0)

    Returns:
        Tuple of (audio tensor, phonemes string) or (None, None) on error
    """
    global _pipeline_lock
    try:
        if model is None:
            raise ValueError("Model is None - pipeline not properly initialized")

        # Format voice name and path
        voice_name = voice.replace('.pt', '')
        voice_path = Path("voices").resolve() / f"{voice_name}.pt"

        # Check if voice file exists
        if not voice_path.exists():
            raise ValueError(f"Voice file not found: {voice_path}")

        # Thread-safe initialization of model properties and voice loading
        with _pipeline_lock:
            # Ensure device is set
            model.device = device

            # Ensure voice is loaded before generating
            if voice_name not in model.voices:
                logger.info(f"Loading voice {voice_name}...")
                try:
                    model.load_voice(str(voice_path))
                    if voice_name not in model.voices:
                        raise ValueError("Voice load succeeded but voice not in model.voices dictionary")
                except Exception as e:
                    raise ValueError(f"Failed to load voice {voice_name}: {e}")

        # Generate speech (outside the lock for better concurrency)
        logger.info(f"Generating speech with device: {model.device}")
        generator = model(
            text,
            voice=str(voice_path),
            speed=speed,
            split_pattern=r'\n+'
        )

        # Get first generated segment and convert numpy array to tensor if needed.
        # NOTE(review): only the first non-None segment is returned; remaining
        # segments from the generator are discarded - confirm this is intended.
        for gs, ps, audio in generator:
            if audio is not None:
                if isinstance(audio, np.ndarray):
                    audio = torch.from_numpy(audio).float()
                return audio, ps

        return None, None
    except (ValueError, FileNotFoundError, RuntimeError, KeyError, AttributeError, TypeError) as e:
        logger.error(f"Error generating speech: {e}")
        return None, None
    except Exception as e:
        logger.error(f"Unexpected error during speech generation: {e}")
        import traceback
        traceback.print_exc()
        return None, None



================================================
FILE: requirements.txt
================================================
kokoro # Official Kokoro TTS library (v1.0 model support)
misaki # G2P library for Kokoro (multi-language support)
torch # PyTorch for model inference (for GPU support, see README.md for CUDA-specific installation)
torchaudio # PyTorch audio processing library
soundfile # Audio file handling
huggingface-hub # Model downloads from Hugging Face
gradio # Web interface
pydub # For audio format conversion
espeakng-loader # For loading espeak-ng library
phonemizer-fork # For phoneme generation
wheel # For building packages
setuptools # For installing packages
maturin # Build dependency for underthesea-core
num2words # For number to word conversion
spacy # For text processing
tqdm # Progress bars
psutil # System and process monitoring
packaging # Version parsing for dependency checking
numpy<2.0 # Numerical computing
underthesea

# Japan Language Libraries
fugashi[unidic]
jaconv
mojimoji
pyopenjtalk

# Korean Language Libraries
jamo
nltk

# Mandarin Language Libraries
cn2an
jieba
ordered-set
pypinyin
pypinyin-dict

# Hebrew Language Libraries
https://files.pythonhosted.org/packages/44/17/9efdef222f2fc8e1ca721d919738d69d8b2358554a99f27b0764905f60fd/mishkal_hebrew-0.3.2-py3-none-any.whl



================================================
FILE: setup_chinese_tts.py
================================================
"""
Setup Script for Kokoro Chinese TTS
===================================

This script downloads and sets up the Kokoro-v1.1-zh Chinese TTS model
and all required voice files.
Usage:
    python setup_chinese_tts.py
"""

import os
import sys
from pathlib import Path
import logging
from typing import List, Tuple

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
CHINESE_MODEL_FILE = "kokoro-v1_1-zh.pth"
CONFIG_FILE = "config.json"
VOICES_DIR = Path("voices").resolve()

CHINESE_VOICES = [
    # Female voices
    "zf_xiaobei.pt", "zf_xiaoni.pt", "zf_xiaoxiao.pt", "zf_xiaoyi.pt",
    # Male voices
    "zm_yunjian.pt", "zm_yunxi.pt", "zm_yunxia.pt", "zm_yunyang.pt"
]


def print_header():
    """Print setup header"""
    print("\n" + "="*60)
    print(" Kokoro-82M-v1.1 Chinese TTS Setup")
    print(" 科克罗中文TTS设置")
    print("="*60 + "\n")


def check_dependencies() -> bool:
    """Check if required packages are installed.

    Returns:
        True when every required import succeeds, False otherwise.
    """
    print("检查依赖 (Checking dependencies)...")

    required_packages = {
        'torch': 'PyTorch',
        'huggingface_hub': 'Hugging Face Hub',
        'kokoro': 'Kokoro',
        'soundfile': 'SoundFile'
    }

    missing = []
    for package, name in required_packages.items():
        try:
            __import__(package)
            print(f" ✓ {name}")
        except ImportError:
            print(f" ✗ {name}")
            missing.append(package)

    if missing:
        print(f"\n缺少必需的包 (Missing packages): {', '.join(missing)}")
        print("请运行: pip install -r requirements.txt")
        return False

    print("✓ 所有依赖已安装 (All dependencies installed)\n")
    return True


def download_file(repo_id: str, filename: str, local_dir: str = ".") -> bool:
    """Download a file from Hugging Face Hub

    Args:
        repo_id: Repository ID (e.g., "hexgrad/Kokoro-82M")
        filename: File to download
        local_dir: Local directory to save to

    Returns:
        True if successful, False otherwise
    """
    try:
        from huggingface_hub import hf_hub_download

        # NOTE(review): the literal "(unknown)" below looks like it should be
        # an interpolated {filename} - confirm against upstream history.
        print(f"下载 (Downloading): (unknown)...")

        # Download the file
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=local_dir,
            force_download=False
        )

        print(f" ✓ 完成 (Done): (unknown)")
        return True
    except Exception as e:
        print(f" ✗ 错误 (Error): {e}")
        return False


def download_model() -> bool:
    """Download the Chinese TTS model"""
    print("\n下载中文TTS模型 (Downloading Chinese TTS Model)...")
    print("-" * 60)

    model_path = Path(CHINESE_MODEL_FILE).resolve()

    # Check if already exists
    if model_path.exists():
        size_mb = model_path.stat().st_size / (1024 * 1024)
        print(f"✓ 模型文件已存在 (Model already exists): {model_path}")
        print(f" 大小 (Size): {size_mb:.1f} MB")
        return True

    # Download
    success = download_file(
        "hexgrad/Kokoro-82M-v1.1-zh",
        CHINESE_MODEL_FILE,
        local_dir="."
    )

    if success and model_path.exists():
        size_mb = model_path.stat().st_size / (1024 * 1024)
        print(f"✓ 模型已下载 (Model downloaded): {size_mb:.1f} MB\n")
        return True
    else:
        print(f"✗ 模型下载失败 (Model download failed)\n")
        return False


def download_config() -> bool:
    """Download the model configuration file"""
    print("下载配置文件 (Downloading Config File)...")
    print("-" * 60)

    config_path = Path(CONFIG_FILE).resolve()

    # Check if already exists
    if config_path.exists():
        print(f"✓ 配置文件已存在 (Config already exists): {config_path}")
        return True

    # Download
    success = download_file(
        "hexgrad/Kokoro-82M",
        CONFIG_FILE,
        local_dir="."
    )

    if success and config_path.exists():
        print(f"✓ 配置文件已下载 (Config downloaded)\n")
        return True
    else:
        print(f"✗ 配置文件下载失败 (Config download failed)\n")
        return False


def download_voices() -> Tuple[int, int]:
    """Download all Chinese voice files

    Returns:
        Tuple of (successful_downloads, failed_downloads)
    """
    print("下载中文声音文件 (Downloading Chinese Voice Files)...")
    print("-" * 60)

    # Create voices directory
    VOICES_DIR.mkdir(parents=True, exist_ok=True)

    successful = 0
    failed = 0

    for voice_file in CHINESE_VOICES:
        voice_path = VOICES_DIR / voice_file

        # Check if already exists
        if voice_path.exists():
            size_mb = voice_path.stat().st_size / (1024 * 1024)
            print(f"✓ {voice_file} ({size_mb:.1f} MB)")
            successful += 1
            continue

        # Download
        try:
            from huggingface_hub import hf_hub_download

            print(f"下载 (Downloading): {voice_file}...")
            downloaded_path = hf_hub_download(
                repo_id="hexgrad/Kokoro-82M",
                filename=f"voices/{voice_file}",
                local_dir=str(VOICES_DIR.parent),
                force_download=False
            )

            size_mb = Path(downloaded_path).stat().st_size / (1024 * 1024)
            print(f" ✓ 完成 (Done): {voice_file} ({size_mb:.1f} MB)")
            successful += 1
        except Exception as e:
            print(f" ✗ 错误 (Error): {voice_file} - {e}")
            failed += 1

    print(f"\n✓ 成功: {successful}/{len(CHINESE_VOICES)} (Successful: {successful}/{len(CHINESE_VOICES)})")
    if failed > 0:
        print(f"✗ 失败: {failed}/{len(CHINESE_VOICES)} (Failed: {failed}/{len(CHINESE_VOICES)})")
    print()

    return successful, failed


def verify_setup() -> bool:
    """Verify that all required files are in place"""
    print("验证设置 (Verifying Setup)...")
    print("-" * 60)

    all_good = True

    # Check model
    model_path = Path(CHINESE_MODEL_FILE).resolve()
    if model_path.exists():
        print(f"✓ 中文模型 (Chinese Model): {CHINESE_MODEL_FILE}")
    else:
        print(f"✗ 缺少模型 (Missing Model): {CHINESE_MODEL_FILE}")
        all_good = False

    # Check config
    config_path = Path(CONFIG_FILE).resolve()
    if config_path.exists():
        print(f"✓ 配置文件 (Config File): {CONFIG_FILE}")
    else:
        print(f"✗ 缺少配置 (Missing Config): {CONFIG_FILE}")
        all_good = False

    # Check voices
    print(f"\n中文声音文件 (Chinese Voice Files):")
    voice_count = 0
    for voice_file in CHINESE_VOICES:
        voice_path = VOICES_DIR / voice_file
        if voice_path.exists():
            print(f" ✓ {voice_file}")
            voice_count += 1
        else:
            print(f" ✗ {voice_file}")
            all_good = False

    print(f"\n✓ 已找到 {voice_count}/{len(CHINESE_VOICES)} 个声音文件")
    print(f"(Found {voice_count}/{len(CHINESE_VOICES)} voice files)\n")

    return all_good


def print_summary(success: bool, model_ok: bool, config_ok: bool, voices_count: int):
    """Print setup summary"""
    print("="*60)
    print(" 设置摘要 (Setup Summary)")
    print("="*60)

    if success:
        print("\n✓ 设置完成!(Setup Complete!)")
        print("\n下一步 (Next Steps):")
        print("1. 运行演示: python chinese_tts_demo.py")
        print(" (Run demo: python chinese_tts_demo.py)")
    else:
        print("\n⚠ 设置未完成 (Setup Incomplete)")
        print("\n缺少的文件 (Missing Files):")
        if not model_ok:
            print(f" - {CHINESE_MODEL_FILE}")
        if not config_ok:
            print(f" - {CONFIG_FILE}")
        if voices_count < len(CHINESE_VOICES):
            print(f" - 声音文件 ({voices_count}/{len(CHINESE_VOICES)}) (Voice files)")

    print("\n"+"="*60 + "\n")


def main():
    """Main setup function"""
    print_header()

    # Check dependencies
    if not check_dependencies():
        print("请先安装依赖 (Please install dependencies first)")
        return False

    # Download files
    model_ok = download_model()
    config_ok = download_config()
    voice_success, voice_failed = download_voices()

    # Verify setup
    print()
    setup_ok = verify_setup()

    # Summary
    print_summary(
        setup_ok,
        model_ok,
        config_ok,
        voice_success
    )

    return setup_ok


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\n设置被用户中止 (Setup interrupted by user)")
        sys.exit(1)
    except Exception as e:
        logger.error(f"设置错误 (Setup error): {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)



================================================
FILE: speed_dial.py
================================================
"""
Speed Dial Module for Kokoro-TTS-Local
--------------------------------------
Manages speed dial presets for quick
access to frequently used voice and text combinations. This module provides functions to: - Load speed dial presets from a JSON file - Save new presets to the JSON file - Delete presets from the JSON file - Validate preset data """ import json import os from pathlib import Path from typing import Dict, List, Optional, Any # Define the path for the speed dial presets file SPEED_DIAL_FILE = Path("speed_dial.json") def load_presets() -> Dict[str, Dict[str, Any]]: """ Load speed dial presets from the JSON file. Returns: Dictionary of presets where keys are preset names and values are preset data """ if not SPEED_DIAL_FILE.exists(): # If file doesn't exist, return an empty dictionary return {} try: with open(SPEED_DIAL_FILE, 'r', encoding='utf-8') as f: presets = json.load(f) if not isinstance(presets, dict): print( "Error loading speed dial presets: " f"expected a JSON object, got {type(presets).__name__}" ) return {} # Validate the loaded presets validated_presets = {} for name, preset in presets.items(): if not isinstance(name, str) or not isinstance(preset, dict): print(f"Skipping invalid preset entry: {name!r}") continue if validate_preset(preset): validated_presets[name] = preset return validated_presets except (json.JSONDecodeError, IOError) as e: print(f"Error loading speed dial presets: {e}") return {} def save_preset(name: str, voice: str, text: str, format: str = "wav", speed: float = 1.0) -> bool: """ Save a new speed dial preset. 
Args: name: Name of the preset voice: Voice to use text: Text to convert to speech format: Output format (default: "wav") speed: Speech speed (default: 1.0) Returns: True if successful, False otherwise """ import re # Validate preset name if not isinstance(name, str) or len(name.strip()) == 0: print("Preset name must be a non-empty string") return False if len(name) > 50: print("Preset name is too long (max 50 characters)") return False # Only allow safe characters in preset names if not re.match(r'^[a-zA-Z0-9_\- ]+$', name): print("Preset name contains invalid characters") return False # Create preset data preset = { "voice": voice, "text": text, "format": format, "speed": speed } # Validate preset data if not validate_preset(preset): return False # Load existing presets presets = load_presets() # Add or update the preset presets[name] = preset # Save presets to file try: with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f: json.dump(presets, f, indent=2, ensure_ascii=False) return True except IOError as e: print(f"Error saving speed dial preset: {e}") return False def delete_preset(name: str) -> bool: """ Delete a speed dial preset. Args: name: Name of the preset to delete Returns: True if successful, False otherwise """ # Load existing presets presets = load_presets() # Check if preset exists if name not in presets: return False # Remove the preset del presets[name] # Save presets to file try: with open(SPEED_DIAL_FILE, 'w', encoding='utf-8') as f: json.dump(presets, f, indent=2, ensure_ascii=False) return True except IOError as e: print(f"Error deleting speed dial preset: {e}") return False def validate_preset(preset: Dict[str, Any]) -> bool: """ Validate a preset's data structure with security checks. 
Args: preset: Preset data to validate Returns: True if valid, False otherwise """ import re # Check required fields required_fields = ["voice", "text"] for field in required_fields: if field not in preset: print(f"Preset missing required field: {field}") return False # Check field types and validate content voice = preset.get("voice") if not isinstance(voice, str): print("Preset voice must be a string") return False # Validate voice name (alphanumeric, underscore, dash only) if not re.match(r'^[a-zA-Z0-9_-]+$', voice): print("Preset voice contains invalid characters") return False text = preset.get("text") if not isinstance(text, str): print("Preset text must be a string") return False # Validate text length and content if len(text) > 10000: print("Preset text is too long (max 10,000 characters)") return False if len(text.strip()) == 0: print("Preset text cannot be empty") return False # Optional fields with validation if "format" not in preset: preset["format"] = "wav" else: format_val = preset["format"] if not isinstance(format_val, str): print("Preset format must be a string") return False # Only allow safe audio formats if format_val not in ["wav", "mp3", "aac"]: print("Preset format must be wav, mp3, or aac") return False if "speed" not in preset: preset["speed"] = 1.0 else: speed = preset["speed"] if not isinstance(speed, (int, float)): print("Preset speed must be a number") return False # Validate speed range if speed < 0.1 or speed > 3.0: print("Preset speed must be between 0.1 and 3.0") return False return True def get_preset_names() -> List[str]: """ Get a list of all preset names. Returns: List of preset names """ presets = load_presets() return list(presets.keys()) def get_preset(name: str) -> Optional[Dict[str, Any]]: """ Get a specific preset by name. 
Args: name: Name of the preset to get Returns: Preset data or None if not found """ presets = load_presets() return presets.get(name) ================================================ FILE: test_offline.py ================================================ #!/usr/bin/env python3 """ Test script for verifying offline mode functionality of Kokoro-TTS-Local """ import os import sys from pathlib import Path import torch # Constants REQUIRED_FILES = { 'model': 'kokoro-v1_0.pth', 'config': 'config.json', 'voices_dir': 'voices' } DEFAULT_TEST_TEXT = "Hello, this is a test of offline mode." DEFAULT_VOICE = "af_bella" TEST_OUTPUT = "test_offline_output.wav" def print_header(text: str): """Print a formatted header""" print("\n" + "=" * 60) print(f" {text}") print("=" * 60) def print_status(item: str, status: bool, details: str = ""): """Print a status line with check or cross mark""" mark = "[PASS]" if status else "[FAIL]" print(f" {mark} {item}") if details: print(f" {details}") def check_offline_mode() -> bool: """Check if offline mode is enabled""" print_header("Checking Offline Mode Configuration") hf_offline = os.environ.get("HF_HUB_OFFLINE", "0") transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE", "0") offline_enabled = hf_offline == "1" or transformers_offline == "1" print_status("HF_HUB_OFFLINE", hf_offline == "1", f"Value: {hf_offline}") print_status("TRANSFORMERS_OFFLINE", transformers_offline == "1", f"Value: {transformers_offline}") print_status("Offline Mode Status", offline_enabled, "Enabled" if offline_enabled else "Not enabled - will attempt network access") return offline_enabled def check_required_files() -> dict: """Check if all required files exist""" print_header("Checking Required Files") results = {} # Check model file model_path = Path(REQUIRED_FILES['model']).resolve() model_exists = model_path.exists() results['model'] = model_exists print_status("Model file", model_exists, str(model_path)) # Check config file config_path = 
Path(REQUIRED_FILES['config']).resolve() config_exists = config_path.exists() results['config'] = config_exists print_status("Config file", config_exists, str(config_path)) # Check voices directory voices_dir = Path(REQUIRED_FILES['voices_dir']).resolve() voices_exists = voices_dir.exists() and voices_dir.is_dir() results['voices_dir'] = voices_exists print_status("Voices directory", voices_exists, str(voices_dir)) # Check for voice files if voices_exists: voice_files = list(voices_dir.glob("*.pt")) results['voice_count'] = len(voice_files) has_voices = len(voice_files) > 0 print_status("Voice files", has_voices, f"Found {len(voice_files)} voice file(s)") # Check for default voice default_voice_path = voices_dir / f"{DEFAULT_VOICE}.pt" default_voice_exists = default_voice_path.exists() results['default_voice'] = default_voice_exists print_status(f"Default voice ({DEFAULT_VOICE})", default_voice_exists, str(default_voice_path) if default_voice_exists else "Not found") else: results['voice_count'] = 0 results['default_voice'] = False print_status("Voice files", False, "Voices directory not found") return results def check_dependencies() -> dict: """Check if required Python packages are installed""" print_header("Checking Dependencies") results = {} required_packages = { 'torch': 'PyTorch', 'kokoro': 'Kokoro TTS', 'soundfile': 'SoundFile', 'numpy': 'NumPy', 'tqdm': 'tqdm' } for package, name in required_packages.items(): try: __import__(package) results[package] = True print_status(name, True, f"Package '{package}' installed") except ImportError: results[package] = False print_status(name, False, f"Package '{package}' not found") # Check CUDA availability cuda_available = torch.cuda.is_available() results['cuda'] = cuda_available print_status("CUDA Support", cuda_available, "GPU acceleration available" if cuda_available else "Using CPU") return results def test_model_initialization() -> bool: """Test if model can be initialized""" print_header("Testing Model 
Initialization") try: from models import build_model device = 'cuda' if torch.cuda.is_available() else 'cpu' print(f" Using device: {device}") model_path = Path(REQUIRED_FILES['model']).resolve() print(f" Model path: {model_path}") # Build model print(" Initializing model...") model = build_model(str(model_path), device) if model is None: print_status("Model initialization", False, "Model returned None") return False print_status("Model initialization", True, "Model loaded successfully") return True except Exception as e: print_status("Model initialization", False, f"Error: {type(e).__name__}: {str(e)}") return False def test_voice_listing() -> bool: """Test if voices can be listed""" print_header("Testing Voice Listing") try: from models import list_available_voices voices = list_available_voices() if not voices: print_status("Voice listing", False, "No voices found") return False print_status("Voice listing", True, f"Found {len(voices)} voice(s)") print("\n Available voices:") for i, voice in enumerate(voices[:10], 1): # Show first 10 print(f" {i}. {voice}") if len(voices) > 10: print(f" ... 
and {len(voices) - 10} more") return True except Exception as e: print_status("Voice listing", False, f"Error: {type(e).__name__}: {str(e)}") return False def test_speech_generation() -> bool: """Test if speech can be generated""" print_header("Testing Speech Generation") try: from models import build_model, list_available_voices import soundfile as sf import numpy as np # Get available voices voices = list_available_voices() if not voices: print_status("Speech generation", False, "No voices available") return False # Use default voice if available, otherwise use first voice voice = DEFAULT_VOICE if DEFAULT_VOICE in voices else voices[0] print(f" Using voice: {voice}") print(f" Test text: '{DEFAULT_TEST_TEXT}'") # Initialize model device = 'cuda' if torch.cuda.is_available() else 'cpu' model = build_model(str(Path(REQUIRED_FILES['model']).resolve()), device) if model is None: print_status("Speech generation", False, "Failed to load model") return False # Generate speech print(" Generating speech...") voice_path = Path("voices").resolve() / f"{voice}.pt" all_audio = [] generator = model(DEFAULT_TEST_TEXT, voice=str(voice_path), speed=1.0, split_pattern=r'\n+') for gs, ps, audio in generator: if audio is not None: audio_tensor = audio if isinstance(audio, torch.Tensor) else torch.from_numpy(audio).float() all_audio.append(audio_tensor) if not all_audio: print_status("Speech generation", False, "No audio generated") return False # Concatenate audio segments if len(all_audio) == 1: final_audio = all_audio[0] else: final_audio = torch.cat(all_audio, dim=0) # Save test output output_path = Path(TEST_OUTPUT).resolve() sf.write(str(output_path), final_audio.numpy(), 24000) if not output_path.exists(): print_status("Speech generation", False, "Failed to save output file") return False file_size = output_path.stat().st_size print_status("Speech generation", True, f"Generated {file_size:,} bytes to {output_path.name}") return True except Exception as e: print_status("Speech 
generation", False, f"Error: {type(e).__name__}: {str(e)}") import traceback traceback.print_exc() return False def cleanup(): """Clean up test files""" try: output_path = Path(TEST_OUTPUT) if output_path.exists(): output_path.unlink() print(f"\n Cleaned up test file: {TEST_OUTPUT}") except Exception as e: print(f"\n Warning: Could not clean up test file: {e}") def main(): """Run all offline mode tests""" print("\n" + "=" * 60) print(" KOKORO TTS OFFLINE MODE TEST") print("=" * 60) # Track test results tests_passed = 0 tests_failed = 0 # Check offline mode offline_enabled = check_offline_mode() if not offline_enabled: print("\n[WARNING] Offline mode is not enabled!") print(" To enable offline mode, set the environment variable:") print(" Linux/macOS: export HF_HUB_OFFLINE=1") print(" Windows PS: $env:HF_HUB_OFFLINE=\"1\"") print(" Windows CMD: set HF_HUB_OFFLINE=1") print("\n Continuing tests (may require network access)...\n") # Check required files file_results = check_required_files() all_files_present = all([ file_results.get('model', False), file_results.get('config', False), file_results.get('voices_dir', False), file_results.get('voice_count', 0) > 0 ]) if not all_files_present: print("\n[PREREQUISITE FAILED] Required files are missing") print(" Please run the application with network access first to download:") print(" - Model file (kokoro-v1_0.pth)") print(" - Config file (config.json)") print(" - At least one voice file in voices/ directory") print("\n Run: python tts_demo.py") print(" or: python gradio_interface.py") return 1 # Check dependencies dep_results = check_dependencies() all_deps_present = all([ dep_results.get('torch', False), dep_results.get('kokoro', False), dep_results.get('soundfile', False), dep_results.get('numpy', False), dep_results.get('tqdm', False) ]) if not all_deps_present: print("\n[PREREQUISITE FAILED] Required dependencies are missing") print(" Please install required packages:") print(" pip install -r requirements.txt") return 
1 # Test model initialization if test_model_initialization(): tests_passed += 1 else: tests_failed += 1 # Test voice listing if test_voice_listing(): tests_passed += 1 else: tests_failed += 1 # Test speech generation if test_speech_generation(): tests_passed += 1 else: tests_failed += 1 # Print summary print_header("Test Summary") total_tests = tests_passed + tests_failed print(f" Total tests: {total_tests}") print(f" Passed: {tests_passed}") print(f" Failed: {tests_failed}") if tests_failed == 0: print("\n[SUCCESS] All tests passed!") print(" Your offline setup is working correctly.") cleanup() return 0 else: print(f"\n[FAILURE] {tests_failed} test(s) failed") print(" Please review the errors above and fix any issues.") return 1 if __name__ == "__main__": try: exit_code = main() sys.exit(exit_code) except KeyboardInterrupt: print("\n\nTest interrupted by user") cleanup() sys.exit(130) except Exception as e: print(f"\n\n[UNEXPECTED ERROR] {type(e).__name__}: {str(e)}") import traceback traceback.print_exc() cleanup() sys.exit(1) ================================================ FILE: tts_demo.py ================================================ import torch from typing import Optional, Tuple, List, Union from models import build_model, generate_speech, list_available_voices from tqdm.auto import tqdm import soundfile as sf from pathlib import Path import numpy as np import time import os import sys # Define path type for consistent handling PathLike = Union[str, Path] # Constants MAX_TEXT_LENGTH = 10000 MAX_GENERATION_TIME = 300 # seconds MIN_GENERATION_TIME = 60 # seconds DEFAULT_SAMPLE_RATE = 24000 MIN_SPEED = 0.1 MAX_SPEED = 3.0 DEFAULT_SPEED = 1.0 MAX_RETRIES = 3 RETRY_DELAY = 2 # seconds # Constants with validation def validate_sample_rate(rate: int) -> int: """Validate sample rate is within acceptable range""" valid_rates = [16000, 22050, 24000, 44100, 48000] if rate not in valid_rates: print(f"Warning: Unusual sample rate {rate}. 
Valid rates are {valid_rates}") return 24000 # Default to safe value return rate def validate_language(lang: str) -> str: """Validate language code""" # Import here to avoid circular imports from models import LANGUAGE_CODES valid_langs = list(LANGUAGE_CODES.keys()) if lang not in valid_langs: print(f"Warning: Invalid language code '{lang}'. Using 'a' (American English).") print(f"Supported language codes: {', '.join(valid_langs)}") return 'a' # Default to American English return lang # Define and validate constants SAMPLE_RATE = validate_sample_rate(24000) DEFAULT_MODEL_PATH = Path('kokoro-v1_0.pth').resolve() DEFAULT_OUTPUT_FILE = Path('output.wav').resolve() DEFAULT_LANGUAGE = validate_language('a') # 'a' for American English, 'b' for British English DEFAULT_TEXT = "Hello, welcome to this text-to-speech test." # Ensure output directory exists DEFAULT_OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True) # Configure tqdm for better Windows console support tqdm.monitor_interval = 0 def print_menu(): """Print the main menu options.""" print("\n=== Kokoro TTS Menu ===") print("1. List available voices") print("2. Generate speech") print("3. Exit") return input("Select an option (1-3): ").strip() def select_voice(voices: List[str]) -> str: """Interactive voice selection.""" print("\nAvailable voices:") for i, voice in enumerate(voices, 1): print(f"{i}. {voice}") while True: try: choice = input("\nSelect a voice number (or press Enter for default 'af_bella'): ").strip() if not choice: return "af_bella" choice = int(choice) if 1 <= choice <= len(voices): return voices[choice - 1] print("Invalid choice. 
Please try again.") except ValueError: print("Please enter a valid number.") def get_text_input() -> str: """Get text input from user.""" print("\nEnter the text you want to convert to speech") print("(or press Enter for default text)") text = input("> ").strip() return text if text else DEFAULT_TEXT def get_speed() -> float: """Get speech speed from user.""" while True: try: speed = input(f"\nEnter speech speed ({MIN_SPEED}-{MAX_SPEED}, default {DEFAULT_SPEED}): ").strip() if not speed: return DEFAULT_SPEED speed = float(speed) if MIN_SPEED <= speed <= MAX_SPEED: return speed print(f"Speed must be between {MIN_SPEED} and {MAX_SPEED}") except ValueError: print("Please enter a valid number.") def save_audio_with_retry(audio_data: np.ndarray, sample_rate: int, output_path: PathLike, max_retries: int = MAX_RETRIES, retry_delay: float = RETRY_DELAY) -> bool: """ Attempt to save audio data to file with retry logic. Args: audio_data: Audio data as numpy array sample_rate: Sample rate in Hz output_path: Path to save the audio file max_retries: Maximum number of retry attempts retry_delay: Delay between retries in seconds Returns: True if successful, False otherwise """ # Convert and normalize path to Path object output_path = Path(output_path).resolve() # Create parent directory if it doesn't exist output_path.parent.mkdir(parents=True, exist_ok=True) # Try to remove the file if it exists to avoid "file in use" issues try: if output_path.exists(): print(f"Removing existing file: {output_path}") output_path.unlink() except Exception as e: print(f"Warning: Could not remove existing file: {e}") print("This might indicate the file is in use by another program.") for attempt in range(max_retries): try: # Validate audio data before saving if audio_data is None or len(audio_data) == 0: raise ValueError("Empty audio data") # Check write permissions for the directory if not os.access(str(output_path.parent), os.W_OK): raise PermissionError(f"No write permission for directory: 
{output_path.parent}")
            # Write to a sibling temp file first so a failed write never
            # leaves a truncated output.wav behind.
            temp_path = output_path.with_name(f"temp_{output_path.name}")
            # Save audio file to temporary location
            print(f"Saving audio to temporary file: {temp_path}")
            sf.write(str(temp_path), audio_data, sample_rate)
            # If successful, rename to final location
            if temp_path.exists():
                # Remove target file if it exists
                if output_path.exists():
                    output_path.unlink()
                # Rename temp file to target file
                temp_path.rename(output_path)
                print(f"Successfully renamed temporary file to: {output_path}")
                return True
        except (IOError, PermissionError) as e:
            # Likely "file in use" on Windows: wait and retry, or give up on
            # the last attempt.
            if attempt < max_retries - 1:
                print(f"\nFailed to save audio (attempt {attempt + 1}/{max_retries}): {e}")
                print("The output file might be in use by another program (e.g., media player).")
                print(f"Please close any programs that might be using '{output_path}'")
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"\nError: Could not save audio after {max_retries} attempts: {e}")
                print(f"Please ensure '{output_path}' is not open in any other program and try again.")
                print(f"You might need to restart your computer if the file remains locked.")
                return False
        except Exception as e:
            print(f"\nUnexpected error saving audio: {type(e).__name__}: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                return False
        finally:
            # Clean up temp file if it exists and we failed
            # NOTE(review): this also runs on the success path (before the
            # `return True` completes), but by then the temp file has already
            # been renamed away, so the exists() check makes it a no-op.
            try:
                temp_path = output_path.with_name(f"temp_{output_path.name}")
                if temp_path.exists():
                    temp_path.unlink()
            except Exception as e:
                print(f"Warning: Could not clean up temporary file {temp_path}: {e}")
    return False


def main() -> None:
    """Interactive TTS console loop: list voices, generate speech, exit.

    Builds the model once up front, then serves menu choices until the user
    exits; the finally block aggressively releases model/CUDA resources.
    """
    import psutil
    import gc
    try:
        # Check system memory at startup
        memory = psutil.virtual_memory()
        available_gb = memory.available / (1024**3)
        total_gb = memory.total / (1024**3)
        print(f"System memory: {available_gb:.1f}GB available / {total_gb:.1f}GB total")
        if available_gb < 2.0:
            print("Warning: Low system memory detected. Consider closing other applications.")
        # Force garbage collection
        gc.collect()
        # Set up device safely
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        except (RuntimeError, AttributeError, ImportError) as e:
            print(f"CUDA initialization error: {e}. Using CPU instead.")
            device = 'cpu'  # Fallback if CUDA check fails
        print(f"Using device: {device}")
        # Build model
        print("\nInitializing model...")
        with tqdm(total=1, desc="Building model") as pbar:
            model = build_model(DEFAULT_MODEL_PATH, device)
            pbar.update(1)
        # Cache for voices to avoid redundant calls
        voices_cache = None
        while True:
            choice = print_menu()
            if choice == "1":
                # List voices
                voices_cache = list_available_voices()
                print("\nAvailable voices:")
                for voice in voices_cache:
                    print(f"- {voice}")
            elif choice == "2":
                # Generate speech
                # Use cached voices if available
                if voices_cache is None:
                    voices_cache = list_available_voices()
                if not voices_cache:
                    print("No voices found! Please check the voices directory.")
                    continue
                # Get user inputs
                voice = select_voice(voices_cache)
                text = get_text_input()
                # Dynamic text length validation based on available memory
                memory = psutil.virtual_memory()
                available_gb = memory.available / (1024**3)
                # Adjust max length based on available memory
                dynamic_max_length = MAX_TEXT_LENGTH
                if available_gb < 2.0:
                    dynamic_max_length = min(MAX_TEXT_LENGTH, 3000)
                    print(f"Reduced text limit to {dynamic_max_length} characters due to low memory")
                if len(text) > dynamic_max_length:
                    print(f"Text is too long ({len(text)} chars). Maximum allowed: {dynamic_max_length} characters.")
                    print("Please enter a shorter text.")
                    continue
                speed = get_speed()
                print(f"\nGenerating speech for: '{text}'")
                print(f"Using voice: {voice}")
                print(f"Speed: {speed}x")
                # Generate speech
                all_audio = []
                # Use Path object for consistent path handling
                voice_path = Path("voices").resolve() / f"{voice}.pt"
                # Verify voice file exists
                if not voice_path.exists():
                    print(f"Error: Voice file not found: {voice_path}")
                    continue
                # Set a timeout for generation with per-segment timeout
                max_gen_time = MAX_GENERATION_TIME
                max_segment_time = MIN_GENERATION_TIME
                start_time = time.time()
                segment_start_time = start_time
                try:
                    # Setup watchdog timer for overall process
                    import threading
                    generation_complete = False
                    def watchdog_timer():
                        # Advisory only: the generator cannot be interrupted
                        # from another thread, so this just informs the user.
                        if not generation_complete:
                            print("\nWatchdog: Generation taking too long, process will be cancelled")
                            # Can't directly interrupt generator, but this will inform user
                    # Start watchdog timer
                    watchdog = threading.Timer(max_gen_time, watchdog_timer)
                    watchdog.daemon = True  # Don't prevent program exit
                    watchdog.start()
                    # Initialize generator
                    try:
                        generator = model(text, voice=str(voice_path), speed=speed, split_pattern=r'\n+')
                    except (ValueError, TypeError, RuntimeError) as e:
                        print(f"Error initializing speech generator: {e}")
                        watchdog.cancel()
                        continue
                    except Exception as e:
                        print(f"Unexpected error initializing generator: {type(e).__name__}: {e}")
                        watchdog.cancel()
                        continue
                    # Process segments
                    with tqdm(desc="Generating speech") as pbar:
                        for gs, ps, audio in generator:
                            # Check overall timeout
                            current_time = time.time()
                            if current_time - start_time > max_gen_time:
                                print("\nWarning: Total generation time exceeded limit, stopping")
                                break
                            # Check per-segment timeout
                            segment_elapsed = current_time - segment_start_time
                            if segment_elapsed > max_segment_time:
                                print(f"\nWarning: Segment took too long ({segment_elapsed:.1f}s), stopping")
                                break
                            # Reset segment timer
                            segment_start_time = current_time
                            # Process audio if available
                            if audio is not None:
                                # Only convert if it's a numpy array, not if already tensor
                                audio_tensor = audio if isinstance(audio, torch.Tensor) else torch.from_numpy(audio).float()
                                all_audio.append(audio_tensor)
                                print(f"\nGenerated segment: {gs}")
                                if ps:  # Only print phonemes if available
                                    print(f"Phonemes: {ps}")
                            pbar.update(1)
                    # Mark generation as complete (for watchdog)
                    generation_complete = True
                    watchdog.cancel()
                except ValueError as e:
                    print(f"Value error during speech generation: {e}")
                except RuntimeError as e:
                    print(f"Runtime error during speech generation: {e}")
                    # If CUDA out of memory, provide more helpful message
                    if "CUDA out of memory" in str(e):
                        print("CUDA out of memory error - try using a shorter text or switching to CPU")
                except KeyError as e:
                    print(f"Key error during speech generation: {e}")
                    print("This might be caused by a missing voice configuration")
                except FileNotFoundError as e:
                    print(f"File not found: {e}")
                except Exception as e:
                    print(f"Unexpected error during speech generation: {type(e).__name__}: {e}")
                    import traceback
                    traceback.print_exc()
                # Save audio (partial audio from a timed-out run is still saved)
                if all_audio:
                    try:
                        # Handle single segment case without concatenation
                        if len(all_audio) == 1:
                            final_audio = all_audio[0]
                        else:
                            try:
                                final_audio = torch.cat(all_audio, dim=0)
                            except RuntimeError as e:
                                print(f"Error concatenating audio segments: {e}")
                                continue
                        # Use consistent Path object
                        output_path = DEFAULT_OUTPUT_FILE
                        if isinstance(final_audio, torch.Tensor):
                            final_audio = final_audio.detach().cpu().numpy()
                        if save_audio_with_retry(final_audio, SAMPLE_RATE, output_path):
                            print(f"\nAudio saved to {output_path}")
                            # Play a system beep to indicate completion
                            try:
                                print('\a')  # ASCII bell - should make a sound on most systems
                            except:
                                pass
                        else:
                            print("Failed to save audio file")
                    except Exception as e:
                        print(f"Error processing audio: {type(e).__name__}: {e}")
                else:
                    print("Error: Failed to generate audio")
            elif choice == "3":
                print("\nGoodbye!")
                break
            else:
                print("\nInvalid choice. Please try again.")
    except Exception as e:
        print(f"Error in main: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Comprehensive cleanup with error handling
        # NOTE: the 'model'/'voices_cache' locals() checks depend on these
        # exact variable names above — do not rename them.
        try:
            print("\nPerforming cleanup...")
            # Ensure model is properly released
            if 'model' in locals() and model is not None:
                print("Cleaning up model resources...")
                # First clear any references to voice models
                if hasattr(model, 'voices'):
                    try:
                        voices_count = len(model.voices)
                        model.voices.clear()
                        print(f"Cleared {voices_count} voice references")
                    except Exception as voice_error:
                        print(f"Error clearing voice references: {voice_error}")
                # Clear any other model attributes that might hold references
                try:
                    for attr in list(model.__dict__.keys()):
                        if hasattr(model, attr) and not attr.startswith('__'):
                            try:
                                delattr(model, attr)
                            except:
                                pass
                except Exception as attr_error:
                    print(f"Error clearing model attributes: {attr_error}")
                # Then delete the model
                try:
                    del model
                    model = None
                    print("Model reference deleted")
                except Exception as del_error:
                    print(f"Error deleting model: {del_error}")
            # Clean up voice cache
            if 'voices_cache' in locals() and voices_cache is not None:
                try:
                    voices_cache.clear()
                    voices_cache = None
                    print("Voice cache cleared")
                except Exception as cache_error:
                    print(f"Error clearing voice cache: {cache_error}")
            # Clean up any CUDA resources
            if torch.cuda.is_available():
                try:
                    print("Cleaning up CUDA resources...")
                    torch.cuda.empty_cache()
                    print("CUDA cache emptied")
                except Exception as cuda_error:
                    print(f"Error clearing CUDA cache: {cuda_error}")
            # Final garbage collection
            try:
                import gc
                gc.collect()
                print("Garbage collection completed")
            except Exception as gc_error:
                print(f"Error during garbage collection: {gc_error}")
            print("Cleanup completed")
        except Exception as e:
            print(f"Error during cleanup: {type(e).__name__}: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    main()