Full Code of zhao-kun/VibeVoiceFusion for AI

Repository: zhao-kun/VibeVoiceFusion
Branch: main
Commit: b3766532d8b0
Files: 202
Total size: 10.3 MB

Directory structure:
VibeVoiceFusion/

├── .dockerignore
├── .gitignore
├── CHANGELOG.md
├── CHANGELOG_zh.md
├── Dockerfile
├── README.md
├── README_zh.md
├── backend/
│   ├── .gitignore
│   ├── README.md
│   ├── __init__.py
│   ├── api/
│   │   ├── __init__.py
│   │   ├── datasets.py
│   │   ├── dialog_sessions.py
│   │   ├── generation.py
│   │   ├── openai_compat.py
│   │   ├── preset_voices.py
│   │   ├── projects.py
│   │   ├── quick_generate.py
│   │   ├── speakers.py
│   │   ├── tasks.py
│   │   └── training.py
│   ├── app.py
│   ├── config.py
│   ├── i18n/
│   │   ├── __init__.py
│   │   ├── en.json
│   │   └── zh.json
│   ├── inference/
│   │   ├── inference.py
│   │   └── quick_generate_inference.py
│   ├── run.py
│   ├── scripts/
│   │   ├── generate_cantonese_training_dataset.py
│   │   ├── generate_mcv_cantonese_training_dataset.py
│   │   ├── generate_training_dataset.py
│   │   └── migrate_dataset_paths.py
│   ├── services/
│   │   ├── __init__.py
│   │   ├── dataset_service.py
│   │   ├── dialog_session_service.py
│   │   ├── openai_compat_service.py
│   │   ├── preset_voice_service.py
│   │   ├── project_service.py
│   │   ├── quick_generate_service.py
│   │   ├── speaker_service.py
│   │   ├── training_service.py
│   │   └── voice_gerneration_service.py
│   ├── task_manager/
│   │   ├── inference_task.py
│   │   ├── quick_generate_task.py
│   │   ├── task.py
│   │   └── training_task.py
│   ├── training/
│   │   ├── engine.py
│   │   └── state.py
│   └── utils/
│       ├── __init__.py
│       ├── dialog_validator.py
│       ├── file_handler.py
│       └── tensorboard_reader.py
├── compose.yml
├── config/
│   ├── __init__.py
│   └── configuration_vibevoice.py
├── demo/
│   ├── README_AUDIO_DENOISE.md
│   ├── audio_denoise_deepfilter.py
│   ├── audio_denose.py
│   ├── convert_model.py
│   ├── list_modules.py
│   ├── local_file_inference.py
│   ├── train.py
│   ├── verify_dataset.py
│   ├── view_tensorfile.py
│   └── vram_offload_animation.py
├── docs/
│   ├── APIs.md
│   ├── DATASET_PATH_FIX.md
│   ├── DOCKER_REBUILD.md
│   ├── develop_thoughts.md
│   ├── model_components_analysis.md
│   ├── multi-generation-ui-design.md
│   ├── offloading.md
│   ├── openai-compatible-api.md
│   ├── preset-voice-feature.md
│   ├── processor.md
│   ├── quick-generate-feature.md
│   └── vibevoice_inference_architecture.md
├── frontend/
│   ├── .gitignore
│   ├── README.md
│   ├── app/
│   │   ├── dataset/
│   │   │   ├── detail/
│   │   │   │   └── page.tsx
│   │   │   └── page.tsx
│   │   ├── fine-tuning/
│   │   │   └── page.tsx
│   │   ├── generate-voice/
│   │   │   └── page.tsx
│   │   ├── globals.css
│   │   ├── layout.tsx
│   │   ├── page.tsx
│   │   ├── quick-generate/
│   │   │   └── page.tsx
│   │   ├── speaker-role/
│   │   │   └── page.tsx
│   │   └── voice-editor/
│   │       └── page.tsx
│   ├── components/
│   │   ├── AudioPlayer.tsx
│   │   ├── AudioUploader.tsx
│   │   ├── CreateDatasetModal.tsx
│   │   ├── CurrentGeneration.tsx
│   │   ├── CurrentTraining.tsx
│   │   ├── DatasetCard.tsx
│   │   ├── DatasetItemModal.tsx
│   │   ├── DatasetItemRow.tsx
│   │   ├── DialogEditor.tsx
│   │   ├── DialogPreview.tsx
│   │   ├── GenerationForm.tsx
│   │   ├── GenerationHistory.tsx
│   │   ├── ImportDatasetModal.tsx
│   │   ├── InlineAudioPlayer.tsx
│   │   ├── LayoutWrapper.tsx
│   │   ├── Navigation.tsx
│   │   ├── PresetVoiceManager.tsx
│   │   ├── PresetVoiceSelector.tsx
│   │   ├── ProjectSelector.tsx
│   │   ├── QuickGenerateHistory.tsx
│   │   ├── QuickGenerateNavigation.tsx
│   │   ├── SessionManager.tsx
│   │   ├── SpeakerList.tsx
│   │   ├── SpeakerRoleManager.tsx
│   │   ├── SpeakerSelector.tsx
│   │   ├── TextEditor.tsx
│   │   ├── TrainingForm.tsx
│   │   ├── TrainingHistory.tsx
│   │   ├── TrainingMetricsChart.tsx
│   │   ├── VoicePreview.tsx
│   │   └── VoiceRecorder.tsx
│   ├── eslint.config.mjs
│   ├── lib/
│   │   ├── DatasetContext.tsx
│   │   ├── DatasetItemsContext.tsx
│   │   ├── GenerationContext.tsx
│   │   ├── GlobalTaskContext.tsx
│   │   ├── PresetVoiceContext.tsx
│   │   ├── ProjectContext.tsx
│   │   ├── SessionContext.tsx
│   │   ├── SpeakerRoleContext.tsx
│   │   ├── TrainingContext.tsx
│   │   ├── api.ts
│   │   ├── audioUtils.ts
│   │   └── i18n/
│   │       ├── LanguageContext.tsx
│   │       ├── config.ts
│   │       └── locales/
│   │           ├── en.json
│   │           └── zh.json
│   ├── next.config.ts
│   ├── package.json
│   ├── postcss.config.mjs
│   ├── public/
│   │   ├── icon-preview.html
│   │   ├── icon-rect-preview.html
│   │   └── site.webmanifest
│   ├── scripts/
│   │   └── generate-version.js
│   ├── tsconfig.json
│   └── types/
│       ├── dialog.ts
│       ├── generation.ts
│       ├── preset.ts
│       ├── project.ts
│       ├── quickGenerate.ts
│       ├── speaker.ts
│       ├── task.ts
│       └── training.ts
├── pyproject.toml
├── rebuild.sh
├── test_generation_offloading.py
├── test_offloading.py
├── tests/
│   ├── test_logging.py
│   ├── test_lora_network.py
│   └── test_training_service.py
├── tokenizer/
│   ├── tokenizer.json
│   ├── tokenizer_config.json
│   └── vocab.json
├── util/
│   ├── LOGGING_README.md
│   ├── __init__.py
│   ├── float8_scale.py
│   ├── logger.py
│   ├── logger_examples.py
│   ├── model_utils.py
│   ├── rand_init.py
│   ├── safetensors_util.py
│   └── vibevoice_norm.py
└── vibevoice/
    ├── __init__.py
    ├── configs/
    │   ├── qwen2.5_1.5b_64k.json
    │   └── qwen2.5_7b_32k.json
    ├── generation/
    │   ├── __init__.py
    │   └── visitor.py
    ├── lora/
    │   ├── __init__.py
    │   └── lora_network.py
    ├── modular/
    │   ├── __init__.py
    │   ├── adaptive_offload.py
    │   ├── custom_offloading_utils.py
    │   ├── modeling_vibevoice.py
    │   ├── modeling_vibevoice_inference.py
    │   ├── modular_vibevoice_diffusion_head.py
    │   ├── modular_vibevoice_qwen.py
    │   ├── modular_vibevoice_text_tokenizer.py
    │   ├── modular_vibevoice_tokenizer.py
    │   └── streamer.py
    ├── processor/
    │   ├── __init__.py
    │   ├── vibevoice_processor.py
    │   └── vibevoice_tokenizer_processor.py
    ├── schedule/
    │   ├── __init__.py
    │   ├── dpm_solver.py
    │   └── timestep_sampler.py
    ├── scripts/
    │   ├── __init__.py
    │   └── convert_nnscaler_checkpoint_to_transformers.py
    └── training/
        ├── dataset.py
        ├── fake_trainer.py
        ├── summary_visitor.py
        ├── trainer.py
        └── trainer_visitor.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
# This .dockerignore is mostly irrelevant since we don't COPY from local context
# The Dockerfile clones everything from GitHub
# However, we keep this to prevent accidentally copying if someone modifies the Dockerfile

# Everything - we clone from GitHub instead
*

# Exception: Allow Dockerfile itself for reference (not actually copied in our Dockerfile)
!Dockerfile


================================================
FILE: .gitignore
================================================
.claude
CLAUDE.md
.vscode/
uv.lock
media/


demo/example/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# UV
#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#uv.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
#   in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
#  you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
#  refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
models
*.txt
*.pt
*.swp
*.safetensors
outputs/

# Backend
backend/.env
backend/uploads/
backend/__pycache__/
backend/**/__pycache__/
!backend/models/

!frontend/lib/

# Workspace
workspace/
outputs_offloading/
demo/datasets/
tensorboard_logs/
nohup.out

docs/images/vibevoice_architecture_1.svg


================================================
FILE: CHANGELOG.md
================================================
# Changelog

All notable changes to VibeVoice will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v2.2.0] - 2026-01-09

### Added

- **Quick Generation**: Simplified voice generation workflow without project context
  - One-click generation with preset voice selection
  - Support for multiple voice prompt files
  - Multiple prompt voices displayed in generation history detail view
  - Task indicator for quick generation status
  - Per-item progress tracking for each generating voice

- **Navigation Enhancement**: Click logo to return to home page

### Fixed

- Task indicator display bugs in quick generation
- Card status not updating in time
- Styling consistency for deleting generation history

---

## [v2.1.0] - 2025-12-28

### Added

- **Narration Mode Editor**: New editing mode for single-speaker narration content
  - Support for changing narrator
  - Plain text editing without speaker prefixes

- **Preset Voices Management**: Manage preset voice samples for quick speaker creation

- **Auto Version Generation**: Frontend version automatically generated from git

### Fixed

- Offloading config save issue during inference
- Offloading config error
- Duplicated speaker ID issue
- Missing tags in Docker repository
- Dockerfile dependency errors

---

## [v2.0.0] - 2025-12-19

### Added

- **Fine-Tuning Support**: Full LoRA training workflow with real-time metrics
  - Training page with live progress bars and configuration options
  - Training metrics charts (Loss/LR/Timing) with 5-second auto-refresh
  - Support for layer offloading presets (Balanced/Aggressive/Extreme)
  - Gradient accumulation steps and checkpoint saving per epoch
  - TensorBoard metrics reader for training visualization

- **Dataset Management**: Complete dataset CRUD operations
  - Dataset list and detail pages with pagination
  - Import/Export functionality for datasets
  - JSONL format for efficient line-by-line operations
  - Scripts for generating datasets from Mozilla Common Voice and KeSpeech

- **Multi-Generation**: Batch generation with different random seeds
  - Generate 2-20 audio variations in a single request
  - Per-item progress tracking with individual audio players
  - Expandable history view with aggregate statistics

- **LoRA Inference**: Apply trained LoRA models during voice generation
  - Select LoRA model from training output directory
  - Configurable LoRA weight (0-1]

- **Unified Task API**: Single endpoint for checking any running task (inference or training)

- **Preset Voice Feature**: Quick speaker creation from preset voice samples

- **Audio Denoising**: Scripts for audio denoising with DeepFilter

- **Dataset Processing Scripts**:
  - Script for ASR-SCCantDuSC (Scripted Chinese Cantonese Daily-use Speech Corpus)
  - Script for Mozilla Common Voice datasets

### Changed

- Improved training completion UI with better status display
- Enhanced training history list to display all information regardless of success/failure
- Better estimated training time calculation
- Increased file upload limits (500MB configurable)
- Project-scoped current generation and training API endpoints

### Fixed

- Training metadata update error
- Generated voices having same name in batch generation
- Seeds reset issue with multi-generation
- CUDA resource cleanup when training finishes or fails
- Invalid audio and voice_prompts field values in datasets.jsonl
- Delete training history validation
- OOM error handling with specific error messages
- Various npm build errors and UI style issues

## [v1.0.0] - 2025-11-14

### Added

- **Core TTS Model**: AR + diffusion architecture for multi-speaker text-to-speech synthesis
  - Float8 inference support for optimized performance
  - Mono model file inference support

- **Full-Stack Web Application**:
  - Next.js frontend with responsive UI
  - Flask backend with RESTful API
  - Static export for production deployment

- **Project Management**: Create and manage multiple voice projects

- **Speaker Voice Management**:
  - Upload and manage speaker voices
  - Voice recording directly in browser
  - Auto-assigned speaker names ("Speaker 1", etc.)

- **Dialog Editor**: 4-panel layout for creating and editing dialog sessions
  - Clickable session names in generation history
  - Session navigation to voice editor

- **Voice Generation**:
  - Live progress monitoring
  - Generation history with pagination
  - Audio playback and download
  - Task icon notification in navigation

- **Layer Offloading**: VRAM optimization for GPU memory constraints
  - Configurable number of layers for GPU/CPU
  - Async transfers with ThreadPoolExecutor
  - Smart cache clearing for performance

- **Internationalization (i18n)**: Full bilingual support
  - English and Chinese languages
  - Auto-detection via browser settings
  - Persistence in localStorage

- **Docker Support**:
  - Dockerfile for containerized deployment
  - GPU support with nvidia-docker

- **Documentation**:
  - Comprehensive API documentation
  - Architecture diagrams with Mermaid
  - Offloading configuration guide

### Fixed

- Invisible text color in browser dark theme
- Frontend project selection issues
- Refresh page navigation bugs
- Layout issues in various components
- Scripts not starting with ID 1
- Various typos and documentation errors

[v2.2.0]: https://github.com/zhao-kun/vibevoice/compare/v2.1.0...v2.2.0
[v2.1.0]: https://github.com/zhao-kun/vibevoice/compare/v2.0.0...v2.1.0
[v2.0.0]: https://github.com/zhao-kun/vibevoice/compare/v1.0.0...v2.0.0
[v1.0.0]: https://github.com/zhao-kun/vibevoice/releases/tag/v1.0.0


================================================
FILE: CHANGELOG_zh.md
================================================
# 更新日志

VibeVoice 的所有重要更改都将记录在此文件中。

本文档格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/),
并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。

## [v2.2.0] - 2026-01-09

### 新增

- **快速生成**: 无需项目上下文的简化语音生成流程
  - 一键生成,支持预设音色选择
  - 支持多个音色提示文件
  - 生成历史详情中显示多个提示音色
  - 快速生成任务状态指示器
  - 每个生成语音的独立进度跟踪

- **导航增强**: 点击 Logo 返回首页

### 修复

- 快速生成中的任务指示器显示问题
- 卡片状态未及时更新
- 删除生成历史的样式一致性问题

---

## [v2.1.0] - 2025-12-28

### 新增

- **旁白模式编辑器**: 单人朗读内容的新编辑模式
  - 支持切换朗读者
  - 无需说话人前缀的纯文本编辑

- **预设音色管理**: 管理预设音色样本,快速创建说话人

- **自动版本生成**: 前端版本号自动从 git 生成

### 修复

- 推理过程中的 Offloading 配置保存问题
- Offloading 配置错误
- 说话人 ID 重复问题
- Docker 仓库缺少标签
- Dockerfile 依赖错误

---

## [v2.0.0] - 2025-12-19

### 新增

- **微调支持**: 完整的 LoRA 训练工作流,支持实时指标监控
  - 训练页面,带有实时进度条和配置选项
  - 训练指标图表(Loss/LR/Timing),5 秒自动刷新
  - 支持层卸载预设(均衡/激进/极限)
  - 梯度累积步数和每轮检查点保存
  - TensorBoard 指标读取器用于训练可视化

- **数据集管理**: 完整的数据集 CRUD 操作
  - 数据集列表和详情页面,支持分页
  - 数据集导入/导出功能
  - JSONL 格式,高效逐行操作
  - 从 Mozilla Common Voice 和 KeSpeech 生成数据集的脚本

- **批量生成**: 使用不同随机种子批量生成
  - 单次请求生成 2-20 个音频变体
  - 每个项目的进度跟踪和独立音频播放器
  - 可展开的历史视图,显示汇总统计

- **LoRA 推理**: 在语音生成时应用训练好的 LoRA 模型
  - 从训练输出目录选择 LoRA 模型
  - 可配置的 LoRA 权重 (0-1]

- **统一任务 API**: 单一接口检查任何运行中的任务(推理或训练)

- **预设音色功能**: 从预设音色样本快速创建说话人

- **音频降噪**: 使用 DeepFilter 的音频降噪脚本

- **数据集处理脚本**:
  - ASR-SCCantDuSC(粤语日常用语语音语料库)处理脚本
  - Mozilla Common Voice 数据集处理脚本

### 变更

- 改进训练完成 UI,更好的状态显示
- 增强训练历史列表,无论成功或失败都显示所有信息
- 更准确的预估训练时间计算
- 增加文件上传限制(500MB,可配置)
- 项目范围的当前生成和训练 API 端点

### 修复

- 训练元数据更新错误
- 批量生成中生成的语音名称相同
- 批量生成时种子重置问题
- 训练完成或失败时的 CUDA 资源清理
- datasets.jsonl 中无效的音频和 voice_prompts 字段值
- 删除训练历史的验证问题
- OOM 错误处理,提供具体错误信息
- 各种 npm 构建错误和 UI 样式问题

## [v1.0.0] - 2025-11-14

### 新增

- **核心 TTS 模型**: AR + 扩散架构的多说话人文本转语音合成
  - Float8 推理支持,优化性能
  - 单体模型文件推理支持

- **全栈 Web 应用**:
  - Next.js 前端,响应式 UI
  - Flask 后端,RESTful API
  - 静态导出用于生产部署

- **项目管理**: 创建和管理多个语音项目

- **说话人音色管理**:
  - 上传和管理说话人音色
  - 浏览器内直接录音
  - 自动分配说话人名称("说话人 1" 等)

- **对话编辑器**: 四面板布局,创建和编辑对话会话
  - 生成历史中可点击的会话名称
  - 会话导航到语音编辑器

- **语音生成**:
  - 实时进度监控
  - 生成历史,支持分页
  - 音频播放和下载
  - 导航栏任务图标通知

- **层卸载**: GPU 显存优化
  - 可配置的 GPU/CPU 层数
  - ThreadPoolExecutor 异步传输
  - 智能缓存清理以提升性能

- **国际化 (i18n)**: 完整的双语支持
  - 英语和中文
  - 通过浏览器设置自动检测
  - localStorage 持久化

- **Docker 支持**:
  - Dockerfile 容器化部署
  - nvidia-docker GPU 支持

- **文档**:
  - 完整的 API 文档
  - Mermaid 架构图
  - 层卸载配置指南

### 修复

- 浏览器深色主题中不可见的文字颜色
- 前端项目选择问题
- 页面刷新导航问题
- 各组件布局问题
- 脚本未从 ID 1 开始
- 各种拼写错误和文档错误

[v2.2.0]: https://github.com/zhao-kun/vibevoice/compare/v2.1.0...v2.2.0
[v2.1.0]: https://github.com/zhao-kun/vibevoice/compare/v2.0.0...v2.1.0
[v2.0.0]: https://github.com/zhao-kun/vibevoice/compare/v1.0.0...v2.0.0
[v1.0.0]: https://github.com/zhao-kun/vibevoice/releases/tag/v1.0.0


================================================
FILE: Dockerfile
================================================
# Multi-stage Dockerfile for VibeVoice
# This Dockerfile is completely self-contained and requires no local source code
# All source code is cloned from GitHub during build

ARG BASE_IMAGE=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
ARG WORKDIR=/workspace/zhao-kun/vibevoice
ARG GITHUB_REPO=https://github.com/zhao-kun/vibevoice.git
ARG GITHUB_BRANCH=main

#############################################
# Stage 1: Download model from HuggingFace
#############################################
FROM python:3.10-slim AS model-downloader

# Install huggingface-cli
RUN pip install --no-cache-dir huggingface-hub[cli]

# Set working directory
WORKDIR /tmp/models

# Download model (float8_e4m3fn only) using huggingface-cli
RUN hf download zhaokun/vibevoice-large \
    vibevoice7b_float8_e4m3fn.safetensors \
    --local-dir /tmp/models/VibeVoice-large

RUN hf download zhaokun/vibevoice-large \
    vibevoice7b_bf16.safetensors \
    --local-dir /tmp/models/VibeVoice-large

#############################################
# Stage 2: Clone Repository and Build Frontend
#############################################
FROM node:20-alpine AS source-and-frontend

ARG GITHUB_REPO
ARG GITHUB_BRANCH
ARG CACHE_BUST=unknown

# Install git
RUN apk add --no-cache git

# Cache bust: Force rebuild from here when CACHE_BUST changes
RUN echo "Cache bust: ${CACHE_BUST}"

# Clone repository (shallow clone, then unshallow to get full history for git describe)
WORKDIR /build
RUN git clone --depth 1 --branch ${GITHUB_BRANCH} ${GITHUB_REPO} vibevoice && \
    cd /build/vibevoice && \
    git fetch --unshallow origin && \
    git fetch --tags origin && \
    git checkout main && \
    git rev-parse HEAD > backend/version.txt

# Build frontend
WORKDIR /build/vibevoice/frontend

# Install dependencies
RUN npm ci

# Build frontend
RUN npm run build

# Verify build output
RUN ls -la out/

#############################################
# Stage 3: Create Python Virtual Environment
#############################################
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS python-builder

ARG WORKDIR=/workspace/zhao-kun/vibevoice
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update --allow-releaseinfo-change --yes && \
    apt-get upgrade --yes && \
    apt install --yes --no-install-recommends \
    bash \
    libgl1 \
    software-properties-common \
    ffmpeg \
    zip \
    unzip \
    iputils-ping \
    libtcmalloc-minimal4 \
    net-tools \
    vim \
    p7zip-full && \
    rm -rf /var/lib/apt/lists/*

RUN add-apt-repository ppa:deadsnakes/ppa

RUN apt-get update --allow-releaseinfo-change --yes && \
    apt install python3.10-dev python3.10-venv python3-pip \
    build-essential git curl -y --no-install-recommends && \
    ln -s /usr/bin/python3.10 /usr/bin/python && \
    rm /usr/bin/python3 && \
    ln -s /usr/bin/python3.10 /usr/bin/python3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen

# Create working directory at EXACT runtime path
RUN mkdir -p ${WORKDIR}
WORKDIR ${WORKDIR}

# Copy source code from frontend stage
COPY --from=source-and-frontend /build/vibevoice .

# Create virtual environment at runtime path (critical for absolute paths in venv)
RUN python3.10 -m venv ${WORKDIR}/venv

# Upgrade pip and install dependencies
RUN ${WORKDIR}/venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
    ${WORKDIR}/venv/bin/pip install --no-cache-dir .

RUN rm -rf ${WORKDIR}/frontend

#############################################
# Stage 4: Final Image
#############################################
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder

ARG WORKDIR=/workspace/zhao-kun/vibevoice
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

RUN apt-get update --allow-releaseinfo-change --yes && \
    apt-get upgrade --yes && \
    apt install --yes --no-install-recommends \
    bash \
    libgl1 \
    software-properties-common \
    ffmpeg \
    zip \
    unzip \
    iputils-ping \
    libtcmalloc-minimal4 \
    net-tools \
    vim \
    p7zip-full && \
    rm -rf /var/lib/apt/lists/*

RUN add-apt-repository ppa:deadsnakes/ppa

RUN apt-get update --allow-releaseinfo-change --yes && \
    apt install python3.10-dev python3.10-venv python3-pip \
    build-essential git curl -y --no-install-recommends && \
    ln -s /usr/bin/python3.10 /usr/bin/python && \
    rm /usr/bin/python3 && \
    ln -s /usr/bin/python3.10 /usr/bin/python3 && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen


# Copy downloaded model from model-downloader stage
RUN mkdir -p /tmp/models/
COPY --from=model-downloader /tmp/models/VibeVoice-large /tmp/models

# Create working directory at EXACT same path as build stage
RUN mkdir -p ${WORKDIR}
WORKDIR ${WORKDIR}
# Copy virtual environment from python-builder stage (with preserved absolute paths)
COPY --from=python-builder ${WORKDIR} .

RUN mkdir -p ${WORKDIR}/models/vibevoice/  && ln -s /tmp/models/vibevoice7b_float8_e4m3fn.safetensors ${WORKDIR}/models/vibevoice/
RUN mkdir -p ${WORKDIR}/models/vibevoice/  && ln -s /tmp/models/vibevoice7b_bf16.safetensors ${WORKDIR}/models/vibevoice/

# Copy frontend build from source-and-frontend stage
RUN mkdir -p ${WORKDIR}/backend/dist
COPY --from=source-and-frontend /build/vibevoice/frontend/out ${WORKDIR}/backend/dist

# Create workspace directory for runtime data
RUN mkdir -p ${WORKDIR}/workspace

# Expose port
EXPOSE 9527

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:9527/health || exit 1

# Use venv python explicitly (critical - do not rely on PATH)
CMD ["/workspace/zhao-kun/vibevoice/venv/bin/python", "backend/run.py"]



================================================
FILE: README.md
================================================
# VibeVoiceFusion

<div align="center">

<img src="frontend/public/icon-rect-pulse.svg" alt="VibeVoiceFusion Logo" width="120"/>

**A Complete Web Application for Multi-Speaker Voice Generation**

*Built on Microsoft's VibeVoice Model*

[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
[![Python](https://img.shields.io/badge/python-3.9+-blue.svg?logo=python)](https://www.python.org/)
[![TypeScript](https://img.shields.io/badge/typescript-5.0+-blue.svg?logo=typescript)](https://www.typescriptlang.org/)
[![Docker](https://img.shields.io/badge/docker-ready-brightgreen.svg)](Dockerfile)
[![Docker Hub](https://img.shields.io/badge/Docker%20Hub-vibevoicefusion-blue?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)
[![Docker Pulls](https://img.shields.io/docker/pulls/zhaokundev/vibevoicefusion?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)
[![Image Size](https://img.shields.io/docker/image-size/zhaokundev/vibevoicefusion/latest?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)

[English](README.md) | [简体中文](README_zh.md)

[Features](#features) • [Demo Samples](#demo-samples) • [Get Started](#get-started) • [Documentation](#documentation) • [Community](#community) • [Contributing](#contributing)

</div>

---

## Overview

### Purpose

VibeVoiceFusion is a **web application** for generating high-quality, multi-speaker synthetic speech with voice cloning capabilities. Built on Microsoft's VibeVoice model (AR + diffusion architecture), this project provides a complete full-stack solution with voice generation, LoRA fine-tuning, dataset management, batch generation, and advanced VRAM optimization features.

**Key Goals:**
- Provide a user-friendly interface for voice generation without requiring coding knowledge
- Enable efficient multi-speaker dialog synthesis with distinct voice characteristics
- Support LoRA fine-tuning for custom voice adaptation and style transfer
- Generate multiple audio variations in batch with different random seeds
- Optimize memory usage for consumer-grade GPUs (10GB+ VRAM)
- Support bilingual workflows (English/Chinese)
- Offer both web UI and CLI interfaces for different use cases

<div align="center">
  <a href="https://youtu.be/J9pmcOBWN4c" target="_blank">
    <img src="docs/images/VibevoiceFusion.png" alt="Video Introduction" width="700"/>
  </a>
</div>

### Principle

VibeVoice combines **autoregressive (AR)** and **diffusion** techniques for text-to-speech synthesis:

1. **Text Processing**: Input text is tokenized and processed through a Qwen-based language model backbone
2. **Voice Encoding**: Reference voice samples are encoded into acoustic and semantic embeddings
3. **AR Generation**: The model autoregressively generates speech tokens conditioned on text and voice embeddings
4. **Diffusion Refinement**: A DPM-Solver-based diffusion head converts tokens to high-quality audio waveforms
5. **Voice Cloning**: The unified processor preserves speaker characteristics from reference audio samples

**Technical Highlights:**
- **Model Architecture**: Qwen backbone + VAE acoustic tokenizer + semantic encoder + diffusion head
- **Quantization**: Float8 (FP8 E4M3FN) support for ~50% VRAM reduction with minimal quality loss
- **Layer Offloading**: Dynamic CPU/GPU memory management for running on limited VRAM
- **Attention Mechanism**: PyTorch native SDPA for maximum compatibility
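
For orientation, the five stages compress into a few lines of pseudocode. This is an illustrative sketch only; the names are assumptions, not the project's actual API (the real entry point is `vibevoice/modular/modeling_vibevoice_inference.py`):

```python
def synthesize(text, reference_wavs, model, processor):
    # 1-2. Tokenize the text and encode reference voices into embeddings
    inputs = processor(text=text, voice_samples=reference_wavs)
    # 3. Autoregressively generate speech tokens conditioned on both
    speech_tokens = model.generate(**inputs)
    # 4. The DPM-Solver-based diffusion head renders tokens to a waveform
    waveform = model.diffusion_head.decode(speech_tokens)
    # 5. Speaker identity is carried through the voice conditioning
    return waveform
```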

### Features

#### Quick Generation

- **One-Click Generation**: Generate voice without creating projects, speakers, or sessions
- **Voice Source Options**:
  - Upload custom audio files (WAV, MP3, M4A, FLAC, WebM) - up to 4 files
  - Select from preset voice samples with language/gender filters
- **Auto Mode Detection**: Automatically detects dialogue vs narration format
- **Multi-Voice Support**: Use up to 4 voice prompts for generation
- **Generation History**: Persistent history with expandable details, bulk delete
- **Per-Item Progress**: Real-time progress tracking for each generating voice

#### Complete Web Application

- **Project Management**: Organize voice generation projects with metadata and descriptions
- **Speaker/Voice Management**:
  - Upload and manage reference voice samples (WAV, MP3, M4A, FLAC, WebM)
  - Audio preview with playback controls
  - Voice file replacement with automatic cache-busting
  - Audio trimming functionality
- **Dialog Editor**:
  - Visual editor with drag-and-drop line reordering
  - Text editor mode for bulk editing
  - Support for multi-speaker dialogs (4 or more speakers)
  - **Narration mode** for single-speaker content (audiobooks, articles, podcasts)
  - Real-time preview and validation
- **Generation System**:
  - Queue-based task management (prevents GPU conflicts)
  - Real-time progress monitoring with live updates
  - Configurable parameters (CFG scale, random seed, model precision)
  - **Multi-Generation**: Generate 2-20 audio variations in a single batch with different seeds
  - LoRA model support with configurable weight (0-1]
  - Generation history with filtering, sorting, and pagination
  - Audio playback and download for completed generations

#### LoRA Fine-Tuning

- **Dataset Management**:
  - Create and manage training datasets with audio/text pairs
  - Import datasets from ZIP archives or local folders
  - JSONL format for efficient data handling
  - Pagination and search for large datasets
  - Export datasets for backup or sharing

- **Training System**:
  - LoRA (Low-Rank Adaptation) fine-tuning for voice customization
  - Configurable training parameters (epochs, learning rate, LoRA rank, batch size)
  - Layer offloading support for training on consumer GPUs
  - Real-time training progress with tqdm-style progress bar
  - Live training metrics charts (Loss, Learning Rate, Timing)
  - TensorBoard integration for detailed metrics
  - Training history with status tracking (Prepare, Training, Completed, Failed)
  - OOM detection with helpful suggestions for recovery

- **LoRA Model Usage**:
  - Select trained LoRA models during voice generation
  - Configurable LoRA weight for blending with base model
  - Multiple LoRA files per training job (epoch checkpoints + final)
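
Blending follows the standard LoRA formulation; a minimal sketch under conventional shapes and scaling (the project's actual network code lives in `vibevoice/lora/lora_network.py`):

```python
import torch

def merge_lora(W, A, B, alpha, rank, weight=1.0):
    """Fold a trained LoRA delta into a base weight.

    W: (out, in) base weight; A: (rank, in); B: (out, rank).
    `weight` is the user-facing blend factor in (0, 1].
    """
    return W + weight * (alpha / rank) * (B @ A)
```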

#### VRAM Optimization

- **Layer Offloading**: Move transformer layers between CPU/GPU to reduce VRAM requirements (see the sketch after this list)
  - **Balanced** (12 GPU / 16 CPU layers): ~5GB VRAM savings, ~2.0x slower - RTX 3060 16GB, 4070
  - **Aggressive** (8 GPU / 20 CPU layers): ~6GB VRAM savings, ~2.5x slower - RTX 3060 12GB, 4060
  - **Extreme** (4 GPU / 24 CPU layers): ~7GB VRAM savings, ~3.5x slower - RTX 3060 10GB (minimum)
- **Float8 Quantization**: Reduce model size from ~14GB to ~7GB with comparable quality (supported on RTX 40-series and newer GPUs)
- **Adaptive Configuration**: Automatic VRAM estimation and optimal layer distribution
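
The layer-offloading idea reduces to keeping most layers parked on the CPU and moving each one onto the GPU only for its forward pass. A minimal sketch, assuming plain `nn.Module` layers (the real implementation in `vibevoice/modular/custom_offloading_utils.py` additionally overlaps transfers with a ThreadPoolExecutor):

```python
def forward_with_offload(layers, hidden, gpu_layers=12, device="cuda"):
    for i, layer in enumerate(layers):
        offloaded = i >= gpu_layers      # layers beyond the budget live on CPU
        if offloaded:
            layer.to(device)             # bring in just for this step
        hidden = layer(hidden)
        if offloaded:
            layer.to("cpu")              # park again to free VRAM
    return hidden
```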

**VRAM Requirements:**

| Configuration | GPU Layers | VRAM Usage | Speed | Target Hardware |
|--------------|-----------|------------|-------|-----------------|
| No offloading | 28 | 11-14GB | 1.0x | RTX 4090, A100, 3090 |
| Balanced | 12 | 6-8GB | 0.70x | RTX 4070, 3080 16GB |
| Aggressive | 8 | 5-7GB | 0.55x | RTX 3060 12GB |
| Extreme | 4 | 4-5GB | 0.40x | RTX 3080 10GB |

> Float8 quantization is only supported on RTX 40-series and 50-series NVIDIA cards.
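
A minimal sketch of the storage-versus-compute trade behind float8 (assumed pattern, requiring a PyTorch build with `torch.float8_e4m3fn`; the project's actual scaling helpers live in `util/float8_scale.py`):

```python
import torch

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
w_fp8 = w.to(torch.float8_e4m3fn)       # stored weight: half the bytes of bf16
x = torch.randn(1, 4096, dtype=torch.bfloat16)
y = x @ w_fp8.to(torch.bfloat16).T      # upcast per use, so compute stays bf16
```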

#### Internationalization

- **Full Bilingual Support**: Complete English/Chinese UI with 360+ translation keys
- **Auto-Detection**: Automatically detects browser language on first visit
- **Persistent Preference**: Language selection saved in localStorage
- **Backend i18n**: API error messages and responses translated to user's language

#### Docker Deployment

- **Multi-Stage Build**: Optimized Dockerfile with frontend build, Python venv, and model download
- **Self-Contained**: Clones from GitHub and builds entirely from source
- **HuggingFace Integration**: Automatically downloads model file (~3-4GB) during build

#### Additional Features

- **Responsive Design**: Mobile-friendly interface with Tailwind CSS
- **Real-Time Updates**: WebSocket-free polling with smart update intervals (2s active, 60s background); a polling sketch follows this list
- **Audio Cache-Busting**: Ensures audio updates are immediately reflected
- **Toast Notifications**: User-friendly feedback for all operations
- **Dark Mode Ready**: Modern UI with consistent styling
- **Accessibility**: Keyboard navigation and ARIA labels
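
A sketch of that polling loop from the client side (the route shown is hypothetical; see `docs/APIs.md` for the actual endpoints):

```python
import time
import requests

BASE = "http://localhost:9527/api/v1"

def wait_for_task(task_id, active=True, active_s=2, background_s=60):
    """Poll a task until it finishes (hypothetical route, illustrative only)."""
    while True:
        status = requests.get(f"{BASE}/tasks/{task_id}").json()
        if status.get("status") in ("completed", "failed"):
            return status
        time.sleep(active_s if active else background_s)
```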

---

## Demo Samples

Listen to voice generation samples created with VibeVoiceFusion. Click the links below to download and play:

### Single Speaker

**🎧 [Pandora's Box Story (BFloat16 Model)](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/1p_pandora_box_bf16.wav)**

*Generated with bfloat16 precision model - Full quality, 14GB VRAM*

**🎧 [Pandora's Box Story (Float8 Model)](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/1p_pandora_box_float8_e4m3fn.wav)**

*Generated with float8 quantization - Optimized for 7GB VRAM with comparable quality*

### Multi-Speaker (3 Speakers)

**🎭 [东邪西毒 - 西游版 (Journey to the West Version)](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/东邪西毒-西游版.wav)**

*Multi-speaker dialog with distinct voice characteristics for each character*

---

## Get Started

### Prerequisites

- **Python**: 3.9 or higher
- **Node.js**: 16.x or higher (for frontend development)
- **CUDA**: Compatible GPU with CUDA support (recommended)
- **VRAM**: Minimum 6GB for extreme offloading, 14GB recommended for best performance
- **Docker**: Optional, for containerized deployment

### Installation

#### Option 1: Docker (Recommended for Production)

Build the Docker image:
```bash
# Clone the repository
git clone https://github.com/zhao-kun/vibevoicefusion.git
cd vibevoicefusion
# Build the Docker image
docker compose build vibevoice
```

After the build succeeds, run:

```bash
docker run -d \
  --name vibevoicefusion \
  --gpus all \
  -p 9527:9527 \
  -v $(pwd)/workspace:/workspace/zhao-kun/vibevoice/workspace \
  zhaokundev/vibevoicefusion:latest
```

Access the application at `http://localhost:9527`

**The Docker image is also available on Docker Hub; you can launch VibeVoiceFusion directly with the following commands.**

```bash
docker pull zhaokundev/vibevoicefusion
docker run -d \
  --name vibevoicefusion \
  --gpus all \
  -p 9527:9527 \
  -v $(pwd)/workspace:/workspace/zhao-kun/vibevoice/workspace \
  zhaokundev/vibevoicefusion:latest
```

**Build Time**: 18-28 minutes | **Image Size**: ~12-15GB

#### Option 2: Manual Installation

**1. Install Backend Dependencies**

```bash
# Clone the repository
git clone https://github.com/zhao-kun/vibevoice.git
cd vibevoice

# Install Python package
pip install -e .
```

**2. Download Pre-trained Model**

Download from HuggingFace (choose one):
- **Float8 (Recommended)**: [vibevoice7b_float8_e4m3fn.safetensors](https://huggingface.co/zhaokun/vibevoice-large/blob/main/vibevoice7b_float8_e4m3fn.safetensors) (~7GB) (supported on RTX 40-series and newer GPUs)
- **BFloat16 (Full Precision)**: [vibevoice7b_bf16.safetensors](https://huggingface.co/zhaokun/vibevoice-large/blob/main/vibevoice7b_bf16.safetensors) (~14GB)
- **Config**: [config.json](https://huggingface.co/zhaokun/vibevoice-large/blob/main/config.json)

Place files in `./models/vibevoice/`

**3. Install Frontend Dependencies** (for development)

```bash
cd frontend
npm install
```

**4. Build Frontend** (for production)

```bash
cd frontend
npm run build
cp -r out/* ../backend/dist/
```

### Usage

#### Web Application (Recommended)

**Production Mode** (single server):
```bash
# Start backend server (serves both API and frontend)
python backend/run.py

# Access at http://localhost:9527
```

**Development Mode** (separate servers):
```bash
# Terminal 1: Start backend API
python backend/run.py  # http://localhost:9527

# Terminal 2: Start frontend dev server
cd frontend
npm run dev  # http://localhost:3000
```

### Quick Generation (No Project Required)

For quick testing without setting up projects:

1. Click **"Quick Generate"** from the home page
2. **Select Voice Source**:
   - **Upload**: Drag & drop your audio file (supports up to 4 files)
   - **Preset**: Choose from preset voices with language/gender filters
3. **Enter Text**: Type dialogue (`Speaker 1: Hello`) or narration (plain text)
4. **Configure**: Set seed, batch size (2-20), and offloading options
5. **Generate**: Click generate and monitor per-item progress
6. **Download**: Play or download generated audio

**Key Points:**
- Auto-detects dialogue vs narration mode (see the sketch below)
- All speakers use the same voice in dialogue mode
- No LoRA support (base model only)
- History persists across sessions
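
The auto-detection can be as simple as checking for the documented `Speaker N:` prefix; a sketch of the assumed heuristic (illustrative, not the project's actual detector):

```python
import re

SPEAKER_RE = re.compile(r"^Speaker \d+:", re.MULTILINE)

def detect_mode(text: str) -> str:
    """Any 'Speaker N:' line means dialogue mode; otherwise narration."""
    return "dialogue" if SPEAKER_RE.search(text) else "narration"
```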

---

### Complete Workflow Guide

This guide walks you through the complete process of creating multi-speaker voice generation from start to finish.

#### Step 1: Create a Project

Start by creating a new project or selecting an existing one. Projects help organize your voice generation work with metadata and descriptions.

<div align="center">
<img src="docs/images/home-eng.png" alt="Project Management" width="700"/>
<p><i>Create and manage projects from the home page</i></p>
</div>

**Actions:**
- Click "Create New Project" card
- Enter a project name (e.g., "Podcast Episode 1")
- Optionally add a description
- Click "Create Project"

The project will be automatically selected and you'll be navigated to the Speaker Role page.

#### Step 2: Add Speakers and Upload Voice Samples

Upload reference voice samples for each speaker. The system supports various audio formats (WAV, MP3, M4A, FLAC, WebM).

<div align="center">
<img src="docs/images/speaker-role-eng.png" alt="Speaker Management" width="700"/>
<p><i>Upload and manage voice samples for each speaker</i></p>
</div>

**Actions:**
- Click "Add New Speaker" button
- The speaker will be automatically named (e.g., "Speaker 1", "Speaker 2")
- Click "Upload Voice" to select a reference audio file (3-30 seconds recommended)
- Preview the uploaded voice using the audio player
- Repeat for additional speakers (supports 2-4+ speakers)

**Tips:**
- Use clean audio with minimal background noise
- 5-15 seconds of speech is ideal for voice cloning
- Each speaker needs a unique voice sample
- You can replace voice files later by clicking "Change Voice"

#### Step 3: Create and Edit Dialog

Create a dialog session and write the multi-speaker conversation. The dialog editor supports drag-and-drop reordering and real-time preview.

<div align="center">
<img src="docs/images/voice-editor-eng.png" alt="Dialog Editor" width="700"/>
<p><i>Multi-speaker dialog editor with visual and text modes</i></p>
</div>

**Actions:**
- Click "Create New Session" in the session list
- Enter a session name (e.g., "Chapter 1")
- In the dialog editor, add lines for each speaker:
  - Select a speaker from the dropdown
  - Enter the dialog text
  - Click "Add Line" or press Enter
- Reorder lines by dragging the handle icons
- Use "Text Editor" mode for bulk editing
- Click "Save" to persist your changes

**Dialog Format (Text Mode):**
```
Speaker 1: Welcome to our podcast!

Speaker 2: Thanks for having me. It's great to be here.

Speaker 1: Let's dive into today's topic.
```
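
A sketch of how such text splits into (speaker, line) turns (illustrative; not the project's parser):

```python
import re

TURN_RE = re.compile(r"^Speaker (\d+):\s*(.+)$")

def parse_dialog(text: str) -> list[tuple[int, str]]:
    """Split text-mode dialog into (speaker_number, line) pairs."""
    turns = []
    for raw in text.splitlines():
        m = TURN_RE.match(raw.strip())
        if m:
            turns.append((int(m.group(1)), m.group(2).strip()))
    return turns
```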

**Narration Mode:**

For single-speaker content like audiobooks, articles, or podcasts, use **Narration Mode**:

1. When creating a new session, toggle to "Narration" mode
2. Select a narrator voice from your uploaded speakers
3. Enter plain text without `Speaker N:` prefixes
4. Each paragraph will be spoken by the selected narrator

```
This is the first paragraph of your narration.

This is the second paragraph. No speaker formatting needed.

The narrator voice you selected will read all the text.
```

**Features:**
- Visual editor with drag-and-drop
- Text editor for bulk editing
- Real-time preview
- Copy and download functionality
- Format validation
- **Narration mode** for single-speaker content

#### Step 4: Generate Voice

Configure generation parameters and start the voice synthesis process. Monitor real-time progress and manage generation history.

<div align="center">
<img src="docs/images/generate-voice-eng.png" alt="Voice Generation" width="700"/>
<p><i>Generation interface with parameters, live progress, and history</i></p>
</div>

**Actions:**
- Navigate to "Generate Voice" page
- Select a dialog session from the dropdown
- Configure parameters:
  - **Model Type**:
    - `float8_e4m3fn` (recommended): 7GB VRAM, faster loading
    - `bfloat16`: 14GB VRAM, full precision
  - **CFG Scale** (1.0-2.0): Controls generation adherence to text (formula sketched after this list)
    - Lower (1.0-1.3): More natural, varied
    - Higher (1.5-2.0): More controlled, may sound robotic
    - Default: 1.3
  - **Random Seed**: Any positive integer for reproducibility
  - **Offloading** (optional): Enable if VRAM < 14GB
    - **Balanced**: 12 GPU layers, ~5GB savings, 2.0x slower (RTX 3070 12GB, 4070)
    - **Aggressive**: 8 GPU layers, ~6GB savings, 2.5x slower (RTX 3080 12GB)
    - **Extreme**: 4 GPU layers, ~7GB savings, 3.5x slower (minimum 10GB VRAM)
- Click "Start Generation"
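
Under the hood, CFG scale feeds the standard classifier-free guidance blend of the conditional and unconditional diffusion predictions; a sketch (the project's scheduler lives in `vibevoice/schedule/dpm_solver.py`):

```python
def cfg_blend(eps_uncond, eps_cond, cfg_scale=1.3):
    """cfg_scale=1.0 disables guidance; larger values track the text more strictly."""
    return eps_uncond + cfg_scale * (eps_cond - eps_uncond)
```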

**Real-Time Monitoring:**
- Progress bar shows completion percentage
- Phase indicators: Preprocessing → Inferencing → Saving
- Live token generation count
- Estimated time remaining

<div align="center">
<img src="docs/images/generating-voice-eng.png" alt="Voice Generation" width="700"/>
<p><i>Generation interface with parameters, live progress, and history</i></p>
</div>

**Generation History:**
- View all past generations with status (completed, failed, running)
- Filter and sort by date, status, or session
- Play generated audio inline
- Download WAV files
- Delete unwanted generations
- View detailed metrics (tokens, duration, RTF, VRAM usage)

#### Command-Line Interface

For CLI-based generation without the web UI:

```bash
python demo/local_file_inference.py \
    --model_file ./models/vibevoice/vibevoice7b_float8_e4m3fn.safetensors \
    --txt_path demo/text_examples/1p_pandora_box.txt \
    --speaker_names zh-007 \
    --output_dir ./outputs \
    --dtype float8_e4m3fn \
    --cfg_scale 1.3 \
    --seed 42
```

**CLI Arguments:**
- `--model_file`: Path to model `.safetensors` file
- `--config`: Path to `config.json` (optional)
- `--txt_path`: Input text file with speaker-labeled dialog
- `--speaker_names`: Speaker name(s) for voice file mapping
- `--output_dir`: Output directory for generated audio
- `--device`: `cuda`, `mps`, or `cpu` (auto-detected)
- `--dtype`: `float8_e4m3fn` or `bfloat16`
- `--cfg_scale`: Classifier-Free Guidance scale (default: 1.3)
- `--seed`: Random seed for reproducibility

### Configuration

#### Backend Configuration

Environment variables (optional):
```bash
export WORKSPACE_DIR=/path/to/workspace  # Default: ./workspace
export FLASK_DEBUG=False  # Production mode
```

#### Frontend Configuration

Development API URL (`frontend/.env.local`):
```bash
NEXT_PUBLIC_API_URL=http://localhost:9527/api/v1
```

---

## Documentation

### Architecture Overview

```
vibevoice/
├── backend/                 # Flask API server
│   ├── api/                # REST API endpoints
│   │   ├── projects.py     # Project CRUD
│   │   ├── speakers.py     # Speaker management
│   │   ├── dialog_sessions.py  # Dialog CRUD
│   │   ├── generation.py   # Voice generation
│   │   ├── datasets.py     # Dataset management
│   │   └── training.py     # LoRA training
│   ├── services/           # Business logic layer
│   ├── models/             # Data models
│   ├── task_manager/       # Background task queue
│   ├── inference/          # Inference engine
│   ├── training/           # Training engine & state management
│   ├── i18n/              # Backend translations
│   └── dist/              # Frontend static files (production)
├── frontend/               # Next.js web application
│   ├── app/               # Next.js pages
│   │   ├── page.tsx       # Home/Project selector
│   │   ├── quick-generate/ # Quick generation (no project)
│   │   ├── speaker-role/  # Speaker management
│   │   ├── voice-editor/  # Dialog editor
│   │   ├── generate-voice/ # Generation page
│   │   ├── dataset/       # Dataset management
│   │   └── fine-tuning/   # LoRA training page
│   ├── components/        # React components
│   ├── lib/              # Context providers & utilities
│   │   ├── ProjectContext.tsx
│   │   ├── SessionContext.tsx
│   │   ├── SpeakerRoleContext.tsx
│   │   ├── GenerationContext.tsx
│   │   ├── TrainingContext.tsx
│   │   ├── GlobalTaskContext.tsx
│   │   ├── i18n/         # Frontend translations
│   │   └── api.ts        # API client
│   └── types/            # TypeScript type definitions
└── vibevoice/            # Core inference library
    ├── modular/          # Model implementations
    │   ├── custom_offloading_utils.py  # Layer offloading
    │   └── adaptive_offload.py         # Auto VRAM config
    ├── processor/        # Input processing
    └── schedule/         # Diffusion scheduling
```

### API Reference

For complete API documentation including request/response examples, see [docs/APIs.md](docs/APIs.md).

### Workspace Structure

```
workspace/
├── projects.json          # All projects metadata
├── _quick-generate/       # Quick generation storage
│   ├── voices/            # Uploaded voice samples
│   ├── outputs/           # Generated audio files
│   └── history.json       # Generation history
└── {project-id}/
    ├── voices/
    │   ├── speakers.json  # Speaker metadata
    │   └── {uuid}.wav     # Voice files
    ├── scripts/
    │   ├── sessions.json  # Session metadata
    │   └── {uuid}.txt     # Dialog text files
    ├── output/
    │   ├── generation.json  # Generation metadata
    │   └── {request_id}.wav # Generated audio files
    ├── datasets/
    │   ├── datasets.json    # Dataset metadata
    │   └── {dataset-id}/
    │       ├── datasets.jsonl  # Dataset items (one JSON per line)
    │       ├── audio/          # Audio files
    │       └── voice_prompts/  # Voice prompt files
    └── training/
        ├── training_history.json  # Training job metadata
        └── lora_output/
            └── {lora-name}/
                ├── model_epoch_*.safetensors  # Checkpoint files
                └── model_final.safetensors    # Final model
```
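
`datasets.jsonl` keeps one JSON object per line, so items can be appended or streamed without rewriting the whole file. A reading sketch (field names are assumptions inferred from the changelog, not a documented schema):

```python
import json

def iter_dataset(path):
    """Yield dataset items from a JSONL file, one parsed object per line."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                # e.g. {"audio": "...", "text": "...", "voice_prompts": [...]}
                yield json.loads(line)
```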

### Performance Benchmarks

**RTX 4090 (24GB VRAM):**
| Configuration | VRAM | Generation Time | RTF | Quality |
|--------------|------|-----------------|-----|---------|
| BFloat16, No offload | 14GB | 15s (50s audio) | 0.30x | Excellent |
| Float8, No offload | 7GB | 16s (50s audio) | 0.32x | Excellent |

**RTX 3060 12GB:**
| Configuration | VRAM | Generation Time | RTF | Quality |
|--------------|------|-----------------|-----|---------|
| Float8, Balanced | 7GB | 30s (50s audio) | 0.60x | Excellent |
| Float8, Aggressive | 6GB | 40s (50s audio) | 0.80x | Good |

*RTF (Real-Time Factor) < 1.0 means faster than real-time*
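
RTF is generation time divided by audio duration; for the first RTX 4090 row, 15 s / 50 s = 0.30x.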

---

## Community

### Getting Help

- **Issues**: [GitHub Issues](https://github.com/zhao-kun/vibevoice/issues) - Bug reports and feature requests
- **Discussions**: [GitHub Discussions](https://github.com/zhao-kun/vibevoice/discussions) - Questions and community support

### Showcase

Share your projects and experiences:
- **Demo Audio**: Submit your generated samples to the showcase
- **Use Cases**: Share how you're using VibeVoice
- **Improvements**: Contribute optimizations and enhancements

### Responsible AI

**Important**: This project is for **research and development** purposes only.

#### Risks
- **Deepfakes & Impersonation**: Synthetic speech can be misused for fraud or disinformation
- **Voice Cloning Ethics**: Always obtain explicit consent before cloning voices
- **Biases**: Model may inherit biases from training data
- **Unexpected Outputs**: Generated audio may contain artifacts or inaccuracies

#### Guidelines

**DO:**

- Clearly disclose when audio is AI-generated
- Obtain explicit consent for voice cloning
- Use responsibly for legitimate purposes
- Respect privacy and intellectual property
- Follow all applicable laws and regulations

**DO NOT:**

- Create deepfakes or impersonation without consent
- Spread disinformation or misleading content
- Use for fraud, scams, or malicious purposes
- Violate laws or ethical guidelines

**By using this software, you agree to use it ethically and responsibly.**

---

## Contributing

We welcome contributions from the community! Here's how you can help:

### Ways to Contribute

1. **Report Bugs**: Open an issue with detailed reproduction steps
2. **Suggest Features**: Propose new features via GitHub issues
3. **Submit Pull Requests**:
   - Fix bugs
   - Add features
   - Improve documentation
   - Add translations
4. **Improve Documentation**: Help make the project more accessible
5. **Share Use Cases**: Show how you're using VibeVoice

### Testing

```bash
# Backend tests (when available)
pytest tests/

# Frontend tests (when available)
cd frontend
npm test

# Manual testing
# 1. Create project
# 2. Add speakers
# 3. Create dialog
# 4. Generate voice
# 5. Verify output quality
```

---

## License

This project follows the same license terms as the original Microsoft VibeVoice repository. Please refer to the [LICENSE](LICENSE) file for details.

### Third-Party Licenses

- **Frontend**: React, Next.js, Tailwind CSS (MIT License)
- **Backend**: Flask, PyTorch (Various open-source licenses)
- **Model Weights**: Microsoft VibeVoice (subject to Microsoft's terms)

---

## Acknowledgments

- **Microsoft Research**: Original VibeVoice model and architecture
- **ComfyUI**: Float8 casting techniques inspiration
- **kohya-ss/musubi-tuner**: Offloading implementation and LoRA network reference
- **[voicepowered-ai/VibeVoice-finetuning](https://github.com/voicepowered-ai/VibeVoice-finetuning)**: Training dataloader implementation
- **HuggingFace**: Model hosting and distribution
- **Open Source Community**: Libraries and frameworks that made this possible

---

## Citation

If you use this implementation in your research, please cite both this project and the original VibeVoice paper:

```bibtex
@software{vibevoice_webapp_2024,
  title={VibeVoice: Complete Web Application for Multi-Speaker Voice Generation},
  author={Zhao, Kun},
  year={2024},
  url={https://github.com/zhao-kun/vibevoice}
}

@article{vibevoice2024,
  title={VibeVoice: Unified Autoregressive and Diffusion for Speech Generation},
  author={Microsoft Research},
  year={2024}
}
```

---

## Troubleshooting

### CUDA Out of Memory
```bash
# Try Float8 model
--dtype float8_e4m3fn

# Enable layer offloading in web UI
# Or use CLI with manual configuration
```

### Audio Quality Issues
```bash
# Adjust CFG scale (try 1.0 - 2.0)
--cfg_scale 1.5

# Use higher precision model
--dtype bfloat16
```

### Port Already in Use
```bash
# Change port in backend/run.py
app.run(host='0.0.0.0', port=9528)
```

### Frontend Build Errors
```bash
cd frontend
rm -rf node_modules .next
npm install
npm run build
```

---

<div align="center">

**Made by the VibeVoice Community**

[Back to Top](#vibevoicefusion)

</div>


================================================
FILE: README_zh.md
================================================
# VibeVoiceFusion

<div align="center">

<img src="frontend/public/icon-rect-pulse.svg" alt="VibeVoiceFusion Logo" width="120"/>

**完整的多说话人语音生成 Web 应用**

*基于 Microsoft VibeVoice 模型构建*

[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
[![Python](https://img.shields.io/badge/python-3.9+-blue.svg?logo=python)](https://www.python.org/)
[![TypeScript](https://img.shields.io/badge/typescript-5.0+-blue.svg?logo=typescript)](https://www.typescriptlang.org/)
[![Docker](https://img.shields.io/badge/docker-ready-brightgreen.svg)](Dockerfile)
[![Docker Hub](https://img.shields.io/badge/Docker%20Hub-vibevoicefusion-blue?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)
[![Docker Pulls](https://img.shields.io/docker/pulls/zhaokundev/vibevoicefusion?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)
[![Image Size](https://img.shields.io/docker/image-size/zhaokundev/vibevoicefusion/latest?logo=docker)](https://hub.docker.com/r/zhaokundev/vibevoicefusion)

[English](README.md) | [简体中文](README_zh.md)

[功能特性](#功能特性) • [演示样本](#演示样本) • [快速开始](#快速开始) • [文档](#文档) • [社区](#社区) • [贡献](#贡献)

</div>

---

## 概述

### 项目目的

VibeVoiceFusion 是一个**Web 应用**,用于生成高质量、多说话人的合成语音,具备声音克隆功能。基于微软的 VibeVoice 模型(AR + 扩散架构),本项目提供完整的全栈解决方案,包含语音生成、LoRA 微调、数据集管理、批量生成和先进的显存优化功能。

**核心目标:**

- 提供无需编程知识的友好界面进行语音生成
- 支持高效的多说话人对话合成,保持不同说话人的独特声音特征
- 支持 LoRA 微调,实现自定义声音适配和风格迁移
- 批量生成多个音频变体,使用不同的随机种子
- 优化显存使用,支持消费级 GPU(10GB+ 显存)
- 支持双语工作流(英语/中文)
- 提供 Web 界面和命令行界面以适应不同使用场景

<div align="center">
  <a href="https://youtu.be/J9pmcOBWN4c" target="_blank">
    <img src="docs/images/VibevoiceFusion.png" alt="Video Introduction" width="700"/>
  </a>
</div>

### 技术原理

VibeVoice 结合**自回归(AR)**和**扩散**技术进行文本转语音合成:

1. **文本处理**:输入文本经过分词并通过基于 Qwen 的语言模型主干网络处理
2. **声音编码**:参考语音样本被编码为声学和语义嵌入
3. **AR 生成**:模型基于文本和声音嵌入自回归生成语音 token
4. **扩散细化**:基于 DPM-Solver 的扩散头将 token 转换为高质量音频波形
5. **声音克隆**:统一处理器从参考音频样本中保留说话人特征

**技术亮点:**

- **模型架构**:Qwen 主干网络 + VAE 声学分词器 + 语义编码器 + 扩散头
- **量化技术**:Float8 (FP8 E4M3FN) 支持,显存减少约 50%,质量损失极小
- **层卸载**:动态 CPU/GPU 内存管理,可在有限显存上运行
- **注意力机制**:PyTorch 原生 SDPA,最大化兼容性

### Features

#### Quick Generate

- **One-click generation**: Generate speech without creating projects, speakers, or sessions
- **Voice source options**:
  - Upload custom audio files (WAV, MP3, M4A, FLAC, WebM) - up to 4 files
  - Choose from preset voice samples with language/gender filtering
- **Automatic mode detection**: Recognizes dialogue vs. narration format automatically
- **Multi-voice support**: Generate with up to 4 voice prompts
- **Generation history**: Persistent history with expandable details and batch deletion
- **Per-item progress**: Track the progress of each generated item in real time

#### Complete Web Application

- **Project management**: Organize voice generation projects with metadata and descriptions
- **Speaker/voice management**:
  - Upload and manage reference voice samples (WAV, MP3, M4A, FLAC, WebM)
  - Audio preview with playback controls
  - Voice file replacement with automatic cache busting
  - Audio trimming
- **Dialog editor**:
  - Visual editor with drag-and-drop reordering of dialog lines
  - Text editing mode for bulk edits
  - Multi-speaker dialog support (up to 4+ speakers)
  - **Narration mode** for single-speaker content (audiobooks, articles, podcasts)
  - Real-time preview and validation
- **Generation system**:
  - Queue-based task management (prevents GPU conflicts)
  - Real-time progress monitoring with live updates
  - Configurable parameters (CFG scale, random seed, model precision)
  - **Batch generation**: 2-20 audio variants per run, each with a different random seed
  - LoRA model support with configurable weight (0, 1]
  - Generation history with filtering, sorting, and pagination
  - Playback and download of completed generations

#### LoRA Fine-Tuning

- **Dataset management**:
  - Create and manage training datasets of audio/text pairs
  - Import datasets from ZIP archives or local folders
  - JSONL format for efficient data handling
  - Pagination and search for large datasets
  - Export datasets for backup or sharing

- **Training system**:
  - LoRA (Low-Rank Adaptation) fine-tuning for voice customization
  - Configurable training parameters (epochs, learning rate, LoRA rank, batch size)
  - Layer offloading support for training on consumer GPUs
  - Real-time training progress with a tqdm-style progress bar
  - Live training metric charts (loss, learning rate, time)
  - TensorBoard integration for detailed metrics
  - Training history with status tracking (preparing, training, completed, failed)
  - OOM detection with recovery suggestions

- **Using LoRA models**:
  - Select trained LoRA models at voice generation time
  - Configurable LoRA weight for blending with the base model
  - Multiple LoRA files per training run (epoch checkpoints + final model)

#### VRAM Optimization

- **Layer offloading**: Moves transformer layers between CPU and GPU to reduce VRAM requirements
  - **Balanced** (12 GPU / 16 CPU layers): ~5GB VRAM saved, ~2.0x slower - RTX 3060 16GB, 4070
  - **Aggressive** (8 GPU / 20 CPU layers): ~6GB VRAM saved, ~2.5x slower - RTX 3080 12GB, 4060
  - **Extreme** (4 GPU / 24 CPU layers): ~7GB VRAM saved, ~3.5x slower - RTX 3080 10GB (minimum configuration)
- **Float8 quantization**: Reduces model size from ~14GB to ~7GB at comparable quality (supported on RTX 40-series and newer GPUs)
- **Adaptive configuration**: Automatic VRAM estimation and optimal layer allocation

**VRAM requirements:**

| Configuration | GPU Layers | VRAM Usage | Speed | Target Hardware |
|---------------|------------|------------|-------|-----------------|
| No offloading | 28 | 11-14GB | 1.0x | RTX 4090, A100, 3090 |
| Balanced | 12 | 6-8GB | 0.70x | RTX 4070, 3080 12GB |
| Aggressive | 8 | 5-7GB | 0.55x | RTX 3060 12GB |
| Extreme | 4 | 4-5GB | 0.40x | RTX 3080 10GB |

> Float8 quantization is only supported on RTX 40-series and newer GPUs.
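
When calling the generation API directly, the backend validates the `offloading` block of the request: `mode` is either `preset` (with `preset` set to `balanced`, `aggressive`, or `extreme`) or `manual` (with `num_gpu_layers` between 1 and 28). Below is a minimal sketch using Python's `requests`; the endpoint path and the other request fields are illustrative assumptions, not the documented API (see [docs/APIs.md](docs/APIs.md)):

```python
import requests

# Offloading config as validated by the backend:
# preset mode with one of the three named presets...
offloading = {"enabled": True, "mode": "preset", "preset": "balanced"}
# ...or manual mode with an explicit GPU layer count (1-28):
# offloading = {"enabled": True, "mode": "manual", "num_gpu_layers": 12}

# Hypothetical endpoint path and body fields, for illustration only.
resp = requests.post(
    "http://localhost:9527/api/v1/projects/my-project/generations",
    json={"session_id": "<session-uuid>", "offloading": offloading},
)
print(resp.status_code, resp.json())
```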

#### Internationalization

- **Full bilingual support**: Complete English/Chinese UI with 360+ translation keys
- **Auto detection**: Detects the browser language on first visit
- **Persistent preference**: Language choice saved in localStorage
- **Backend i18n**: API error messages and responses translated to the user's language

#### Docker Deployment

- **Multi-stage build**: Optimized Dockerfile covering the frontend build, Python venv, and model download
- **Self-contained**: Clones from GitHub and builds entirely from source
- **HuggingFace integration**: Model files (~3-4GB) downloaded automatically during the build

#### Other Features

- **Responsive design**: Mobile-friendly UI built with Tailwind CSS
- **Real-time updates**: Smart polling intervals without WebSockets (2s while active, 60s in the background)
- **Audio cache busting**: Ensures audio updates are reflected immediately
- **Toast notifications**: User-friendly feedback for all operations
- **Dark mode ready**: Modern, consistent styling
- **Accessibility**: Keyboard navigation and ARIA labels

---

## Demo Samples

Listen to voice samples generated with VibeVoiceFusion. Click the links below to download and play:

### Single Speaker

**🎧 [A Chinese Odyssey (BFloat16 model)](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/1p_pandora_box_bf16.wav)**

*Generated with the bfloat16 precision model - full quality, requires 14GB VRAM*

**🎧 [A Chinese Odyssey (Float8 model)](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/1p_pandora_box_float8_e4m3fn.wav)**

*Generated with float8 quantization - optimized down to 7GB VRAM at comparable quality; supported on RTX 40-series and newer GPUs*

### Multi-Speaker (3 Speakers)

**🎭 [Ashes of Time - Journey to the West edition](https://raw.githubusercontent.com/zhao-kun/VibeVoiceFusion/main/demo/outputs/东邪西毒-西游版.wav)**

*Multi-speaker dialog in which each character keeps a distinct voice*

---

## Quick Start

### Prerequisites

- **Python**: 3.9 or later
- **Node.js**: 16.x or later (for frontend development)
- **CUDA**: CUDA-capable GPU (recommended)
- **VRAM**: Minimum 6GB with extreme offloading; 14GB recommended for best performance
- **Docker**: Optional, for containerized deployment

### Installation

#### Option 1: Docker (Recommended for Production)

Build the Docker image:
```bash
# Clone the repository
git clone https://github.com/zhao-kun/vibevoicefusion.git
cd vibevoicefusion

# Build with Docker Compose
docker compose build vibevoice
```

Run the Docker container:

```bash
docker run -d \
  --name vibevoice \
  --gpus all \
  -p 9527:9527 \
  -v $(pwd)/workspace:/workspace/zhao-kun/vibevoice/workspace \
  zhaokundev/vibevoicefusion:latest
```

Access the application at `http://localhost:9527`

**The Docker image is also published on Docker Hub, so you can start the vibevoicefusion service directly:**

```bash
docker pull zhaokundev/vibevoicefusion
docker run -d \
  --name vibevoicefusion \
  --gpus all \
  -p 9527:9527 \
  -v $(pwd)/workspace:/workspace/zhao-kun/vibevoice/workspace \
  zhaokundev/vibevoicefusion:latest
```

**Build time**: 18-28 minutes | **Image size**: ~12-15GB

#### Option 2: Manual Installation

**1. Install backend dependencies**

```bash
# Clone the repository
git clone https://github.com/zhao-kun/vibevoice.git
cd vibevoice

# Install the Python package
pip install -e .
```

**2. Download pretrained models**

Download from HuggingFace (choose one):

- **Float8 (recommended)**: [vibevoice7b_float8_e4m3fn.safetensors](https://huggingface.co/zhaokun/vibevoice-large/blob/main/vibevoice7b_float8_e4m3fn.safetensors) (~7GB)
- **BFloat16 (full precision)**: [vibevoice7b_bf16.safetensors](https://huggingface.co/zhaokun/vibevoice-large/blob/main/vibevoice7b_bf16.safetensors) (~14GB)
- **Config file**: [config.json](https://huggingface.co/zhaokun/vibevoice-large/blob/main/config.json)

Place the files in the `./models/vibevoice/` directory

**3. Install frontend dependencies** (for development)

```bash
cd frontend
npm install
```

**4. Build the frontend** (for production)

```bash
cd frontend
npm run build
cp -r out/* ../backend/dist/
```

### Usage

#### Web Application (Recommended)

**Production mode** (single server):

```bash
# Start the backend server (serves both the API and the frontend)
python backend/run.py

# Access at http://localhost:9527
```

**Development mode** (separate servers):

```bash
# Terminal 1: start the backend API
python backend/run.py  # http://localhost:9527

# Terminal 2: start the frontend dev server
cd frontend
npm run dev  # http://localhost:3000
```

### Quick Generate (No Project Required)

For quick tests without setting up a project:

1. Click **"Quick Generate"** from the home page
2. **Choose a voice source**:
   - **Upload**: Drag and drop audio files (up to 4 files)
   - **Preset**: Pick from preset voices with language/gender filtering
3. **Enter text**: Use dialogue format (`Speaker 1: Hello`) or narration format (plain text)
4. **Configure parameters**: Set the seed, batch size (2-20), and offloading options
5. **Generate**: Click generate and monitor per-item progress
6. **Download**: Play or download the generated audio

**Key points:**
- Dialogue vs. narration mode is detected automatically
- In dialogue mode, all speakers use the same voice
- LoRA is not supported (base model only)
- History persists across sessions

---

### Complete Workflow Guide

This guide walks you through creating a multi-speaker voice generation from start to finish.

#### Step 1: Create a Project

Start by creating a new project or selecting an existing one. Projects help you organize your voice generation work with metadata and descriptions.

<div align="center">
<img src="docs/images/home-zh.png" alt="Project management" width="700"/>
<p><i>Create and manage projects from the home page</i></p>
</div>

**Steps:**
- Click the "Create New Project" card
- Enter a project name (e.g., "Podcast Episode 1")
- Optionally add a description
- Click "Create Project"

The project is selected automatically and you are taken to the speaker roles page.

#### Step 2: Add Speakers and Upload Voice Samples

Upload a reference audio sample for each speaker. The system supports multiple audio formats (WAV, MP3, M4A, FLAC, WebM).

<div align="center">
<img src="docs/images/speaker-role-zh.png" alt="Speaker management" width="700"/>
<p><i>Upload and manage voice samples for each speaker</i></p>
</div>

**Steps:**
- Click the "Add New Speaker" button
- Speakers are named automatically (e.g., "Speaker 1", "Speaker 2")
- Click "Upload Voice" and select a reference audio file (3-30 seconds recommended)
- Preview uploaded voices with the audio player
- Repeat to add more speakers (2-4+ speakers supported)

**Tips:**
- Use clean audio with minimal background noise
- 5-15 seconds of speech works best for voice cloning
- Each speaker needs its own distinct voice sample
- You can replace an audio file later by clicking "Change Voice"

#### Step 3: Create and Edit Dialogs

Create a dialog session and write the multi-speaker dialog. The dialog editor supports drag-and-drop reordering and real-time preview.

<div align="center">
<img src="docs/images/voice-editor-zh.png" alt="Dialog editor" width="700"/>
<p><i>Multi-speaker dialog editor with visual and text modes</i></p>
</div>

**Steps:**
- Click "Create New Session" in the session list
- Enter a session name (e.g., "Chapter 1")
- Add dialog lines for each speaker in the dialog editor:
  - Pick the speaker from the dropdown
  - Enter the dialog text
  - Click "Add Line" or press Enter
- Reorder dialog lines by dragging the handle icon
- Use "Text Editor" mode for bulk edits
- Click "Save" to persist your changes

**Dialog format (text mode):**
```
Speaker 1: Welcome to our podcast!

Speaker 2: Thanks for having me. Glad to be here.

Speaker 1: Let's dive into today's topic.
```

**Narration mode:**

For single-speaker content such as audiobooks, articles, and podcasts, use **narration mode**:

1. When creating a new session, switch to "Narration" mode
2. Pick the narrator voice from your uploaded speakers
3. Enter plain text without `Speaker N:` prefixes
4. The selected narrator voice reads all of the text

```
This is the first paragraph of narration.

This is the second paragraph. No speaker format needed.

Your selected narrator voice reads all of the text.
```

**Features:**
- Visual editor with drag-and-drop
- Text editor for bulk editing
- Real-time preview
- Copy and download
- Format validation
- **Narration mode** for single-speaker content

#### Step 4: Generate Voice

Configure generation parameters and start the speech synthesis. Monitor real-time progress and manage the generation history.

<div align="center">
<img src="docs/images/generate-voice-zh.png" alt="Voice generation" width="700"/>
<p><i>Generation UI with parameter configuration, live progress, and history</i></p>
</div>

**Steps:**
- Navigate to the "Generate Voice" page
- Pick a dialog session from the dropdown
- Configure parameters:
  - **Model type**:
    - `float8_e4m3fn` (recommended): 7GB VRAM, faster loading
    - `bfloat16`: 14GB VRAM, full precision
  - **CFG scale** (1.0-2.0): Controls how closely generation follows the text
    - Lower (1.0-1.3): more natural and varied
    - Higher (1.5-2.0): more controlled, can sound mechanical
    - Default: 1.3
  - **Random seed**: Any positive integer, for reproducibility
  - **Offloading** (optional): Enable if VRAM < 14GB
    - **Balanced**: 12 GPU layers, saves ~5GB, 2.0x slower (RTX 3060 16GB, 4070)
    - **Aggressive**: 8 GPU layers, saves ~6GB, 2.5x slower (RTX 3060 12GB)
    - **Extreme**: 4 GPU layers, saves ~7GB, 3.5x slower (10GB VRAM minimum)
- Click "Start Generation"

**Real-time monitoring:**
- Progress bar with completion percentage
- Stage indicator: preprocessing → inference → saving
- Live token generation count
- Estimated time remaining

<div align="center">
<img src="docs/images/generating-voice-zh.png" alt="Voice generating" width="700"/>
<p><i>Generation UI with parameter configuration, live progress, and history</i></p>
</div>

**Generation history:**
- View all past generations with status (completed, failed, running)
- Filter and sort by date, status, or session
- Play generated audio inline
- Download WAV files
- Delete unwanted generations
- View detailed metrics (tokens, duration, RTF, VRAM usage)

#### Command-Line Interface

Generate from the command line without the web UI:

```bash
python demo/local_file_inference.py \
    --model_file ./models/vibevoice/vibevoice7b_float8_e4m3fn.safetensors \
    --txt_path demo/text_examples/1p_pandora_box.txt \
    --speaker_names zh-007 \
    --output_dir ./outputs \
    --dtype float8_e4m3fn \
    --cfg_scale 1.3 \
    --seed 42
```

**CLI parameters:**

- `--model_file`: Path to the model `.safetensors` file
- `--config`: Path to `config.json` (optional)
- `--txt_path`: Dialog text file with speaker labels
- `--speaker_names`: Speaker names mapped to voice files
- `--output_dir`: Output directory for generated audio
- `--device`: `cuda`, `mps`, or `cpu` (auto-detected)
- `--dtype`: `float8_e4m3fn` or `bfloat16`
- `--cfg_scale`: Classifier-free guidance scale (default: 1.3)
- `--seed`: Random seed for reproducibility

### Configuration

#### Backend Configuration

Environment variables (optional):

```bash
export WORKSPACE_DIR=/path/to/workspace  # default: ./workspace
export FLASK_DEBUG=False  # production mode
```

#### Frontend Configuration

Development API URL (`frontend/.env.local`):

```bash
NEXT_PUBLIC_API_URL=http://localhost:9527/api/v1
```

---

## Documentation

### Architecture Overview

```
vibevoice/
├── backend/                 # Flask API server
│   ├── api/                # REST API endpoints
│   │   ├── projects.py     # Project CRUD
│   │   ├── speakers.py     # Speaker management
│   │   ├── dialog_sessions.py  # Dialog CRUD
│   │   ├── generation.py   # Voice generation
│   │   ├── datasets.py     # Dataset management
│   │   └── training.py     # LoRA training
│   ├── services/           # Business logic layer
│   ├── models/             # Data models
│   ├── task_manager/       # Background task queue
│   ├── inference/          # Inference engine
│   ├── training/           # Training engine and state management
│   ├── i18n/              # Backend translations
│   └── dist/              # Frontend static files (production)
├── frontend/               # Next.js web application
│   ├── app/               # Next.js pages
│   │   ├── page.tsx       # Home/project selector
│   │   ├── quick-generate/ # Quick generate (no project required)
│   │   ├── speaker-role/  # Speaker management
│   │   ├── voice-editor/  # Dialog editor
│   │   ├── generate-voice/ # Generation page
│   │   ├── dataset/       # Dataset management
│   │   └── fine-tuning/   # LoRA training pages
│   ├── components/        # React components
│   ├── lib/              # Context providers and utilities
│   │   ├── ProjectContext.tsx
│   │   ├── SessionContext.tsx
│   │   ├── SpeakerRoleContext.tsx
│   │   ├── GenerationContext.tsx
│   │   ├── TrainingContext.tsx
│   │   ├── GlobalTaskContext.tsx
│   │   ├── i18n/         # Frontend translations
│   │   └── api.ts        # API client
│   └── types/            # TypeScript type definitions
└── vibevoice/            # Core inference library
    ├── modular/          # Model implementation
    │   ├── custom_offloading_utils.py  # Layer offloading
    │   └── adaptive_offload.py         # Automatic VRAM configuration
    ├── processor/        # Input processing
    └── schedule/         # Diffusion scheduling
```

### API Reference

For complete API documentation with request/response examples, see [docs/APIs.md](docs/APIs.md).

### Workspace Structure

```
workspace/
├── projects.json          # Metadata for all projects
├── _quick-generate/       # Quick Generate storage
│   ├── voices/            # Uploaded voice samples
│   ├── outputs/           # Generated audio files
│   └── history.json       # Generation history
└── {project-id}/
    ├── voices/
    │   ├── speakers.json  # Speaker metadata
    │   └── {uuid}.wav     # Voice files
    ├── scripts/
    │   ├── sessions.json  # Session metadata
    │   └── {uuid}.txt     # Dialog text files
    ├── output/
    │   ├── generation.json  # Generation metadata
    │   └── {request_id}.wav # Generated audio files
    ├── datasets/
    │   ├── datasets.json    # Dataset metadata
    │   └── {dataset-id}/
    │       ├── datasets.jsonl  # Dataset entries (one JSON per line)
    │       ├── audio/          # Audio files
    │       └── voice_prompts/  # Voice prompt files
    └── training/
        ├── training_history.json  # Training task metadata
        └── lora_output/
            └── {lora-name}/
                ├── model_epoch_*.safetensors  # Checkpoint files
                └── model_final.safetensors    # Final model
```

### Performance Benchmarks

**RTX 4090 (24GB VRAM):**

| Configuration | VRAM | Generation Time | RTF | Quality |
|---------------|------|-----------------|-----|---------|
| BFloat16, no offloading | 14GB | 15s (50s audio) | 0.30x | Excellent |
| Float8, no offloading | 7GB | 16s (50s audio) | 0.32x | Excellent |

**RTX 3060 12GB:**

| Configuration | VRAM | Generation Time | RTF | Quality |
|---------------|------|-----------------|-----|---------|
| Float8, Balanced | 7GB | 30s (50s audio) | 0.60x | Excellent |
| Float8, Aggressive | 6GB | 40s (50s audio) | 0.80x | Good |

*RTF (real-time factor) < 1.0 means faster-than-real-time generation*

---

## Community

### Getting Help

- **Issues**: [GitHub Issues](https://github.com/zhao-kun/vibevoice/issues) - bug reports and feature requests
- **Discussions**: [GitHub Discussions](https://github.com/zhao-kun/vibevoice/discussions) - questions and community support

### Showcase

Share your projects and experience:

- **Demo audio**: Submit your generated samples to the showcase
- **Use cases**: Share how you use VibeVoice
- **Improvements**: Contribute optimizations and enhancements

### Responsible AI

**Important**: This project is intended for **research and development** purposes only.

#### Risks

- **Deepfakes and impersonation**: Synthetic voices can be misused for fraud or disinformation
- **Voice cloning ethics**: Always obtain explicit consent before cloning a voice
- **Bias**: The model may inherit biases from its training data
- **Unexpected output**: Generated audio may contain artifacts or inaccuracies

#### Guidelines

**Do:**

- Clearly disclose that audio is AI-generated
- Obtain explicit consent for voice cloning
- Use responsibly and for legitimate purposes
- Respect privacy and intellectual property
- Comply with all applicable laws and regulations

**Don't:**

- Create deepfakes or impersonations without consent
- Spread disinformation or misleading content
- Use for fraud, scams, or malicious purposes
- Violate laws or ethical guidelines

**By using this software, you agree to use it ethically and responsibly.**

---

## Contributing

We welcome community contributions! Here is how you can help:

### Ways to Contribute

1. **Report bugs**: Open an issue with detailed reproduction steps
2. **Suggest features**: Propose new features via GitHub issues
3. **Submit pull requests**:
   - Fix bugs
   - Add features
   - Improve documentation
   - Add translations
4. **Improve documentation**: Help make the project more accessible
5. **Share use cases**: Show how you use VibeVoice

### Testing

```bash
# Backend tests (when available)
pytest tests/

# Frontend tests (when available)
cd frontend
npm test

# Manual testing
# 1. Create a project
# 2. Add speakers
# 3. Create a dialog
# 4. Generate voice
# 5. Verify output quality
```

---

## License

This project follows the same license terms as the original Microsoft VibeVoice repository. See the [LICENSE](LICENSE) file for details.

### Third-Party Licenses

- **Frontend**: React, Next.js, Tailwind CSS (MIT License)
- **Backend**: Flask, PyTorch (various open-source licenses)
- **Model weights**: Microsoft VibeVoice (subject to Microsoft's terms)

---

## Acknowledgements

- **Microsoft Research**: Original VibeVoice model and architecture
- **ComfyUI**: Inspiration for the Float8 conversion technique
- **kohya-ss/musubi-tuner**: Reference for the offloading and LoRA network implementations
- **[voicepowered-ai/VibeVoice-finetuning](https://github.com/voicepowered-ai/VibeVoice-finetuning)**: Training data loader implementation
- **HuggingFace**: Model hosting and distribution
- **Open-source community**: The many libraries and frameworks that make this project possible

---

## Citation

If you use this implementation in your research, please cite both this project and the original VibeVoice paper:

```bibtex
@software{vibevoice_webapp_2024,
  title={VibeVoice: Complete Web Application for Multi-Speaker Voice Generation},
  author={Zhao, Kun},
  year={2024},
  url={https://github.com/zhao-kun/vibevoice}
}

@article{vibevoice2024,
  title={VibeVoice: Unified Autoregressive and Diffusion for Speech Generation},
  author={Microsoft Research},
  year={2024}
}
```

---

## Troubleshooting

### CUDA Out of Memory

```bash
# Try the Float8 model
--dtype float8_e4m3fn

# Enable layer offloading in the web UI
# Or use the CLI with manual configuration
```

### Audio Quality Issues

```bash
# Adjust the CFG scale (try 1.0 - 2.0)
--cfg_scale 1.5

# Use a higher-precision model
--dtype bfloat16
```

### Port Already in Use

```bash
# Change the port in backend/run.py
app.run(host='0.0.0.0', port=9528)
```

### Frontend Build Errors

```bash
cd frontend
rm -rf node_modules .next
npm install
npm run build
```

---

<div align="center">

**Made by the VibeVoice Community**

[Back to Top](#vibevoicefusion)

</div>


================================================
FILE: backend/.gitignore
================================================
.env


================================================
FILE: backend/README.md
================================================
# VibeVoice Backend

Flask-based REST API backend for VibeVoice speech generation system.

## Architecture

```
backend/
├── api/                    # API endpoints (blueprints)
│   ├── __init__.py        # Main API blueprint
│   ├── projects.py        # Project management endpoints
│   ├── speakers.py        # Speaker role endpoints
│   ├── dialog_sessions.py # Dialog session endpoints
│   └── generation.py      # Voice generation endpoints (TBD)
├── models/                 # Data models
│   ├── project.py         # Project dataclass
│   ├── speaker.py         # SpeakerRole dataclass
│   └── dialog_session.py  # DialogSession dataclass
├── services/              # Business logic services
│   ├── project_service.py
│   ├── speaker_service.py
│   └── dialog_session_service.py
├── utils/                 # Utility functions
│   ├── file_handler.py
│   └── dialog_validator.py
├── app.py                 # Flask application factory
├── config.py              # Configuration management
├── run.py                 # Development server
└── .env.example          # Environment variables template
```

## Setup

### Install Dependencies

The backend shares dependencies with the main VibeVoice project. Install from the project root:

```bash
pip install -e .
```

### Configuration

1. Copy the example environment file:
```bash
cp backend/.env.example backend/.env
```

2. Edit `backend/.env` with your configuration

### Running the Development Server

From the project root:

```bash
python backend/run.py
```

Or using the module:

```bash
python -m backend.run
```

The server will start at `http://localhost:9527` by default.

## API Endpoints

### Health Check

```
GET /health
```

Returns server health status.

### API Base

```
GET /
```

Returns API information.

### Test Endpoint

```
GET /api/v1/ping
```

Simple ping endpoint for testing.

### Projects API

#### List Projects
```
GET /api/v1/projects
```

Returns all projects with metadata.

**Response:**
```json
{
  "projects": [
    {
      "id": "my-project",
      "name": "My Project",
      "description": "Project description",
      "created_at": "2025-10-22T03:18:58.969507",
      "updated_at": "2025-10-22T03:18:58.969507"
    }
  ],
  "count": 1
}
```

#### Get Project
```
GET /api/v1/projects/<project_id>
```

Get specific project by ID.

#### Create Project
```
POST /api/v1/projects
Content-Type: application/json

{
  "name": "Project Name",
  "description": "Optional description"
}
```

Creates a new project directory with subdirectories (`voices/`, `scripts/`, `outputs/`) and adds metadata entry.

**Response:** HTTP 201 with project data
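
For example, from Python (a minimal sketch using the `requests` library, assuming the default development server address):

```python
import requests

BASE = "http://localhost:9527/api/v1"

# Create a project; the server responds with HTTP 201 and the project data.
resp = requests.post(f"{BASE}/projects", json={
    "name": "Project Name",
    "description": "Optional description",
})
resp.raise_for_status()
print(resp.json())  # {"id": ..., "name": "Project Name", ...}
```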

#### Update Project
```
PUT /api/v1/projects/<project_id>
Content-Type: application/json

{
  "name": "Updated Name",
  "description": "Updated description"
}
```

Updates project metadata (name and/or description).

#### Delete Project
```
DELETE /api/v1/projects/<project_id>
```

Deletes project directory and removes from metadata.

**Response:**
```json
{
  "message": "Project deleted successfully",
  "project_id": "my-project"
}
```

## Configuration

Environment variables (see `.env.example`):

- `FLASK_ENV`: Environment (development/production/testing)
- `FLASK_HOST`: Server host (default: 0.0.0.0)
- `FLASK_PORT`: Server port (default: 9527)
- `FLASK_DEBUG`: Enable debug mode (default: true)
- `SECRET_KEY`: Flask secret key
- `CORS_ORIGINS`: Allowed CORS origins (comma-separated)
- `WORKSPACE_DIR`: Root directory for all projects (default: ./workspace)
- `MODEL_PATH`: Path to VibeVoice model
- `MODEL_DEVICE`: Device for model inference (cuda/cpu)
- `UPLOAD_FOLDER`: Directory for uploaded files
- `MAX_CONTENT_LENGTH`: Maximum file upload size

## Development

### Project Structure

- **api/**: REST API endpoints organized by resource
- **models/**: Data models and schemas
- **services/**: Business logic and model integration
- **utils/**: Helper functions and utilities

### Adding New Endpoints

1. Create a new file in `api/` (e.g., `api/myresource.py`)
2. Define routes using Flask blueprints
3. Import and register the blueprint in `api/__init__.py`
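
A minimal sketch following the pattern of the existing modules (`myresource` is a placeholder name):

```python
# backend/api/myresource.py
from flask import jsonify
from backend.api import api_bp


@api_bp.route('/myresource', methods=['GET'])
def list_myresource():
    """Placeholder endpoint that returns an empty collection."""
    return jsonify({'items': [], 'count': 0}), 200
```

Then import the module at the bottom of `backend/api/__init__.py` so its routes are registered:

```python
from backend.api import myresource  # noqa: F401, E402
```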

## Project Directory Structure

Each project is stored as a directory under `WORKSPACE_DIR`:

```
workspace/
├── projects.json              # Metadata for all projects
└── my-project/               # Project directory (named by project ID)
    ├── voices/               # Speaker voice samples
    │   ├── speakers.json    # Speaker metadata
    │   └── *.wav            # Voice sample files
    ├── scripts/              # Dialog scripts
    │   ├── sessions.json    # Dialog session metadata
    │   └── *.txt            # Dialog text files
    └── outputs/              # Generated audio files (TBD)
```

### Speakers API

#### List Speakers
```
GET /api/v1/projects/<project_id>/speakers
```

Returns all speaker roles for a project.

**Response:**
```json
{
  "speakers": [
    {
      "speaker_id": "Speaker 1",
      "name": "Alice",
      "description": "Main host",
      "voice_filename": "abc123.wav",
      "created_at": "2025-10-22T...",
      "updated_at": "2025-10-22T..."
    }
  ],
  "count": 1
}
```

#### Get Speaker
```
GET /api/v1/projects/<project_id>/speakers/<speaker_id>
```

Get specific speaker by ID (e.g., "Speaker 1").

#### Add Speaker
```
POST /api/v1/projects/<project_id>/speakers
Content-Type: multipart/form-data

name: Speaker Name
description: Speaker description (optional)
voice_file: <audio file> (.wav, .mp3, .m4a, .flac)
```

Creates a new speaker role with voice file upload. Speaker ID is automatically generated as "Speaker N" where N is sequential starting from 1.

**Response:** HTTP 201 with speaker data
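
For example, from Python (a sketch with `requests`; the project ID and file path are placeholders):

```python
import requests

BASE = "http://localhost:9527/api/v1"

# Upload a new speaker with a reference voice sample (multipart/form-data).
with open("alice.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/projects/my-project/speakers",
        data={"name": "Alice", "description": "Main host"},
        files={"voice_file": ("alice.wav", f, "audio/wav")},
    )
print(resp.status_code, resp.json())
```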

#### Update Speaker
```
PUT /api/v1/projects/<project_id>/speakers/<speaker_id>
Content-Type: application/json

{
  "name": "Updated Name",
  "description": "Updated description"
}
```

Updates speaker metadata (name and/or description). Voice file cannot be changed after creation.

#### Delete Speaker
```
DELETE /api/v1/projects/<project_id>/speakers/<speaker_id>
```

Deletes speaker role and its voice file. Speaker IDs are automatically reindexed to maintain continuity (e.g., deleting "Speaker 1" renames "Speaker 2" to "Speaker 1").

**Response:**
```json
{
  "message": "Speaker deleted successfully. Speaker IDs have been reindexed.",
  "speaker_id": "Speaker 1"
}
```

#### Download Voice File
```
GET /api/v1/projects/<project_id>/speakers/<speaker_id>/voice
```

Download speaker's voice sample file.

### Dialog Sessions API

#### List Dialog Sessions
```
GET /api/v1/projects/<project_id>/sessions
```

Returns all dialog sessions for a project.

**Response:**
```json
{
  "sessions": [
    {
      "session_id": "uuid",
      "name": "Episode 1",
      "description": "First episode",
      "text_filename": "uuid.txt",
      "created_at": "2025-10-22T...",
      "updated_at": "2025-10-22T..."
    }
  ],
  "count": 1
}
```

#### Get Dialog Session
```
GET /api/v1/projects/<project_id>/sessions/<session_id>
```

Get specific dialog session by ID.

#### Create Dialog Session
```
POST /api/v1/projects/<project_id>/sessions
Content-Type: application/json

{
  "name": "Session Name",
  "description": "Session description",
  "dialog_text": "Speaker 1: Hello\n\nSpeaker 2: Hi there!"
}
```

Creates a new dialog session with text content. The dialog text must follow the format:
- `Speaker N: dialog text` (where N is a positive integer)
- Empty line between dialog entries
- Speaker IDs must exist in the speaker management system

The API validates:
1. Dialog text format (correct pattern)
2. Speaker IDs exist in the project

**Response:** HTTP 201 with session data

**Error Response (validation failed):**
```json
{
  "error": "Validation Error",
  "message": "Speaker ID validation failed: Invalid speaker IDs found: Speaker 99"
}
```
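
For example, from Python (a `requests` sketch; note the blank line between dialog entries, encoded as `\n\n`):

```python
import requests

BASE = "http://localhost:9527/api/v1"

# Create a dialog session; the speaker IDs must already exist in the project.
resp = requests.post(f"{BASE}/projects/my-project/sessions", json={
    "name": "Episode 1",
    "description": "First episode",
    "dialog_text": "Speaker 1: Hello\n\nSpeaker 2: Hi there!",
})
print(resp.status_code, resp.json())
```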

#### Update Dialog Session
```
PUT /api/v1/projects/<project_id>/sessions/<session_id>
Content-Type: application/json

{
  "name": "Updated Name",
  "description": "Updated description",
  "dialog_text": "Speaker 1: Updated content"
}
```

Updates session metadata and/or dialog text content. Same validation rules apply for dialog_text.

#### Delete Dialog Session
```
DELETE /api/v1/projects/<project_id>/sessions/<session_id>
```

Deletes dialog session and its text file.

**Response:**
```json
{
  "message": "Session deleted successfully",
  "session_id": "uuid"
}
```

#### Get Session Text
```
GET /api/v1/projects/<project_id>/sessions/<session_id>/text
```

Get dialog text content for a session.

**Response:**
```json
{
  "session_id": "uuid",
  "dialog_text": "Speaker 1: Hello\n\nSpeaker 2: Hi!"
}
```

#### Download Session Text File
```
GET /api/v1/projects/<project_id>/sessions/<session_id>/download
```

Download dialog text file.

## Dialog Text Format

Dialog sessions must follow this specific format (matching `demo/text_examples/*.txt`):

```
Speaker 1: First line of dialog from speaker 1

Speaker 2: Response from speaker 2

Speaker 1: Another line from speaker 1

Speaker 3: A third speaker joins
```

**Format rules:**
- Pattern: `Speaker N: dialog text` where N is a positive integer
- Empty line separates each dialog entry
- Speaker can appear multiple times in any order
- Speaker IDs must match existing speakers in the project
- Dialog text cannot be empty

**Example:**
```
Speaker 1: Hello and welcome to our show.

Speaker 2: Thanks for having me on today.

Speaker 1: Let's dive right in. What brings you here?

Speaker 2: I wanted to discuss our new project.
```
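
The format is simple enough to check with a regular expression. Below is a minimal sketch of this kind of validation (illustrative only; the actual checks live in `utils/dialog_validator.py`):

```python
import re

# One entry per block: "Speaker N: dialog text" with N a positive integer.
ENTRY = re.compile(r"^Speaker [1-9]\d*: \S.*$")


def validate_dialog(text: str) -> list[str]:
    """Return a list of problems; an empty list means the dialog text is valid."""
    problems = []
    # Entries are separated by blank lines.
    for i, block in enumerate(text.strip().split("\n\n"), start=1):
        if not ENTRY.match(block.strip()):
            problems.append(f"Entry {i} does not match 'Speaker N: dialog text'")
    return problems


print(validate_dialog("Speaker 1: Hello\n\nSpeaker 2: Hi!"))  # -> []
```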

## Future Implementation

The following endpoints are planned for implementation:

- **Generation API**: Speech generation from dialog sessions

================================================
FILE: backend/__init__.py
================================================
"""
VibeVoice Backend API
Flask-based backend for VibeVoice speech generation
"""

__version__ = "0.0.1"


================================================
FILE: backend/api/__init__.py
================================================
"""
API Blueprint for VibeVoice backend
"""
from flask import Blueprint, jsonify

# Create main API blueprint
api_bp = Blueprint('api', __name__)


@api_bp.route('/ping', methods=['GET'])
def ping():
    """Simple ping endpoint for testing"""
    return jsonify({'message': 'pong', 'status': 'ok'}), 200


# Import route modules
from backend.api import projects  # noqa: F401, E402
from backend.api import speakers  # noqa: F401, E402
from backend.api import dialog_sessions  # noqa: F401, E402
from backend.api import generation  # noqa: F401, E402
from backend.api import datasets  # noqa: F401, E402
from backend.api import training  # noqa: F401, E402
from backend.api import tasks  # noqa: F401, E402
from backend.api import preset_voices  # noqa: F401, E402
from backend.api import quick_generate  # noqa: F401, E402


================================================
FILE: backend/api/datasets.py
================================================
"""
Datasets API endpoints (project-scoped)
"""
from flask import request, jsonify, current_app, send_file
from pathlib import Path
from backend.api import api_bp
from backend.services.dataset_service import DatasetService
from backend.services.project_service import ProjectService
from backend.i18n import t


def get_dataset_service(project_id: str) -> DatasetService:
    """Get DatasetService instance for a specific project"""
    # Get project service to verify project exists
    project_service = ProjectService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        meta_file_name=current_app.config['PROJECTS_META_FILE']
    )

    # Get project path
    project_path = project_service.get_project_path(project_id)
    if not project_path:
        return None

    # Return dataset service for project's datasets directory
    return DatasetService(project_path / 'datasets')


@api_bp.route('/projects/<project_id>/datasets', methods=['GET'])
def list_datasets(project_id):
    """
    List all datasets for a project

    Args:
        project_id: Project identifier

    Returns:
        JSON response with list of datasets
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        datasets = service.list_datasets()

        return jsonify({
            'datasets': [d.to_dict() for d in datasets],
            'count': len(datasets)
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>', methods=['GET'])
def get_dataset(project_id, dataset_id):
    """
    Get dataset by ID

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Returns:
        JSON response with dataset data
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        dataset = service.get_dataset(dataset_id)

        if not dataset:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        return jsonify(dataset.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets', methods=['POST'])
def create_dataset(project_id):
    """
    Create a new dataset

    Args:
        project_id: Project identifier

    Request body:
        {
            "name": "Dataset name",
            "description": "Dataset description"
        }

    Returns:
        JSON response with created dataset data
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name', '').strip()
        description = data.get('description', '').strip()

        if not name:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.dataset_name_required')
            }), 400

        dataset = service.create_dataset(name, description)

        return jsonify(dataset.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>', methods=['PUT'])
def update_dataset(project_id, dataset_id):
    """
    Update dataset metadata

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Request body:
        {
            "name": "Updated name",
            "description": "Updated description"
        }

    Returns:
        JSON response with updated dataset data
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name')
        description = data.get('description')

        dataset = service.update_dataset(dataset_id, name, description)

        if not dataset:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        return jsonify(dataset.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>', methods=['DELETE'])
def delete_dataset(project_id, dataset_id):
    """
    Delete dataset

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        success = service.delete_dataset(dataset_id)

        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        return jsonify({
            'message': t('success.dataset_deleted'),
            'dataset_id': dataset_id
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/export', methods=['GET'])
def export_dataset(project_id, dataset_id):
    """
    Export dataset as a zip file

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Returns:
        Zip file containing dataset
    """
    import tempfile
    import atexit
    import shutil as shutil_module

    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        dataset = service.get_dataset(dataset_id)

        if not dataset:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Create temporary export file
        temp_dir = Path(tempfile.mkdtemp())
        export_path = temp_dir / f"{dataset_id}.zip"

        # Register cleanup function to remove temp directory after response is sent
        def cleanup():
            try:
                if temp_dir.exists():
                    shutil_module.rmtree(temp_dir)
            except Exception:
                pass

        atexit.register(cleanup)

        service.export_dataset(dataset_id, export_path)

        # Send file with cleanup callback
        response = send_file(
            export_path,
            as_attachment=True,
            download_name=f"{dataset.name}.zip",
            mimetype='application/zip'
        )

        # Add cleanup callback for after response is sent
        @response.call_on_close
        def on_close():
            cleanup()

        return response

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/import', methods=['PUT'])
def import_to_existing_dataset(project_id, dataset_id):
    """
    Import data into an existing dataset (replaces all items)

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Form data:
        dataset_file: Zip file containing dataset items (required)

    Returns:
        JSON response with updated dataset data
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        dataset = service.get_dataset(dataset_id)
        if not dataset:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        dataset_file = request.files.get('dataset_file')
        if not dataset_file:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.file_upload_error')
            }), 400

        updated_dataset = service.import_to_existing_dataset(dataset_id, dataset_file)

        return jsonify(updated_dataset.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/import', methods=['POST'])
def import_dataset(project_id):
    """
    Import dataset from a zip file

    Args:
        project_id: Project identifier

    Form data:
        dataset_file: Zip file containing dataset (required)
        name: Optional name for imported dataset

    Returns:
        JSON response with imported dataset data
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        dataset_file = request.files.get('dataset_file')
        if not dataset_file:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.file_upload_error')
            }), 400

        # Get optional name from form data
        dataset_name = request.form.get('name')

        dataset = service.import_dataset(dataset_file, dataset_name)

        return jsonify(dataset.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


# Dataset Items endpoints

@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/items', methods=['GET'])
def list_dataset_items(project_id, dataset_id):
    """
    List items in a dataset with pagination support

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Query parameters:
        offset: Starting index (default: 0)
        limit: Maximum number of items to return (default: all)

    Returns:
        JSON response with paginated list of dataset items
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Get pagination parameters from query string
        offset = request.args.get('offset', 0, type=int)
        limit = request.args.get('limit', type=int)  # None if not provided

        items, total_count = service.list_items(dataset_id, offset, limit)

        return jsonify({
            'items': [item.to_dict() for item in items],
            'count': len(items),
            'total': total_count,
            'offset': offset,
            'limit': limit
        }), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/items', methods=['POST'])
def add_dataset_item(project_id, dataset_id):
    """
    Add a new item to the dataset

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier

    Form data:
        text: Text content (required)
        audio_file: Audio file (required)
        voice_prompt_files: Voice prompt files (required, multiple)

    Returns:
        JSON response with created dataset item
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Get form data
        text = request.form.get('text', '').strip()
        audio_file = request.files.get('audio_file')
        voice_prompt_files = request.files.getlist('voice_prompt_files')

        if not text:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.text_required')
            }), 400

        if not audio_file:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.audio_file_required')
            }), 400

        if not voice_prompt_files or len(voice_prompt_files) == 0:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.voice_prompts_required')
            }), 400

        item = service.add_item(dataset_id, text, audio_file, voice_prompt_files)

        return jsonify(item.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/items/<int:item_index>', methods=['PUT'])
def update_dataset_item(project_id, dataset_id, item_index):
    """
    Update a dataset item

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier
        item_index: Index of item to update (0-based)

    Form data:
        text: Text content (optional)
        audio_file: Audio file (optional)
        voice_prompt_files: Voice prompt files (optional, multiple)

    Returns:
        JSON response with updated dataset item
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Get form data
        text = request.form.get('text')
        audio_file = request.files.get('audio_file')
        voice_prompt_files = request.files.getlist('voice_prompt_files')

        # getlist() returns an empty list when no files were uploaded;
        # convert it to None so the service treats the field as "not provided"
        if not voice_prompt_files:
            voice_prompt_files = None

        item = service.update_item(dataset_id, item_index, text, audio_file, voice_prompt_files)

        if not item:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.item_not_found')
            }), 404

        return jsonify(item.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/items/<int:item_index>', methods=['DELETE'])
def delete_dataset_item(project_id, dataset_id, item_index):
    """
    Delete a dataset item

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier
        item_index: Index of item to delete (0-based)

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        success = service.delete_item(dataset_id, item_index)

        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.item_not_found')
            }), 404

        return jsonify({
            'message': t('success.item_deleted'),
            'item_index': item_index
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


# Dataset file serving endpoints

@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/audio/<filename>', methods=['GET'])
def get_dataset_audio(project_id, dataset_id, filename):
    """
    Serve audio file from dataset

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier
        filename: Audio filename

    Returns:
        Audio file
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Get audio file path
        audio_dir = service._get_audio_dir(dataset_id)
        audio_path = audio_dir / filename

        if not audio_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': 'Audio file not found'
            }), 404

        return send_file(
            audio_path,
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/datasets/<dataset_id>/voice-prompts/<filename>', methods=['GET'])
def get_dataset_voice_prompt(project_id, dataset_id, filename):
    """
    Serve voice prompt file from dataset

    Args:
        project_id: Project identifier
        dataset_id: Dataset identifier
        filename: Voice prompt filename

    Returns:
        Voice prompt audio file
    """
    try:
        service = get_dataset_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        if not service.get_dataset(dataset_id):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.dataset_not_found')
            }), 404

        # Get voice prompt file path
        voice_prompts_dir = service._get_voice_prompts_dir(dataset_id)
        voice_prompt_path = voice_prompts_dir / filename

        if not voice_prompt_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': 'Voice prompt file not found'
            }), 404

        return send_file(
            voice_prompt_path,
            mimetype='audio/wav',
            as_attachment=False,
            download_name=filename
        )

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


================================================
FILE: backend/api/dialog_sessions.py
================================================
"""
Dialog Sessions API endpoints
"""
from flask import request, jsonify, current_app, send_file
from backend.api import api_bp
from backend.services.dialog_session_service import DialogSessionService
from backend.services.speaker_service import SpeakerService
from backend.services.project_service import ProjectService
from backend.i18n import t


def get_dialog_session_service(project_id: str) -> DialogSessionService:
    """Get DialogSessionService instance for a specific project"""
    # Get project service to find project directory
    project_service = ProjectService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        meta_file_name=current_app.config['PROJECTS_META_FILE']
    )

    # Get project path
    project_path = project_service.get_project_path(project_id)
    if not project_path:
        return None

    # Get speaker service for validation
    speaker_service = SpeakerService(project_path / 'voices')

    # Return dialog session service for project's scripts directory
    return DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)


@api_bp.route('/projects/<project_id>/sessions', methods=['GET'])
def list_sessions(project_id):
    """
    List all dialog sessions for a project

    Args:
        project_id: Project identifier

    Returns:
        JSON response with list of sessions
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        sessions = service.list_sessions()

        return jsonify({
            'sessions': [s.to_dict() for s in sessions],
            'count': len(sessions)
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions/<session_id>', methods=['GET'])
def get_session(project_id, session_id):
    """
    Get dialog session by ID

    Args:
        project_id: Project identifier
        session_id: Session identifier

    Returns:
        JSON response with session data
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        session = service.get_session(session_id)
        if not session:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.session_not_found')
            }), 404

        return jsonify(session.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions', methods=['POST'])
def create_session(project_id):
    """
    Create a new dialog session

    Args:
        project_id: Project identifier

    Request body:
        {
            "name": "Session name",
            "description": "Session description",
            "dialog_text": "Speaker 1: Hello\\n\\nSpeaker 2: Hi",
            "mode": "dialogue",  // or "narration"
            "narrator_speaker_id": "Speaker 1"  // required if mode="narration"
        }

    Returns:
        JSON response with created session data
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name')
        description = data.get('description', '')
        dialog_text = data.get('dialog_text', '')  # Default to empty string if not provided
        mode = data.get('mode', 'dialogue')  # Default to dialogue mode
        narrator_speaker_id = data.get('narrator_speaker_id')

        if not name:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.validation_error')
            }), 400

        # Allow empty dialog_text - user can add dialogs later
        session = service.create_session(
            name, description, dialog_text,
            mode=mode, narrator_speaker_id=narrator_speaker_id
        )

        return jsonify(session.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions/<session_id>', methods=['PUT'])
def update_session(project_id, session_id):
    """
    Update dialog session metadata and/or text

    Args:
        project_id: Project identifier
        session_id: Session identifier

    Request body:
        {
            "name": "Updated name",
            "description": "Updated description",
            "dialog_text": "Speaker 1: Updated text",
            "mode": "dialogue",  // or "narration"
            "narrator_speaker_id": "Speaker 1"  // required if mode="narration"
        }

    Returns:
        JSON response with updated session data
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name')
        description = data.get('description')
        dialog_text = data.get('dialog_text')
        mode = data.get('mode')
        narrator_speaker_id = data.get('narrator_speaker_id')

        session = service.update_session(
            session_id, name, description, dialog_text,
            mode=mode, narrator_speaker_id=narrator_speaker_id
        )
        if not session:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.session_not_found')
            }), 404

        return jsonify(session.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions/<session_id>', methods=['DELETE'])
def delete_session(project_id, session_id):
    """
    Delete dialog session and its text file

    Args:
        project_id: Project identifier
        session_id: Session identifier

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        success = service.delete_session(session_id)
        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.session_not_found')
            }), 404

        return jsonify({
            'message': t('success.session_deleted'),
            'session_id': session_id
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions/<session_id>/text', methods=['GET'])
def get_session_text(project_id, session_id):
    """
    Get dialog text content for a session

    Args:
        project_id: Project identifier
        session_id: Session identifier

    Returns:
        JSON response with dialog text content
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        text = service.get_session_text(session_id)
        if text is None:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.session_not_found')
            }), 404

        return jsonify({
            'session_id': session_id,
            'dialog_text': text
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/sessions/<session_id>/download', methods=['GET'])
def download_session_text(project_id, session_id):
    """
    Download dialog text file for a session

    Args:
        project_id: Project identifier
        session_id: Session identifier

    Returns:
        Text file or error
    """
    try:
        service = get_dialog_session_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        text_file_path = service.get_text_file_path(session_id)
        if not text_file_path or not text_file_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.session_not_found')
            }), 404

        return send_file(text_file_path, as_attachment=True, download_name=f'dialog_{session_id}.txt')

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


================================================
FILE: backend/api/generation.py
================================================
"""
Generation API endpoints
"""
from uuid import uuid4
from typing import Dict, Any, Optional
from flask import request, jsonify, current_app, send_file
from backend.api import api_bp
from backend.inference.inference import InferenceBase
from backend.models.generation import Generation
from backend.services.voice_gerneration_service import VoiceGenerationService
from backend.services.dialog_session_service import DialogSessionService
from backend.services.speaker_service import SpeakerService
from backend.services.project_service import ProjectService
from backend.task_manager.task import gm, Task
from backend.i18n import t
from config.configuration_vibevoice import InferencePhase
from util.logger import get_logger

logger = get_logger(__name__)

def _enrich_generation_with_session_name(generation: Generation, dialog_service: DialogSessionService) -> Dict[str, Any]:
    """
    Enrich generation dict with session_name field.

    If session is not found (deleted), returns a placeholder.

    Args:
        generation: Generation object
        dialog_service: DialogSessionService to look up session

    Returns:
        Generation dict with session_name added
    """
    gen_dict = generation.to_dict()

    # Try to get session name
    try:
        session = dialog_service.get_session(generation.session_id)
        if session:
            gen_dict['session_name'] = session.name
        else:
            # Session was deleted
            gen_dict['session_name'] = t('session.deleted')
    except Exception as e:
        logger.warning(f"Failed to get session name for session_id {generation.session_id}: {e}")
        gen_dict['session_name'] = t('session.deleted')

    return gen_dict


def _validate_offloading_config(offloading: Optional[dict]) -> Optional[dict]:
    """
    Validate offloading configuration from request.

    Args:
        offloading: Offloading config dict from request (can be None)

    Returns:
        Validated offloading config dict or None if disabled/not provided

    Raises:
        ValueError: If config is invalid
    """
    # If not provided or disabled, return None (backward compatible)
    if not offloading or not offloading.get('enabled', False):
        return None

    mode = offloading.get('mode', 'preset')

    # Validate mode
    if mode not in ['preset', 'manual']:
        raise ValueError(f"Invalid offloading mode: '{mode}'. Must be 'preset' or 'manual'")

    if mode == 'preset':
        preset = offloading.get('preset', 'balanced')
        valid_presets = ['balanced', 'aggressive', 'extreme']
        if preset not in valid_presets:
            raise ValueError(f"Invalid preset: '{preset}'. Must be one of: {', '.join(valid_presets)}")

        return {
            'enabled': True,
            'mode': 'preset',
            'preset': preset
        }

    elif mode == 'manual':
        num_gpu_layers = offloading.get('num_gpu_layers')
        if num_gpu_layers is None:
            raise ValueError("num_gpu_layers is required for manual mode")

        if not isinstance(num_gpu_layers, int) or num_gpu_layers < 1 or num_gpu_layers > 28:
            raise ValueError(f"num_gpu_layers must be an integer between 1 and 28, got: {num_gpu_layers}")

        return {
            'enabled': True,
            'mode': 'manual',
            'num_gpu_layers': num_gpu_layers
        }

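# Illustrative behavior of the validator above (values are examples, not
# required settings):
#
#   _validate_offloading_config(None)                                   # -> None
#   _validate_offloading_config({'enabled': False})                     # -> None
#   _validate_offloading_config({'enabled': True, 'preset': 'extreme'})
#       # -> {'enabled': True, 'mode': 'preset', 'preset': 'extreme'}
#   _validate_offloading_config({'enabled': True, 'mode': 'manual', 'num_gpu_layers': 14})
#       # -> {'enabled': True, 'mode': 'manual', 'num_gpu_layers': 14}
#   _validate_offloading_config({'enabled': True, 'mode': 'manual', 'num_gpu_layers': 99})
#       # -> raises ValueError (must be between 1 and 28)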

@api_bp.route('/projects/<project_id>/generations', methods=['POST'])
def start_voice_generation(project_id: str):
    """Start a new voice generation task for a project"""
    try:
        data = request.get_json()

        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        dialog_session_id = data.get('dialog_session_id')
        if dialog_session_id is None:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.session_required')
            }), 400

        request_id = uuid4().hex
        seeds = data.get('seeds', 42)
        cfg_scale = data.get('cfg_scale', 1.3)
        model_dtype = data.get('model_dtype', 'float8_e4m3fn')
        attn_implementation = data.get('attn_implementation', 'sdpa')
        lora_model_path = data.get('lora_model_path', None)
        lora_weight = data.get('lora_weight', 1.0)
        batch_size = data.get('batch_size', 1)

        # Parse and validate offloading configuration (NEW)
        offloading_config = data.get('offloading')
        try:
            validated_offloading = _validate_offloading_config(offloading_config)
        except ValueError as e:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': str(e)
            }), 400

        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        # Create the voice generation service for the project's output directory
        service = VoiceGenerationService(project_path / 'output',
                                         speaker_service=speaker_service,
                                         dialog_service=dialog_service,
                                         fake_model=current_app.config.get('FAKE_MODEL', False))

        generation: Generation = service.generation(dialog_session_id,
                                                    request_id,
                                                    seeds=seeds,
                                                    cfg_scale=cfg_scale,
                                                    model_dtype=model_dtype,
                                                    attn_implementation=attn_implementation,
                                                    project_id=project_id,
                                                    offloading_config=validated_offloading,
                                                    lora_model_path=lora_model_path,
                                                    lora_weight=lora_weight,
                                                    batch_size=batch_size)
        if not generation:
            return jsonify({
                'error': t('errors.internal_error'),
                'message': t('errors.generation_already_running')
            }), 500
        return jsonify({
            'message': t('success.generation_started'),
            'request_id': generation.request_id,
            'generation': _enrich_generation_with_session_name(generation, dialog_service)
        }), 200
    except Exception as e:
        logger.error("Error occurred while starting voice generation:", exc_info=e)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

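# Example request to start a generation (illustrative sketch: the host/port and
# the /api/v1 mount point are assumptions, and only dialog_session_id is
# required; every other field falls back to the defaults read above):
#
#   import requests
#   resp = requests.post(
#       'http://localhost:5000/api/v1/projects/<project_id>/generations',
#       json={
#           'dialog_session_id': '<session_id>',
#           'cfg_scale': 1.3,
#           'offloading': {'enabled': True, 'mode': 'preset', 'preset': 'balanced'},
#       },
#   )
#   request_id = resp.json()['request_id']
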
@api_bp.route('/projects/generations/current', methods=['GET'])
def get_current_generation():
    """
    Get the current generation status globally (for any project).
    Used by the navigation task indicator.

    Returns:
        JSON response with current generation status (200 with null if none active)
    """
    task: Task = gm.get_current_task()
    if not task or not isinstance(task.unwrap(), InferenceBase):
        return jsonify({
            'message': 'No active generation task at the moment',
            'generation': None
        }), 200
    inference: InferenceBase = task.unwrap()
    generation: Generation = inference.get_generation()

    # Enrich with session name
    try:
        if generation.project_id:
            project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                             meta_file_name=current_app.config['PROJECTS_META_FILE'])
            project_path = project_service.get_project_path(generation.project_id)

            if project_path:
                speaker_service = SpeakerService(project_path / 'voices')
                dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
                gen_dict = _enrich_generation_with_session_name(generation, dialog_service)
            else:
                gen_dict = generation.to_dict()
                gen_dict['session_name'] = t('session.deleted')
        else:
            gen_dict = generation.to_dict()
            gen_dict['session_name'] = t('session.unknown')
    except Exception as e:
        logger.warning(f"Failed to enrich current generation with session name: {e}")
        gen_dict = generation.to_dict()
        gen_dict['session_name'] = t('session.unknown')

    return jsonify({
        'message': 'Current generation status retrieved successfully',
        'generation': gen_dict
    }), 200


@api_bp.route('/projects/<project_id>/generations/current', methods=['GET'])
def get_current_generation_for_project(project_id: str):
    """
    Get the current generation status for a specific project.
    Only returns the generation if it belongs to the specified project.

    Args:
        project_id: Project identifier
    Returns:
        JSON response with current generation status (200 with null if none active for this project)
    """
    task: Task = gm.get_current_task()
    if not task or not isinstance(task.unwrap(), InferenceBase):
        return jsonify({
            'message': 'No active generation task at the moment',
            'generation': None
        }), 200

    inference: InferenceBase = task.unwrap()
    generation: Generation = inference.get_generation()

    # Check if the generation belongs to the requested project
    if generation.project_id != project_id:
        return jsonify({
            'message': 'No active generation task for this project',
            'generation': None
        }), 200

    # Enrich with session name
    try:
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])
        project_path = project_service.get_project_path(project_id)

        if project_path:
            speaker_service = SpeakerService(project_path / 'voices')
            dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
            gen_dict = _enrich_generation_with_session_name(generation, dialog_service)
        else:
            gen_dict = generation.to_dict()
            gen_dict['session_name'] = t('session.deleted')
    except Exception as e:
        logger.warning(f"Failed to enrich current generation with session name: {e}")
        gen_dict = generation.to_dict()
        gen_dict['session_name'] = t('session.unknown')

    return jsonify({
        'message': 'Current generation status retrieved successfully',
        'generation': gen_dict
    }), 200

@api_bp.route('/projects/<project_id>/generations', methods=['GET'])
def get_all_generations(project_id):
    """
    Get all generations for a project

    Args:
        project_id: Project identifier
    Returns:
        JSON response with list of generations for the project
    """
    try:
        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        # Create the voice generation service for the project's output directory
        service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

        generations = service.list_generations()

        # Reconcile stale PENDING generations before enriching: a generation
        # still marked PENDING that is not the currently running task must have
        # been interrupted, so mark it FAILED and persist the change once.
        metadata_dirty = False
        for gen in generations:
            if gen.status == InferencePhase.PENDING:
                gen_failed = True
                task = gm.get_current_task()
                if task and isinstance(task.unwrap(), InferenceBase):
                    inference: InferenceBase = task.unwrap()
                    current_gen: Generation = inference.get_generation()
                    if current_gen and current_gen.request_id == gen.request_id and current_gen.project_id == project_id:
                        gen_failed = False
                if gen_failed:
                    gen.status = InferencePhase.FAILED
                    metadata_dirty = True
        if metadata_dirty:
            service._save_metadata([g.to_dict() for g in generations])

        # Enrich all generations with session names (after reconciliation, so
        # the response reflects the corrected statuses)
        enriched_generations = [_enrich_generation_with_session_name(g, dialog_service) for g in generations]

        return jsonify({
            'generations': enriched_generations,
            'count': len(enriched_generations)
        }), 200

    except Exception as e:
        logger.error(f"Error occurred while retrieving generations: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

@api_bp.route('/projects/<project_id>/generations/<request_id>/download', methods=['GET'])
def download_generation_audio(project_id: str, request_id: str):
    """
    Download or stream the generated audio file for a specific generation

    Query parameters:
        - download: If set to 'true', force download. Otherwise, serve inline for playback.

    Args:
        project_id: Project identifier
        request_id: Generation request identifier
    Returns:
        Audio file (inline or as attachment) or error message
    """
    try:
        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        # Create the voice generation service for the project's output directory
        service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

        generations = service.list_generations()
        generation = next((g for g in generations if g.request_id == request_id), None)
        if not generation:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_not_found')
            }), 404

        audio_file_path = project_path / 'output' / generation.output_filename
        if not audio_file_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_not_found')
            }), 404

        # Check if download parameter is set to true
        force_download = request.args.get('download', 'false').lower() == 'true'

        # Serve inline by default (for playback), or as attachment if requested
        return send_file(
            str(audio_file_path),
            mimetype='audio/wav',
            as_attachment=force_download,
            download_name=generation.output_filename if force_download else None
        )

    except Exception as e:
        logger.error(f"Error occurred while downloading generated audio: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

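# Example playback vs. download usage (illustrative; the /api/v1 mount point is
# an assumption based on how this blueprint appears to be registered):
#
#   GET .../generations/<request_id>/download
#       -> serves audio/wav inline, suitable for an <audio> element
#   GET .../generations/<request_id>/download?download=true
#       -> adds Content-Disposition: attachment so the browser saves the file
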
@api_bp.route('/projects/<project_id>/generations/<request_id>/items/<int:item_index>/download', methods=['GET'])
def download_generation_item_audio(project_id: str, request_id: str, item_index: int):
    """
    Download or stream an individual audio file from a multi-generation batch

    Query parameters:
        - download: If set to 'true', force download. Otherwise, serve inline for playback.

    Args:
        project_id: Project identifier
        request_id: Generation request identifier
        item_index: Index of the generation item (0-based)
    Returns:
        Audio file (inline or as attachment) or error message
    """
    try:
        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # First, check if this is the current active generation (in memory)
        generation = None
        task: Task = gm.get_current_task()
        if task and isinstance(task.unwrap(), InferenceBase):
            inference: InferenceBase = task.unwrap()
            current_gen = inference.get_generation()
            if current_gen and current_gen.request_id == request_id and current_gen.project_id == project_id:
                generation = current_gen

        # If not found in current task, look in saved generations
        if not generation:
            speaker_service = SpeakerService(project_path / 'voices')
            dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
            service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

            generations = service.list_generations()
            generation = next((g for g in generations if g.request_id == request_id), None)

        if not generation:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_not_found')
            }), 404

        # Check if this is a multi-generation with items
        # Handle both dict and GenerationDetails object
        details = generation.details
        if details is None:
            generation_items = []
        elif isinstance(details, dict):
            generation_items = details.get('generation_items', [])
        else:
            generation_items = getattr(details, 'generation_items', [])

        if not generation_items:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_item_not_found')
            }), 404

        # Validate item index
        if item_index < 0 or item_index >= len(generation_items):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_item_not_found')
            }), 404

        item = generation_items[item_index]
        # Handle both dict and GenerationItem object
        audio_path = item.get('audio_path') if isinstance(item, dict) else item.audio_path
        audio_file_path = project_path / 'output' / audio_path
        if not audio_file_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_item_not_found')
            }), 404

        # Check if download parameter is set to true
        force_download = request.args.get('download', 'false').lower() == 'true'

        # Get the filename from the path
        filename = audio_file_path.name

        # Serve inline by default (for playback), or as attachment if requested
        return send_file(
            str(audio_file_path),
            mimetype='audio/wav',
            as_attachment=force_download,
            download_name=filename if force_download else None
        )

    except Exception as e:
        logger.error(f"Error occurred while downloading generation item audio: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/generations/<request_id>', methods=['GET'])
def get_generation(project_id: str, request_id: str):
    """
    Get a specific generation by request ID

    Args:
        project_id: Project identifier
        request_id: Generation request identifier
    Returns:
        JSON response with generation data
    """
    try:
        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

        # Get all generations and find the one with matching request_id
        generations = service.list_generations()
        generation = next((g for g in generations if g.request_id == request_id), None)

        if not generation:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_not_found')
            }), 404

        return jsonify({
            'generation': _enrich_generation_with_session_name(generation, dialog_service)
        }), 200

    except Exception as e:
        logger.error(f"Error occurred while retrieving generation: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

@api_bp.route('/projects/<project_id>/generations/<request_id>', methods=['DELETE'])
def delete_generation(project_id: str, request_id: str):
    """
    Delete a specific generation and its audio file

    Args:
        project_id: Project identifier
        request_id: Generation request identifier
    Returns:
        JSON response with deletion status
    """
    try:
        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

        # Delete the generation
        success = service.delete_generation(request_id)
        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.generation_not_found')
            }), 404

        return jsonify({
            'message': t('success.generation_deleted'),
            'request_id': request_id
        }), 200

    except Exception as e:
        logger.error(f"Error occurred while deleting generation: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

@api_bp.route('/projects/<project_id>/generations/batch-delete', methods=['POST'])
def batch_delete_generations(project_id: str):
    """
    Delete multiple generations and their audio files

    Request body:
        {
            "request_ids": ["id1", "id2", ...]
        }

    Args:
        project_id: Project identifier
    Returns:
        JSON response with batch deletion results
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        request_ids = data.get('request_ids', [])
        if not request_ids or not isinstance(request_ids, list):
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.validation_error')
            }), 400

        # Get project service to find project directory
        project_service = ProjectService(workspace_dir=current_app.config['WORKSPACE_DIR'],
                                         meta_file_name=current_app.config['PROJECTS_META_FILE'])

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get speaker service for validation
        speaker_service = SpeakerService(project_path / 'voices')
        dialog_service = DialogSessionService(project_path / 'scripts', speaker_service=speaker_service)
        service = VoiceGenerationService(project_path / 'output', speaker_service=speaker_service, dialog_service=dialog_service)

        # Delete generations in batch
        result = service.delete_generations_batch(request_ids)

        return jsonify({
            'message': t('success.generations_deleted'),
            'deleted_count': result['deleted_count'],
            'failed_count': result['failed_count'],
            'deleted_ids': result['deleted_ids'],
            'failed_ids': result['failed_ids']
        }), 200

    except Exception as e:
        logger.error(f"Error occurred while batch deleting generations: {e}")
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

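# Example batch-delete exchange (illustrative sketch; ids are placeholders):
#
#   POST .../projects/<project_id>/generations/batch-delete
#   {"request_ids": ["id1", "id2"]}
#
# responds 200 with per-id results, e.g.
#   {"deleted_count": 1, "failed_count": 1,
#    "deleted_ids": ["id1"], "failed_ids": ["id2"]}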

================================================
FILE: backend/api/openai_compat.py
================================================
"""
OpenAI-Compatible TTS API endpoint

Implements POST /v1/audio/speech for drop-in compatibility with OpenAI TTS clients.
This uses a separate blueprint registered at /v1 (not /api/v1) to match the OpenAI URL scheme.
"""
from flask import Blueprint, request, jsonify, send_file, current_app

from backend.services.openai_compat_service import (
    OpenAICompatService, MODEL_MAPPING, FORMAT_MIME_TYPES,
)
from util.logger import get_logger

logger = get_logger(__name__)

# Separate blueprint for OpenAI-compatible routes (mounted at /v1)
openai_bp = Blueprint('openai_compat', __name__)


def _openai_error(message: str, error_type: str = "invalid_request_error",
                  code: str = None, status: int = 400) -> tuple:
    """Return an OpenAI-style error response."""
    body = {
        "error": {
            "message": message,
            "type": error_type,
        }
    }
    if code:
        body["error"]["code"] = code
    return jsonify(body), status


def _get_service() -> OpenAICompatService:
    """Get OpenAICompatService instance from app config."""
    return OpenAICompatService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        preset_dir=current_app.config['PRESET_VOICE_DIR'],
        fake_model=current_app.config.get('FAKE_MODEL', False),
    )


@openai_bp.route('/audio/speech', methods=['POST'])
def create_speech():
    """
    OpenAI-compatible TTS endpoint.

    Request (application/json):
        {
            "model": "vibevoice-7b",         // Required
            "input": "Hello world",           // Required, max 4096 chars
            "voice": "Alice",                 // Required, preset voice name
            "response_format": "wav",         // Optional, default: wav (supports: wav, mp3, flac, opus, aac, pcm)
            "speed": 1.0                      // Optional, accepted but ignored
        }

    Response:
        Binary audio data with appropriate Content-Type header.
    """
    service = _get_service()

    # --- Authentication ---
    auth_header = request.headers.get('Authorization')
    if not service.validate_api_key(auth_header):
        return _openai_error(
            "Invalid API key provided.",
            error_type="authentication_error",
            code="invalid_api_key",
            status=401,
        )

    # --- Parse JSON body ---
    if not request.is_json:
        return _openai_error("Request body must be JSON (Content-Type: application/json).")

    data = request.get_json(silent=True)
    if not data:
        return _openai_error("Invalid JSON in request body.")

    # --- Validate required fields ---
    model = data.get('model')
    if not model:
        return _openai_error("Missing required parameter: 'model'.", code="missing_model")

    input_text = data.get('input')
    if not input_text:
        return _openai_error("Missing required parameter: 'input'.", code="missing_input")

    if len(input_text) > 4096:
        return _openai_error(
            f"Input text is too long ({len(input_text)} chars). Maximum is 4096 characters.",
            code="input_too_long",
        )

    voice = data.get('voice')
    if not voice:
        return _openai_error("Missing required parameter: 'voice'.", code="missing_voice")

    # --- Validate optional fields ---
    response_format = data.get('response_format', 'wav')
    if response_format not in FORMAT_MIME_TYPES:
        supported = ', '.join(sorted(FORMAT_MIME_TYPES.keys()))
        return _openai_error(
            f"Unsupported response_format '{response_format}'. Supported formats: {supported}",
            code="unsupported_format",
        )

    # speed is accepted but ignored (not supported by engine)
    # data.get('speed', 1.0)

    # --- Resolve model (fallback to bf16 if unknown) ---
    model_dtype, err = service.resolve_model(model)
    if err:
        logger.warning(f"Unknown model '{model}', falling back to bf16")
        model_dtype = 'bf16'

    # --- Resolve voice ---
    voice_filename, err = service.resolve_voice(voice)
    if err:
        return _openai_error(err, code="voice_not_found")

    # --- Generate speech ---
    try:
        audio_path, err, status_code = service.generate_speech(
            text=input_text,
            voice_filename=voice_filename,
            model_dtype=model_dtype,
            response_format=response_format,
        )
    except Exception as e:
        logger.error(f"Unexpected error in speech generation: {e}", exc_info=True)
        return _openai_error(
            "An internal error occurred during speech generation.",
            error_type="server_error",
            status=500,
        )

    if err:
        error_type = "server_error" if status_code >= 500 else "invalid_request_error"
        return _openai_error(err, error_type=error_type, status=status_code)

    # --- Return audio ---
    mime_type = FORMAT_MIME_TYPES[response_format]
    # Map format to download extension (opus→ogg, pcm→raw)
    ext_map = {'opus': 'ogg', 'pcm': 'raw'}
    ext = ext_map.get(response_format, response_format)
    return send_file(
        str(audio_path),
        mimetype=mime_type,
        as_attachment=False,
        download_name=f"speech.{ext}",
    )

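# Example client call (illustrative sketch; host/port, API key, and the "Alice"
# preset are assumptions -- any OpenAI TTS client pointed at this server's /v1
# base URL should behave the same way):
#
#   import requests
#   resp = requests.post(
#       'http://localhost:5000/v1/audio/speech',
#       headers={'Authorization': 'Bearer <api-key>'},
#       json={'model': 'vibevoice-7b', 'input': 'Hello world', 'voice': 'Alice'},
#   )
#   with open('speech.wav', 'wb') as f:
#       f.write(resp.content)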

@openai_bp.route('/models', methods=['GET'])
def list_models():
    """
    List available models (OpenAI-compatible format).

    Returns a list of model objects matching OpenAI's /v1/models response format.
    """
    # MODEL_MAPPING keys are already unique, so no dedup bookkeeping is needed
    models = [
        {
            "id": model_name,
            "object": "model",
            "created": 0,
            "owned_by": "vibevoice",
        }
        for model_name in sorted(MODEL_MAPPING.keys())
    ]

    return jsonify({
        "object": "list",
        "data": models,
    })

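# Example response shape (illustrative; the actual ids come from MODEL_MAPPING):
#
#   {"object": "list",
#    "data": [{"id": "vibevoice-7b", "object": "model",
#              "created": 0, "owned_by": "vibevoice"}]}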

================================================
FILE: backend/api/preset_voices.py
================================================
"""
Preset voices API endpoints

Filename convention: {language}-{name}_{gender}[_bgm].wav
The filename serves as the unique identifier for each preset.
"""
from flask import request, jsonify, current_app, send_file
from backend.api import api_bp
from backend.services.preset_voice_service import PresetVoiceService
from backend.i18n import t, get_locale


def get_preset_service() -> PresetVoiceService:
    """Get PresetVoiceService instance"""
    return PresetVoiceService(current_app.config['PRESET_VOICE_DIR'])


@api_bp.route('/preset-voices', methods=['GET'])
def list_preset_voices():
    """
    List all preset voices with optional filtering and pagination

    Query Parameters:
        language: Filter by language code (en, zh, in)
        gender: Filter by gender (man, woman)
        has_bgm: Filter by BGM presence (true, false)
        offset: Number of items to skip (default: 0)
        limit: Maximum items per page (optional)

    Returns:
        JSON response with list of preset voices and pagination info
    """
    try:
        service = get_preset_service()
        locale = get_locale()

        # Parse query parameters
        language = request.args.get('language')
        gender = request.args.get('gender')
        has_bgm_param = request.args.get('has_bgm')
        has_bgm = None
        if has_bgm_param is not None:
            has_bgm = has_bgm_param.lower() == 'true'

        offset = request.args.get('offset', 0, type=int)
        limit = request.args.get('limit', type=int)

        presets, total = service.list_presets(
            language=language,
            gender=gender,
            has_bgm=has_bgm,
            offset=offset,
            limit=limit,
            locale=locale
        )

        return jsonify({
            'presets': [p.to_dict() for p in presets],
            'count': len(presets),
            'total': total,
            'offset': offset,
            'limit': limit
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

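# Example filtered listing (illustrative; the /api/v1 mount point is an
# assumption):
#
#   GET /api/v1/preset-voices?language=en&gender=woman&has_bgm=false&limit=10
#
# returns the first 10 English female presets without BGM, together with the
# pagination fields (count, total, offset, limit) built above.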

@api_bp.route('/preset-voices', methods=['POST'])
def add_preset_voice():
    """
    Add a new preset voice

    Form data:
        name: Voice name - letters only, will be capitalized (required)
        language: Language code - 'en', 'zh', or 'in' (required)
        gender: Gender - 'man' or 'woman' (required)
        has_bgm: Whether has background music - 'true' or 'false' (default: false)
        voice_file: Audio file (required)

    The file will be saved as: {language}-{name}_{gender}[_bgm].wav

    Returns:
        JSON response with created preset voice
    """
    try:
        service = get_preset_service()

        # Get form data
        name = request.form.get('name', '').strip()
        language = request.form.get('language', '').strip()
        gender = request.form.get('gender', '').strip()
        has_bgm_param = request.form.get('has_bgm', 'false')
        has_bgm = has_bgm_param.lower() == 'true'
        voice_file = request.files.get('voice_file')

        if not name:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.preset_name_required')
            }), 400

        if not language:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.preset_language_required')
            }), 400

        if not gender:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': t('errors.preset_gender_required')
            }), 400

        if not voice_file:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.file_upload_error')
            }), 400

        preset = service.add_preset(name, language, gender, has_bgm, voice_file)

        return jsonify(preset.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500

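# Example multipart upload (illustrative sketch; the file path, host, and field
# values are placeholders):
#
#   import requests
#   with open('alice_sample.wav', 'rb') as f:
#       resp = requests.post(
#           'http://localhost:5000/api/v1/preset-voices',
#           data={'name': 'Alice', 'language': 'en',
#                 'gender': 'woman', 'has_bgm': 'false'},
#           files={'voice_file': f},
#       )
#   # 201 -> stored as "en-Alice_woman.wav"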

@api_bp.route('/preset-voices/<path:filename>', methods=['GET'])
def get_preset_voice(filename):
    """
    Get preset voice by filename

    Args:
        filename: Preset filename (e.g., "en-Alice_woman.wav")

    Returns:
        JSON response with preset voice data
    """
    try:
        service = get_preset_service()
        locale = get_locale()
        preset = service.get_preset(filename, locale)

        if not preset:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.preset_voice_not_found')
            }), 404

        return jsonify(preset.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/preset-voices/<path:filename>', methods=['DELETE'])
def delete_preset_voice(filename):
    """
    Delete preset voice

    Args:
        filename: Preset filename (e.g., "en-Alice_woman.wav")

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_preset_service()

        success = service.delete_preset(filename)
        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.preset_voice_not_found')
            }), 404

        return jsonify({
            'message': t('success.preset_voice_deleted'),
            'filename': filename
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/preset-voices/batch-delete', methods=['POST'])
def batch_delete_preset_voices():
    """
    Delete multiple preset voices

    Request body (JSON):
        filenames: List of preset filenames to delete

    Returns:
        JSON response with deletion results
    """
    try:
        service = get_preset_service()

        data = request.get_json()
        if not data or 'filenames' not in data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'filenames is required'
            }), 400

        filenames = data.get('filenames', [])
        if not isinstance(filenames, list):
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'filenames must be a list'
            }), 400

        deleted, failed = service.batch_delete_presets(filenames)

        return jsonify({
            'message': t('success.preset_voices_deleted'),
            'deleted_count': len(deleted),
            'failed_count': len(failed),
            'deleted': deleted,
            'failed': failed
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/preset-voices/languages', methods=['GET'])
def list_preset_languages():
    """Get available languages for preset voices"""
    try:
        service = get_preset_service()
        locale = get_locale()
        languages = service.get_available_languages(locale=locale)
        return jsonify({
            'languages': languages
        }), 200
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/preset-voices/<path:filename>/preview', methods=['GET'])
def preview_preset_voice(filename):
    """
    Get audio file for preview/playback

    Args:
        filename: Preset filename (e.g., "en-Alice_woman.wav")

    Returns:
        Audio file for streaming
    """
    try:
        service = get_preset_service()
        file_path = service.get_preset_path(filename)

        if not file_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.preset_voice_not_found')
            }), 404

        return send_file(file_path, mimetype='audio/wav')

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


================================================
FILE: backend/api/projects.py
================================================
"""
Projects API endpoints
"""
import re
from flask import request, jsonify, current_app
from backend.api import api_bp
from backend.services.project_service import ProjectService
from backend.i18n import t


def get_project_service() -> ProjectService:
    """Get ProjectService instance with current app config"""
    return ProjectService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        meta_file_name=current_app.config['PROJECTS_META_FILE']
    )


def validate_project_name(name: str) -> tuple[bool, str]:
    """
    Validate project name according to rules:
    - Must start with an alphabet character (a-z, A-Z)
    - Can include: alphabet, numbers, underscore (_), hyphen (-), and space
    - Spaces can only appear in the middle (not at start or end)

    Args:
        name: Project name to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    if not name:
        return False, t('errors.project_name_required')

    # Check if name starts or ends with space
    if name.startswith(' ') or name.endswith(' '):
        return False, t('errors.project_name_invalid_chars')

    # Check if first character is an alphabet
    if not name[0].isalpha():
        return False, t('errors.invalid_project_name')

    # Check if all characters are valid (alphabet, number, _, -, or space)
    # Pattern: starts with letter, followed by any combination of letters, numbers, _, -, or spaces
    pattern = r'^[a-zA-Z][a-zA-Z0-9_\- ]*$'
    if not re.match(pattern, name):
        return False, t('errors.project_name_invalid_chars')

    return True, ""

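# Illustrative examples of the rules above:
#
#   validate_project_name('My Project-2')   # -> (True, "")
#   validate_project_name('2nd Project')    # -> invalid: must start with a letter
#   validate_project_name(' padded ')       # -> invalid: leading/trailing space
#   validate_project_name('a/b')            # -> invalid: '/' is not an allowed character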

@api_bp.route('/projects', methods=['GET'])
def list_projects():
    """
    List all projects

    Returns:
        JSON response with list of projects
    """
    try:
        service = get_project_service()
        projects = service.list_projects()

        return jsonify({
            'projects': [p.to_dict() for p in projects],
            'count': len(projects)
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>', methods=['GET'])
def get_project(project_id):
    """
    Get project by ID

    Args:
        project_id: Project identifier

    Returns:
        JSON response with project data
    """
    try:
        service = get_project_service()
        project = service.get_project(project_id)

        if not project:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        return jsonify(project.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects', methods=['POST'])
def create_project():
    """
    Create a new project

    Request body:
        {
            "name": "Project Name",
            "description": "Optional description"
        }

    Returns:
        JSON response with created project data
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name')
        if not name:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.project_name_required')
            }), 400

        # Validate project name
        is_valid, error_message = validate_project_name(name)
        if not is_valid:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': error_message
            }), 400

        description = data.get('description', '')

        service = get_project_service()
        project = service.create_project(name, description)

        return jsonify(project.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>', methods=['PUT'])
def update_project(project_id):
    """
    Update project metadata

    Args:
        project_id: Project identifier

    Request body:
        {
            "name": "Updated Name",
            "description": "Updated description"
        }

    Returns:
        JSON response with updated project data
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        name = data.get('name')
        description = data.get('description')

        # Validate project name if provided
        if name is not None:
            is_valid, error_message = validate_project_name(name)
            if not is_valid:
                return jsonify({
                    'error': t('errors.validation_error'),
                    'message': error_message
                }), 400

        service = get_project_service()
        project = service.update_project(project_id, name, description)

        if not project:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        return jsonify(project.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>', methods=['DELETE'])
def delete_project(project_id):
    """
    Delete project and its directory

    Args:
        project_id: Project identifier

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_project_service()
        success = service.delete_project(project_id)

        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        return jsonify({
            'message': t('success.project_deleted'),
            'project_id': project_id
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


================================================
FILE: backend/api/quick_generate.py
================================================
"""
Quick Generate API endpoints
"""
import json
import random
from typing import Optional
from flask import request, jsonify, current_app, send_file
from werkzeug.utils import secure_filename

from backend.api import api_bp
from backend.services.quick_generate_service import QuickGenerateService
from backend.inference.quick_generate_inference import QuickGenerateInferenceBase
from backend.task_manager.task import gm, Task
from backend.i18n import t
from config.configuration_vibevoice import InferencePhase
from util.logger import get_logger

logger = get_logger(__name__)

# Allowed audio file extensions
ALLOWED_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac', 'webm', 'ogg'}


def _allowed_file(filename: str) -> bool:
    """Check if file has an allowed extension"""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def _get_quick_generate_service() -> QuickGenerateService:
    """Get QuickGenerateService instance"""
    return QuickGenerateService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        fake_model=current_app.config.get('FAKE_MODEL', False)
    )


def _validate_offloading_config(offloading: Optional[dict]) -> Optional[dict]:
    """
    Validate offloading configuration from request.

    Args:
        offloading: Offloading config dict from request (can be None)

    Returns:
        Validated offloading config dict or None if disabled/not provided

    Raises:
        ValueError: If config is invalid
    """
    if not offloading or not offloading.get('enabled', False):
        return None

    mode = offloading.get('mode', 'preset')

    if mode not in ['preset', 'manual']:
        raise ValueError(f"Invalid offloading mode: '{mode}'. Must be 'preset' or 'manual'")

    if mode == 'preset':
        preset = offloading.get('preset', 'balanced')
        valid_presets = ['balanced', 'aggressive', 'extreme']
        if preset not in valid_presets:
            raise ValueError(f"Invalid preset: '{preset}'. Must be one of: {', '.join(valid_presets)}")

        return {
            'enabled': True,
            'mode': 'preset',
            'preset': preset
        }

    elif mode == 'manual':
        num_gpu_layers = offloading.get('num_gpu_layers')
        if num_gpu_layers is None:
            raise ValueError("num_gpu_layers is required for manual mode")

        if not isinstance(num_gpu_layers, int) or num_gpu_layers < 1 or num_gpu_layers > 28:
            raise ValueError(f"num_gpu_layers must be an integer between 1 and 28, got: {num_gpu_layers}")

        return {
            'enabled': True,
            'mode': 'manual',
            'num_gpu_layers': num_gpu_layers
        }


@api_bp.route('/quick-generate', methods=['POST'])
def start_quick_generation():
    """
    Start a new quick generation task.

    Form Data:
        - voice_file_0 ... voice_file_3: Audio files (at least one required, max 4)
        - text: Text to generate (required)
        - seeds: Random seed (optional, default: random)
        - batch_size: Number of generations 1-20 (optional, default: 1)
        - cfg_scale: CFG scale (optional, default: 1.3)
        - model_dtype: Model dtype (optional, default: "bf16")
        - offloading: JSON string of offloading config (optional)

    Returns:
        JSON with request_id, detected_mode, and status
    """
    try:
        # Collect voice files (support up to 4)
        voice_files = []
        for i in range(4):
            key = f'voice_file_{i}'
            if key in request.files:
                voice_file = request.files[key]
                if voice_file.filename:
                    if not _allowed_file(voice_file.filename):
                        return jsonify({
                            'error': t('errors.bad_request'),
                            'message': t('errors.invalid_file_type', formats=', '.join(ALLOWED_EXTENSIONS))
                        }), 400
                    voice_files.append(voice_file)

        # Check for at least one voice file
        if not voice_files:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.quick_generate_voice_required')
            }), 400

        # Get text
        text = request.form.get('text', '').strip()
        if not text:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.quick_generate_text_required')
            }), 400

        # Get optional parameters (seed defaults to a random value)
        seeds = request.form.get('seeds')
        if seeds:
            try:
                seeds = int(seeds)
            except ValueError:
                seeds = random.randint(0, 2**64 - 1)
        else:
            seeds = random.randint(0, 2**64 - 1)

        batch_size = request.form.get('batch_size', '1')
        try:
            batch_size = int(batch_size)
            batch_size = max(1, min(20, batch_size))  # Clamp to 1-20
        except ValueError:
            batch_size = 1

        cfg_scale = request.form.get('cfg_scale', '1.3')
        try:
            cfg_scale = float(cfg_scale)
        except ValueError:
            cfg_scale = 1.3

        model_dtype = request.form.get('model_dtype', 'bf16')
        attn_implementation = request.form.get('attn_implementation', 'sdpa')

        # Parse offloading config
        offloading_str = request.form.get('offloading', '')
        validated_offloading = None
        if offloading_str:
            try:
                offloading = json.loads(offloading_str)
                validated_offloading = _validate_offloading_config(offloading)
            except (json.JSONDecodeError, ValueError) as e:
                return jsonify({
                    'error': t('errors.validation_error'),
                    'message': str(e)
                }), 400

        # Get service
        service = _get_quick_generate_service()

        # Save voice files
        voice_filenames = []
        for voice_file in voice_files:
            voice_data = voice_file.read()
            voice_filename = service.save_voice_file(voice_data, voice_file.filename)
            voice_filenames.append(voice_filename)

        # Start generation
        quick_gen = service.start_generation(
            voice_files=voice_filenames,
            text=text,
            seeds=seeds,
            batch_size=batch_size,
            cfg_scale=cfg_scale,
            model_dtype=model_dtype,
            attn_implementation=attn_implementation,
            offloading_config=validated_offloading
        )

        if not quick_gen:
            return jsonify({
                'error': t('errors.conflict'),
                'message': t('errors.task_manager_busy')
            }), 409

        return jsonify({
            'message': t('success.quick_generate_started'),
            'request_id': quick_gen.request_id,
            'detected_mode': quick_gen.detected_mode,
            'status': quick_gen.status.value if isinstance(quick_gen.status, InferencePhase) else quick_gen.status
        }), 200

    except Exception as e:
        logger.error(f"Error starting quick generation: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
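
# ---------------------------------------------------------------------------
# A minimal client sketch for POST /quick-generate, assuming the backend
# listens on http://localhost:5000 with the blueprint mounted under /api.
# The multipart field name for the voice file is an assumption (the upload
# loop above is truncated in this extract); the form fields text, seeds,
# batch_size, cfg_scale, model_dtype, attn_implementation, and offloading
# are the ones parsed by this handler.
#
#     import requests
#
#     resp = requests.post(
#         "http://localhost:5000/api/quick-generate",
#         files={"voice_file": open("reference.wav", "rb")},  # field name assumed
#         data={"text": "Hello world.", "batch_size": "2", "cfg_scale": "1.3"},
#     )
#     resp.raise_for_status()
#     body = resp.json()
#     print(body["request_id"], body["detected_mode"])  # from the 200 response above
# ---------------------------------------------------------------------------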


@api_bp.route('/quick-generate/<request_id>', methods=['GET'])
def get_quick_generation(request_id: str):
    """
    Get status of a quick generation request.

    Args:
        request_id: Generation request ID

    Returns:
        JSON with generation status and details
    """
    try:
        # First check if it's the current running task
        task: Task = gm.get_current_task()
        if task:
            inference = task.unwrap()
            if isinstance(inference, QuickGenerateInferenceBase):
                current_gen = inference.get_quick_generate()
                if current_gen and current_gen.request_id == request_id:
                    gen_dict = current_gen.to_dict()
                    return jsonify(gen_dict), 200

        # Check history
        service = _get_quick_generate_service()
        quick_gen = service.get_generation(request_id)

        if not quick_gen:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_not_found')
            }), 404

        return jsonify(quick_gen.to_dict()), 200

    except Exception as e:
        logger.error(f"Error getting quick generation: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
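
# ---------------------------------------------------------------------------
# A minimal polling sketch against this status endpoint, assuming the same
# base URL as above. The handler returns QuickGenerate.to_dict() verbatim, so
# the exact terminal status strings ("completed"/"failed") are an assumption.
#
#     import time
#     import requests
#
#     def wait_for_quick_generation(request_id, base="http://localhost:5000/api"):
#         while True:
#             gen = requests.get(f"{base}/quick-generate/{request_id}").json()
#             if gen.get("status") in ("completed", "failed"):  # assumed terminal states
#                 return gen
#             time.sleep(2)  # poll every two seconds
# ---------------------------------------------------------------------------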


@api_bp.route('/quick-generate/current', methods=['GET'])
def get_current_quick_generation():
    """
    Get the current running quick generation task.

    Returns:
        JSON with current quick generation status or null if none running
    """
    try:
        task: Task = gm.get_current_task()
        if task:
            inference = task.unwrap()
            if isinstance(inference, QuickGenerateInferenceBase):
                current_gen = inference.get_quick_generate()
                if current_gen:
                    return jsonify({
                        'message': 'Current quick generation retrieved successfully',
                        'generation': current_gen.to_dict()
                    }), 200

        return jsonify({
            'message': 'No active quick generation task',
            'generation': None
        }), 200

    except Exception as e:
        logger.error(f"Error getting current quick generation: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/quick-generate/history', methods=['GET'])
def list_quick_generation_history():
    """
    List quick generation history with pagination.

    Query params:
        - limit: Maximum items (default: 20)
        - offset: Items to skip (default: 0)

    Returns:
        JSON with generations list, count, and total
    """
    try:
        limit = request.args.get('limit', '20')
        offset = request.args.get('offset', '0')

        try:
            limit = int(limit)
            offset = int(offset)
        except ValueError:
            limit = 20
            offset = 0

        service = _get_quick_generate_service()
        result = service.list_history(limit=limit, offset=offset)

        return jsonify(result), 200

    except Exception as e:
        logger.error(f"Error listing quick generation history: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
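
# ---------------------------------------------------------------------------
# A minimal pagination sketch over the history endpoint, assuming the same
# base URL as above; limit and offset are the query params parsed by this
# handler, and the response shape (generations, count, total) follows the
# docstring.
#
#     import requests
#
#     page = requests.get(
#         "http://localhost:5000/api/quick-generate/history",
#         params={"limit": 10, "offset": 0},
#     ).json()
#     print(page.get("total"), len(page.get("generations", [])))
# ---------------------------------------------------------------------------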


@api_bp.route('/quick-generate/<request_id>/download', methods=['GET'])
def download_quick_generation_audio(request_id: str):
    """
    Download generated audio file.

    Query params:
        - download: If 'true', force download. Otherwise serve inline.

    Args:
        request_id: Generation request ID

    Returns:
        Audio file or error message
    """
    try:
        service = _get_quick_generate_service()
        audio_path = service.get_audio_path(request_id, item_index=0)

        if not audio_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_not_found')
            }), 404

        force_download = request.args.get('download', 'false').lower() == 'true'

        return send_file(
            str(audio_path),
            mimetype='audio/wav',
            as_attachment=force_download,
            download_name=audio_path.name if force_download else None
        )

    except Exception as e:
        logger.error(f"Error downloading quick generation audio: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/quick-generate/<request_id>/items/<int:item_index>/download', methods=['GET'])
def download_quick_generation_item_audio(request_id: str, item_index: int):
    """
    Download individual audio file from multi-generation batch.

    Query params:
        - download: If 'true', force download. Otherwise serve inline.

    Args:
        request_id: Generation request ID
        item_index: Index of the generation item (0-based)

    Returns:
        Audio file or error message
    """
    try:
        service = _get_quick_generate_service()
        audio_path = service.get_audio_path(request_id, item_index=item_index)

        if not audio_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_item_not_found')
            }), 404

        force_download = request.args.get('download', 'false').lower() == 'true'

        return send_file(
            str(audio_path),
            mimetype='audio/wav',
            as_attachment=force_download,
            download_name=audio_path.name if force_download else None
        )

    except Exception as e:
        logger.error(f"Error downloading quick generation item audio: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
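
# ---------------------------------------------------------------------------
# A minimal download sketch, assuming the same base URL as above; passing
# download=true makes the handler serve the file as an attachment, per the
# query-param handling above.
#
#     import requests
#
#     request_id = "<request_id>"  # from the POST /quick-generate response
#     audio = requests.get(
#         f"http://localhost:5000/api/quick-generate/{request_id}/items/0/download",
#         params={"download": "true"},
#     )
#     audio.raise_for_status()
#     with open("generated.wav", "wb") as f:
#         f.write(audio.content)
# ---------------------------------------------------------------------------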


@api_bp.route('/quick-generate/<request_id>', methods=['DELETE'])
def delete_quick_generation(request_id: str):
    """
    Delete a quick generation and its files.

    Args:
        request_id: Generation request ID

    Returns:
        Success message or error
    """
    try:
        service = _get_quick_generate_service()
        success = service.delete_generation(request_id)

        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_not_found')
            }), 404

        return jsonify({
            'message': t('success.quick_generate_deleted'),
            'request_id': request_id
        }), 200

    except Exception as e:
        logger.error(f"Error deleting quick generation: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/quick-generate/<request_id>/voice/preview', methods=['GET'])
def preview_quick_generation_voice(request_id: str):
    """
    Preview the first voice file used in a quick generation.

    Args:
        request_id: Generation request ID

    Returns:
        Voice audio file or error message
    """
    return preview_quick_generation_voice_by_index(request_id, 0)


@api_bp.route('/quick-generate/<request_id>/voice/<int:voice_index>/preview', methods=['GET'])
def preview_quick_generation_voice_by_index(request_id: str, voice_index: int):
    """
    Preview a specific voice file used in a quick generation.

    Args:
        request_id: Generation request ID
        voice_index: Index of the voice file (0-3)

    Returns:
        Voice audio file or error message
    """
    try:
        service = _get_quick_generate_service()
        quick_gen = service.get_generation(request_id)

        if not quick_gen:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_not_found')
            }), 404

        # Check index bounds
        if voice_index < 0 or voice_index >= len(quick_gen.voice_files):
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_voice_not_found')
            }), 404

        voice_path = service.get_voice_path(quick_gen.voice_files[voice_index])
        if not voice_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.quick_generate_voice_not_found')
            }), 404

        return send_file(
            str(voice_path),
            mimetype='audio/wav'
        )

    except Exception as e:
        logger.error(f"Error previewing quick generation voice: {e}", exc_info=True)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


================================================
FILE: backend/api/speakers.py
================================================
"""
Speakers API endpoints
"""
from typing import Optional

from flask import request, jsonify, current_app, send_file
from backend.api import api_bp
from backend.services.speaker_service import SpeakerService
from backend.services.project_service import ProjectService
from backend.services.preset_voice_service import PresetVoiceService
from backend.i18n import t


def get_speaker_service(project_id: str) -> Optional[SpeakerService]:
    """Get SpeakerService instance for a project, or None if the project does not exist"""
    # Get project service to find project directory
    project_service = ProjectService(
        workspace_dir=current_app.config['WORKSPACE_DIR'],
        meta_file_name=current_app.config['PROJECTS_META_FILE']
    )

    # Get project path
    project_path = project_service.get_project_path(project_id)
    if not project_path:
        return None

    # Return speaker service for project's voices directory
    return SpeakerService(project_path / 'voices')


@api_bp.route('/projects/<project_id>/speakers', methods=['GET'])
def list_speakers(project_id):
    """
    List all speaker roles for a project

    Args:
        project_id: Project identifier

    Returns:
        JSON response with list of speakers
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        speakers = service.list_speakers()

        return jsonify({
            'speakers': [s.to_dict() for s in speakers],
            'count': len(speakers)
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>', methods=['GET'])
def get_speaker(project_id, speaker_id):
    """
    Get speaker role by ID

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier (e.g., "Speaker 1")

    Returns:
        JSON response with speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        speaker = service.get_speaker(speaker_id)
        if not speaker:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return jsonify(speaker.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers', methods=['POST'])
def add_speaker(project_id):
    """
    Add a new speaker role with voice file

    Args:
        project_id: Project identifier

    Form data:
        description: Speaker description (optional)
        voice_file: Audio file (required)

    Returns:
        JSON response with created speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Get form data
        description = request.form.get('description', '')
        voice_file = request.files.get('voice_file')

        if not voice_file:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.file_upload_error')
            }), 400

        speaker = service.add_speaker(description, voice_file)

        return jsonify(speaker.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
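
# ---------------------------------------------------------------------------
# A minimal upload sketch, assuming the backend at http://localhost:5000/api.
# The multipart field names ("voice_file", "description") come from this
# handler; the project_id placeholder is illustrative.
#
#     import requests
#
#     project_id = "<project_id>"
#     resp = requests.post(
#         f"http://localhost:5000/api/projects/{project_id}/speakers",
#         files={"voice_file": open("narrator.wav", "rb")},
#         data={"description": "Calm narrator voice"},
#     )
#     print(resp.status_code, resp.json())  # 201 with the created speaker
# ---------------------------------------------------------------------------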


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>', methods=['PUT'])
def update_speaker(project_id, speaker_id):
    """
    Update speaker role metadata

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier

    Request body:
        {
            "description": "Updated description"
        }

    Returns:
        JSON response with updated speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        description = data.get('description')

        speaker = service.update_speaker(speaker_id, description)
        if not speaker:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return jsonify(speaker.to_dict()), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>', methods=['DELETE'])
def delete_speaker(project_id, speaker_id):
    """
    Delete speaker role and its voice file

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier

    Returns:
        JSON response confirming deletion
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        success = service.delete_speaker(speaker_id)
        if not success:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return jsonify({
            'message': t('success.speaker_deleted'),
            'speaker_id': speaker_id
        }), 200

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>/voice', methods=['GET'])
def download_voice_file(project_id, speaker_id):
    """
    Download speaker's voice file

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier

    Returns:
        Audio file or error
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        voice_file_path = service.get_voice_file_path(speaker_id)
        if not voice_file_path or not voice_file_path.exists():
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return send_file(voice_file_path, as_attachment=True)

    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>/voice', methods=['PUT'])
def update_voice_file(project_id, speaker_id):
    """
    Update speaker's voice file without changing speaker ID

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier

    Form data:
        voice_file: New audio file (required)

    Returns:
        JSON response with updated speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        voice_file = request.files.get('voice_file')
        if not voice_file:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.file_upload_error')
            }), 400

        speaker = service.update_voice_file(speaker_id, voice_file)
        if not speaker:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return jsonify(speaker.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/speakers/from-preset', methods=['POST'])
def add_speaker_from_preset(project_id):
    """
    Add a new speaker role from a preset voice

    Args:
        project_id: Project identifier

    Request body (JSON):
        preset_filename: Preset voice filename (e.g., "en-Alice_woman.wav")
        description: Speaker description (optional)

    Returns:
        JSON response with created speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        preset_filename = data.get('preset_filename')
        if not preset_filename:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.preset_filename_required')
            }), 400

        # Get preset file path
        preset_service = PresetVoiceService(current_app.config['PRESET_VOICE_DIR'])
        preset_path = preset_service.get_preset_path(preset_filename)

        if not preset_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.preset_voice_not_found')
            }), 404

        description = data.get('description', '')
        speaker = service.add_speaker_from_preset(description, preset_path)

        return jsonify(speaker.to_dict()), 201

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
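
# ---------------------------------------------------------------------------
# A minimal sketch for creating a speaker from a preset voice, assuming the
# same base URL as above; the JSON keys and the example preset filename come
# from this handler's docstring.
#
#     import requests
#
#     project_id = "<project_id>"
#     resp = requests.post(
#         f"http://localhost:5000/api/projects/{project_id}/speakers/from-preset",
#         json={"preset_filename": "en-Alice_woman.wav", "description": "Alice"},
#     )
#     print(resp.status_code, resp.json())
# ---------------------------------------------------------------------------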


@api_bp.route('/projects/<project_id>/speakers/<speaker_id>/voice/trim', methods=['POST'])
def trim_voice_file(project_id, speaker_id):
    """
    Trim speaker's voice file to a specific time range

    Args:
        project_id: Project identifier
        speaker_id: Speaker identifier

    JSON body:
        start_time: Start time in seconds (required)
        end_time: End time in seconds (required)

    Returns:
        JSON response with updated speaker data
    """
    try:
        service = get_speaker_service(project_id)
        if not service:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'JSON body is required'
            }), 400

        start_time = data.get('start_time')
        end_time = data.get('end_time')

        if start_time is None or end_time is None:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.validation_error')
            }), 400

        # Coerce to float so non-numeric values yield a 400 instead of
        # raising a TypeError (and a 500) in the range check below
        try:
            start_time = float(start_time)
            end_time = float(end_time)
        except (TypeError, ValueError):
            return jsonify({
                'error': t('errors.validation_error'),
                'message': 'start_time and end_time must be numbers'
            }), 400

        if start_time < 0 or end_time <= start_time:
            return jsonify({
                'error': t('errors.validation_error'),
                'message': 'Invalid time range: end_time must be greater than start_time and start_time must be non-negative'
            }), 400

        speaker = service.trim_voice_file(speaker_id, start_time, end_time)
        if not speaker:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.speaker_not_found')
            }), 404

        return jsonify(speaker.to_dict()), 200

    except ValueError as e:
        return jsonify({
            'error': t('errors.validation_error'),
            'message': str(e)
        }), 400
    except Exception as e:
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
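
# ---------------------------------------------------------------------------
# A minimal trim sketch, assuming the same base URL as above; start_time and
# end_time are the JSON keys validated by this handler (seconds, with
# end_time > start_time >= 0).
#
#     import requests
#
#     project_id = "<project_id>"
#     speaker_id = "<speaker_id>"
#     resp = requests.post(
#         f"http://localhost:5000/api/projects/{project_id}/speakers/{speaker_id}/voice/trim",
#         json={"start_time": 0.5, "end_time": 4.0},
#     )
#     print(resp.status_code, resp.json())
# ---------------------------------------------------------------------------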


================================================
FILE: backend/api/tasks.py
================================================
"""
Unified Tasks API endpoints
Provides a single endpoint to check the currently running task (inference, training, or quick generation)
"""
from flask import jsonify, current_app
from backend.api import api_bp
from backend.inference.inference import InferenceBase
from backend.inference.quick_generate_inference import QuickGenerateInferenceBase
from backend.training.engine import BaseTrainingEngine
from backend.task_manager.task import gm, Task
from backend.services.project_service import ProjectService
from backend.services.speaker_service import SpeakerService
from backend.services.dialog_session_service import DialogSessionService
from backend.i18n import t
from util.logger import get_logger

logger = get_logger(__name__)


def _enrich_generation_with_session_name(generation, dialog_service):
    """
    Enrich generation dict with session_name field.
    """
    gen_dict = generation.to_dict()

    try:
        session = dialog_service.get_session(generation.session_id)
        if session:
            gen_dict['session_name'] = session.name
        else:
            gen_dict['session_name'] = t('session.deleted')
    except Exception as e:
        logger.warning(f"Failed to get session name for session_id {generation.session_id}: {e}")
        gen_dict['session_name'] = t('session.deleted')

    return gen_dict


@api_bp.route('/tasks/current', methods=['GET'])
def get_current_task():
    """
    Get the current running task (inference, training, or quick generation)

    Returns:
        200: JSON response with task information
        {
            "message": "...",
            "task": {
                "type": "inference" | "training" | null,
                "project_id": "..." | null,
                "data": { ... task-specific data ... } | null
            }
        }
    """
    task: Task = gm.get_current_task()

    if not task:
        return jsonify({
            'message': 'No active task at the moment',
            'task': {
                'type': None,
                'project_id': None,
                'data': None
            }
        }), 200

    unwrapped = task.unwrap()

    # Check if it's an inference task
    if isinstance(unwrapped, InferenceBase):
        inference: InferenceBase = unwrapped
        generation = inference.get_generation()
        project_id = generation.project_id

        # Try to enrich with session name
        gen_dict = None
        try:
            if project_id:
                project_service = ProjectService(
                    workspace_dir=current_app.config['WORKSPACE_DIR'],
                    meta_file_name=current_app.config['PROJECTS_META_FILE']
                )
                project_path = project_service.get_project_path(project_id)

                if project_path:
                    speaker_service = SpeakerService(project_path / 'voices')
                    dialog_service = DialogSessionService(
                        project_path / 'scripts',
                        speaker_service=speaker_service
                    )
                    gen_dict = _enrich_generation_with_session_name(generation, dialog_service)
                else:
                    gen_dict = generation.to_dict()
                    gen_dict['session_name'] = t('session.deleted')
            else:
                gen_dict = generation.to_dict()
                gen_dict['session_name'] = t('session.unknown')
        except Exception as e:
            logger.warning(f"Failed to enrich generation with session name: {e}")
            gen_dict = generation.to_dict()
            gen_dict['session_name'] = t('session.unknown')

        return jsonify({
            'message': 'Current inference task retrieved successfully',
            'task': {
                'type': 'inference',
                'project_id': project_id,
                'data': gen_dict
            }
        }), 200

    # Check if it's a training task
    if isinstance(unwrapped, BaseTrainingEngine):
        training_engine: BaseTrainingEngine = unwrapped
        state = training_engine.state
        project_id = state.project_id

        state_dict = state.to_dict()
        state_dict['all_lora_files'] = state.get_all_lora_files()

        return jsonify({
            'message': 'Current training task retrieved successfully',
            'task': {
                'type': 'training',
                'project_id': project_id,
                'data': state_dict
            }
        }), 200

    # Check if it's a quick generation task
    if isinstance(unwrapped, QuickGenerateInferenceBase):
        quick_gen_inference: QuickGenerateInferenceBase = unwrapped
        quick_gen = quick_gen_inference.get_quick_generate()

        return jsonify({
            'message': 'Current quick generation task retrieved successfully',
            'task': {
                'type': 'quick_generation',
                'project_id': None,  # Quick generation has no project
                'data': quick_gen.to_dict()
            }
        }), 200

    # Unknown task type
    return jsonify({
        'message': 'Unknown task type running',
        'task': {
            'type': 'unknown',
            'project_id': None,
            'data': None
        }
    }), 200
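
# ---------------------------------------------------------------------------
# A minimal client sketch for dispatching on the unified task endpoint,
# assuming the backend at http://localhost:5000/api; the type values match
# the branches in the handler above.
#
#     import requests
#
#     task = requests.get("http://localhost:5000/api/tasks/current").json()["task"]
#     if task["type"] == "training":
#         print(task["data"].get("all_lora_files"))
#     elif task["type"] in ("inference", "quick_generation"):
#         print(task["data"])
#     else:
#         print("no active task")
# ---------------------------------------------------------------------------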


================================================
FILE: backend/api/training.py
================================================
"""
Training API endpoints
"""
from typing import Dict, Any
from flask import request, jsonify, current_app, send_file
from backend.api import api_bp
from backend.services.training_service import TrainingService
from backend.services.project_service import ProjectService
from backend.utils.tensorboard_reader import TensorBoardReader
from backend.i18n import t
from util.logger import get_logger

from vibevoice.training.trainer import TrainConfig

logger = get_logger(__name__)


def _get_training_service(project_id: str) -> tuple:
    """
    Helper function to get TrainingService instance for a project

    Returns:
        Tuple of (TrainingService, 200) or (error_response, error_code)
    """
    try:
        # Get project service
        project_service = ProjectService(
            workspace_dir=current_app.config['WORKSPACE_DIR'],
            meta_file_name=current_app.config['PROJECTS_META_FILE']
        )

        # Get project path
        project_path = project_service.get_project_path(project_id)
        if not project_path:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.project_not_found')
            }), 404

        # Create training service
        training_dir = project_path / 'training'
        service = TrainingService(
            project_training_dir=training_dir,
            project_id=project_id,
            fake_engine=current_app.config.get('FAKE_MODEL', False)
        )

        return service, 200

    except Exception as e:
        logger.error("Error creating training service", exc_info=e)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/training', methods=['POST'])
def create_training_job(project_id: str):
    """
    Create and start a new training job

    Request body:
        {
            "job_name": "My Training Job",
            "config": {
                "lora_name": "my_lora",
                "epochs": 10,
                "batch_size": 1,
                ...
            }
        }

    Returns:
        201: Training job created and started (returns TrainingState)
        400: Invalid request
        404: Project not found
        409: Task manager is busy (another task is running)
        500: Internal error
    """
    job_name = None
    train_config = None
    try:
        data = request.get_json()
        if not data:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Request body must be JSON'
            }), 400

        # Validate required fields
        job_name = data.get('job_name')
        if not job_name:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': t('errors.validation_error')
            }), 400

        config_dict = data.get('config', {})
        if not config_dict:
            return jsonify({
                'error': t('errors.bad_request'),
                'message': 'Training configuration is required'
            }), 400

        # Create TrainConfig from dict
        try:
            train_config = TrainConfig.from_dict(config_dict)
        except Exception as e:
            logger.error(f"Failed to create TrainConfig: {e}")
            return jsonify({
                'error': t('errors.validation_error'),
                'message': f'Invalid training configuration: {str(e)}'
            }), 400

        # Get training service
        result = _get_training_service(project_id)
        if isinstance(result[0], TrainingService):
            service = result[0]
        else:
            return result  # Return error response

        # Create training job (may raise ValueError for duplicate job_name)
        try:
            state = service.create_training_job(job_name, train_config, project_id)
        except ValueError:
            # Job name is not unique
            return jsonify({
                'error': t('errors.conflict'),
                'message': t('errors.job_name_duplicate')
            }), 409

        if not state:
            # Task manager is busy
            return jsonify({
                'error': t('errors.conflict'),
                'message': t('errors.task_manager_busy')
            }), 409

        return jsonify({
            'message': t('success.training_started'),
            'task_id': state.task_id,
            'state': state.to_dict()
        }), 201

    except Exception as e:
        # job_name/train_config may still be None if parsing failed early
        logger.error(
            f"Error creating training job; job_name: {job_name}, "
            f"train_config: {train_config.to_dict() if train_config else None}",
            exc_info=e
        )
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500
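
# ---------------------------------------------------------------------------
# A minimal client sketch, assuming the backend at http://localhost:5000/api.
# The job_name/config keys come from this handler's docstring; the config
# fields shown there are examples rather than a full schema.
#
#     import requests
#
#     project_id = "<project_id>"
#     resp = requests.post(
#         f"http://localhost:5000/api/projects/{project_id}/training",
#         json={
#             "job_name": "My Training Job",
#             "config": {"lora_name": "my_lora", "epochs": 10, "batch_size": 1},
#         },
#     )
#     print(resp.status_code, resp.json().get("task_id"))  # 201 on success
# ---------------------------------------------------------------------------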


@api_bp.route('/projects/<project_id>/training', methods=['GET'])
def list_training_jobs(project_id: str):
    """
    List all training jobs for a project

    Returns:
        200: List of TrainingState objects
        404: Project not found
        500: Internal error
    """
    try:
        # Get training service
        result = _get_training_service(project_id)
        if isinstance(result[0], TrainingService):
            service = result[0]
        else:
            return result

        # List all jobs
        states = service.list_jobs()
        results = []
        for state in states:
            state_dict = state.to_dict()
            state_dict['all_lora_files'] = state.get_all_lora_files()
            results.append(state_dict)

        return jsonify({
            'states': results,
            'count': len(results)
        }), 200

    except Exception as e:
        logger.error("Error listing training jobs: ", exc_info=e)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/training/current', methods=['GET'])
def get_current_training_job(project_id: str):
    """
    Get the currently running training job with live metrics

    Returns:
        200: Current TrainingState (null if none active)
        404: Project not found
        500: Internal error
    """
    try:
        # Get training service
        result = _get_training_service(project_id)
        if isinstance(result[0], TrainingService):
            service = result[0]
        else:
            return result

        # Get current job
        current_state = service.get_current_job()

        if current_state:
            state_dict = current_state.to_dict()
            state_dict['all_lora_files'] = current_state.get_all_lora_files()
            return jsonify({
                'message': 'Current training job retrieved successfully',
                'state': state_dict
            }), 200
        else:
            return jsonify({
                'message': 'No active training job at the moment',
                'state': None
            }), 200

    except Exception as e:
        logger.error("Error getting current training job: ", exc_info=e)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/training/<job_id>', methods=['GET'])
def get_training_job(project_id: str, job_id: str):
    """
    Get a specific training job by ID

    Returns:
        200: TrainingState details
        404: Job or project not found
        500: Internal error
    """
    try:
        # Get training service
        result = _get_training_service(project_id)
        if isinstance(result[0], TrainingService):
            service = result[0]
        else:
            return result

        # Get job
        state = service.get_job(job_id)

        if not state:
            return jsonify({
                'error': t('errors.not_found'),
                'message': t('errors.training_job_not_found')
            }), 404

        return jsonify({
            'state': state.to_dict()
        }), 200

    except Exception as e:
        logger.error("Error getting training job:", exc_info=e)
        return jsonify({
            'error': t('errors.internal_error'),
            'message': str(e)
        }), 500


@api_bp.route('/projects/<project_id>/training/<job_id>', methods=['DELETE'])
def delete_training_job(project_id: str, job_id: str):
    """
    Delete a training job (only if not currently running)

    Returns:
        200: Job deleted successfully
        400: Cannot delete running job
        404: Job or project not found
        500: Internal error
  
SYMBOL INDEX (1324 symbols across 145 files)

FILE: backend/api/__init__.py
  function ping (line 11) | def ping():

FILE: backend/api/datasets.py
  function get_dataset_service (line 12) | def get_dataset_service(project_id: str) -> DatasetService:
  function list_datasets (line 30) | def list_datasets(project_id):
  function get_dataset (line 63) | def get_dataset(project_id, dataset_id):
  function create_dataset (line 100) | def create_dataset(project_id):
  function update_dataset (line 157) | def update_dataset(project_id, dataset_id):
  function delete_dataset (line 210) | def delete_dataset(project_id, dataset_id):
  function export_dataset (line 250) | def export_dataset(project_id, dataset_id):
  function import_to_existing_dataset (line 325) | def import_to_existing_dataset(project_id, dataset_id):
  function import_dataset (line 378) | def import_dataset(project_id):
  function list_dataset_items (line 429) | def list_dataset_items(project_id, dataset_id):
  function add_dataset_item (line 485) | def add_dataset_item(project_id, dataset_id):
  function update_dataset_item (line 555) | def update_dataset_item(project_id, dataset_id, item_index):
  function delete_dataset_item (line 618) | def delete_dataset_item(project_id, dataset_id, item_index):
  function get_dataset_audio (line 667) | def get_dataset_audio(project_id, dataset_id, filename):
  function get_dataset_voice_prompt (line 718) | def get_dataset_voice_prompt(project_id, dataset_id, filename):

FILE: backend/api/dialog_sessions.py
  function get_dialog_session_service (line 12) | def get_dialog_session_service(project_id: str) -> DialogSessionService:
  function list_sessions (line 33) | def list_sessions(project_id):
  function get_session (line 66) | def get_session(project_id, session_id):
  function create_session (line 102) | def create_session(project_id):
  function update_session (line 169) | def update_session(project_id, session_id):
  function delete_session (line 235) | def delete_session(project_id, session_id):
  function get_session_text (line 274) | def get_session_text(project_id, session_id):
  function download_session_text (line 313) | def download_session_text(project_id, session_id):

FILE: backend/api/generation.py
  function _enrich_generation_with_session_name (line 21) | def _enrich_generation_with_session_name(generation: Generation, dialog_...
  function _validate_offloading_config (line 51) | def _validate_offloading_config(offloading: dict) -> dict:
  function get_voice_generation_service (line 102) | def get_voice_generation_service(project_id: str):
  function get_current_generation (line 189) | def get_current_generation():
  function get_current_generation_for_project (line 235) | def get_current_generation_for_project(project_id: str):
  function get_all_generations (line 286) | def get_all_generations(project_id):
  function download_generation_audio (line 345) | def download_generation_audio(project_id: str, request_id: str):
  function download_generation_item_audio (line 411) | def download_generation_item_audio(project_id: str, request_id: str, ite...
  function get_generation (line 518) | def get_generation(project_id: str, request_id: str):
  function delete_generation (line 568) | def delete_generation(project_id: str, request_id: str):
  function batch_delete_generations (line 617) | def batch_delete_generations(project_id: str):

FILE: backend/api/openai_compat.py
  function _openai_error (line 20) | def _openai_error(message: str, error_type: str = "invalid_request_error",
  function _get_service (line 34) | def _get_service() -> OpenAICompatService:
  function create_speech (line 44) | def create_speech():
  function list_models (line 156) | def list_models():

FILE: backend/api/preset_voices.py
  function get_preset_service (line 13) | def get_preset_service() -> PresetVoiceService:
  function list_preset_voices (line 19) | def list_preset_voices():
  function add_preset_voice (line 73) | def add_preset_voice():
  function get_preset_voice (line 141) | def get_preset_voice(filename):
  function delete_preset_voice (line 172) | def delete_preset_voice(filename):
  function batch_delete_preset_voices (line 205) | def batch_delete_preset_voices():
  function list_preset_languages (line 250) | def list_preset_languages():
  function preview_preset_voice (line 267) | def preview_preset_voice(filename):

FILE: backend/api/projects.py
  function get_project_service (line 11) | def get_project_service() -> ProjectService:
  function validate_project_name (line 19) | def validate_project_name(name: str) -> tuple[bool, str]:
  function list_projects (line 53) | def list_projects():
  function get_project (line 77) | def get_project(project_id):
  function create_project (line 107) | def create_project():
  function update_project (line 164) | def update_project(project_id):
  function delete_project (line 225) | def delete_project(project_id):

FILE: backend/api/quick_generate.py
  function _allowed_file (line 22) | def _allowed_file(filename: str) -> bool:
  function _get_quick_generate_service (line 27) | def _get_quick_generate_service() -> QuickGenerateService:
  function _validate_offloading_config (line 35) | def _validate_offloading_config(offloading: dict) -> dict:
  function start_quick_generation (line 84) | def start_quick_generation():
  function get_quick_generation (line 214) | def get_quick_generation(request_id: str):
  function get_current_quick_generation (line 256) | def get_current_quick_generation():
  function list_quick_generation_history (line 289) | def list_quick_generation_history():
  function download_quick_generation_audio (line 325) | def download_quick_generation_audio(request_id: str):
  function download_quick_generation_item_audio (line 366) | def download_quick_generation_item_audio(request_id: str, item_index: int):
  function delete_quick_generation (line 408) | def delete_quick_generation(request_id: str):
  function preview_quick_generation_voice (line 442) | def preview_quick_generation_voice(request_id: str):
  function preview_quick_generation_voice_by_index (line 456) | def preview_quick_generation_voice_by_index(request_id: str, voice_index...

FILE: backend/api/speakers.py
  function get_speaker_service (line 12) | def get_speaker_service(project_id: str) -> SpeakerService:
  function list_speakers (line 30) | def list_speakers(project_id):
  function get_speaker (line 63) | def get_speaker(project_id, speaker_id):
  function add_speaker (line 99) | def add_speaker(project_id):
  function update_speaker (line 148) | def update_speaker(project_id, speaker_id):
  function delete_speaker (line 198) | def delete_speaker(project_id, speaker_id):
  function download_voice_file (line 237) | def download_voice_file(project_id, speaker_id):
  function update_voice_file (line 273) | def update_voice_file(project_id, speaker_id):
  function add_speaker_from_preset (line 324) | def add_speaker_from_preset(project_id):
  function trim_voice_file (line 388) | def trim_voice_file(project_id, speaker_id):

FILE: backend/api/tasks.py
  function _enrich_generation_with_session_name (line 20) | def _enrich_generation_with_session_name(generation, dialog_service):
  function get_current_task (line 40) | def get_current_task():

FILE: backend/api/training.py
  function _get_training_service (line 18) | def _get_training_service(project_id: str) -> tuple:
  function create_training_job (line 59) | def create_training_job(project_id: str):
  function list_training_jobs (line 153) | def list_training_jobs(project_id: str):
  function get_current_training_job (line 192) | def get_current_training_job(project_id: str):
  function get_training_job (line 234) | def get_training_job(project_id: str, job_id: str):
  function delete_training_job (line 273) | def delete_training_job(project_id: str, job_id: str):
  function batch_delete_training_jobs (line 322) | def batch_delete_training_jobs(project_id: str):
  function download_lora_file (line 379) | def download_lora_file(project_id: str, job_id: str, filename: str):
  function list_lora_files (line 423) | def list_lora_files(project_id: str):
  function get_training_metrics (line 458) | def get_training_metrics(project_id: str, job_id: str):

FILE: backend/app.py
  function create_app (line 15) | def create_app(config_name='production') -> Flask:
  function register_error_handlers (line 85) | def register_error_handlers(app):
  function register_blueprints (line 132) | def register_blueprints(app):

FILE: backend/config.py
  class Config (line 10) | class Config:
  class DevelopmentConfig (line 45) | class DevelopmentConfig(Config):
  class ProductionConfig (line 52) | class ProductionConfig(Config):
  class TestingConfig (line 59) | class TestingConfig(Config):
  function get_config (line 74) | def get_config(env=None):

FILE: backend/i18n/__init__.py
  function load_translations (line 13) | def load_translations():
  function get_locale (line 28) | def get_locale():
  function translate (line 81) | def translate(key: str, **params) -> str:
  function with_locale (line 130) | def with_locale(f):

FILE: backend/inference/inference.py
  class FakeModel (line 52) | class FakeModel:
    method __init__ (line 53) | def __init__(self):
    method generate (line 56) | def generate(self, **kwargs) -> Union[torch.LongTensor, VibeVoiceGener...
  class InferenceBase (line 67) | class InferenceBase(ABC):
    method __init__ (line 68) | def __init__(self, generation: Generation, speaker_service: SpeakerSer...
    method get_generation (line 88) | def get_generation(self) -> Generation:
    method create (line 92) | def create(generation: Generation, speaker_service: SpeakerService,
    method _load_model (line 148) | def _load_model(self, dtype: torch.dtype, config: str = None):
    method failure (line 151) | def failure(self, message: str, failure_type: str):
    method _save_audio (line 155) | def _save_audio(self, outputs: Union[torch.LongTensor, VibeVoiceGenera...
    method run_inference (line 163) | def run_inference(self):
    method success (line 260) | def success(self, message: str):
    method generation_info (line 263) | def generation_info(self) -> Dict[str, Any]:
    method finalize (line 266) | def finalize(self):
  class InferenceEngine (line 296) | class InferenceEngine(InferenceBase):
    method __init__ (line 297) | def __init__(self, generation, speaker_service, dialog_service, meta_f...
    method _load_model (line 314) | def _load_model(self, dtype: torch.dtype, config: str = None):
    method _collect_offloading_metrics (line 351) | def _collect_offloading_metrics(self, generation_time: float) -> Optio...
    method _save_audio (line 389) | def _save_audio(self, outputs: Union[torch.LongTensor, VibeVoiceGenera...
  class FakeInferenceEngine (line 435) | class FakeInferenceEngine(InferenceBase):
    method __init__ (line 436) | def __init__(self, generation, speaker_service, dialog_service, meta_f...
    method _load_model (line 453) | def _load_model(self, dtype: torch.dtype, config: str = None):
    method _generate_fake_offloading_metrics (line 461) | def _generate_fake_offloading_metrics(self, generation_time: float, nu...
    method _save_audio (line 529) | def _save_audio(self, outputs: Union[torch.LongTensor, VibeVoiceGenera...

FILE: backend/inference/quick_generate_inference.py
  function _get_model_classes (line 24) | def _get_model_classes():
  function _get_offload_presets (line 32) | def _get_offload_presets():
  class FakeQuickGenerateModel (line 59) | class FakeQuickGenerateModel:
    method generate (line 62) | def generate(self, **kwargs) -> Any:
  class QuickGenerateVisitor (line 74) | class QuickGenerateVisitor:
    method __init__ (line 77) | def __init__(self, quick_gen: QuickGenerate):
    method visit_preprocessing (line 81) | def visit_preprocessing(self, timestamp: float = None):
    method visit_inference_start (line 86) | def visit_inference_start(self, scripts: List[str] = None, **kwargs):
    method visit_inference_batch_start (line 92) | def visit_inference_batch_start(self, batch_index: int, seeds: int):
    method visit_inference_batch_end (line 106) | def visit_inference_batch_end(self, batch_index: int):
    method visit_inference_save_audio_file (line 112) | def visit_inference_save_audio_file(self, output_audio_path: str = None,
    method visit_inference_step_start (line 130) | def visit_inference_step_start(self, current_step: int, total_steps: i...
    method visit_inference_step_end (line 144) | def visit_inference_step_end(self, current_step: int, total_steps: int):
    method visit_completed (line 147) | def visit_completed(self, message: str = None):
    method visit_failed (line 153) | def visit_failed(self, message: str, failure_type: str = None):
  class QuickGenerateInferenceBase (line 159) | class QuickGenerateInferenceBase(ABC):
    method __init__ (line 162) | def __init__(self, quick_gen: QuickGenerate, voice_paths: List[str], o...
    method get_quick_generate (line 179) | def get_quick_generate(self) -> QuickGenerate:
    method create (line 183) | def create(quick_gen: QuickGenerate, voice_paths: List[str], output_di...
    method _load_model (line 233) | def _load_model(self, dtype: torch.dtype):
    method _save_audio (line 237) | def _save_audio(self, outputs, processor, generation_time: float, inpu...
    method failure (line 241) | def failure(self, message: str, failure_type: str = None):
    method success (line 244) | def success(self, message: str = None):
    method generation_info (line 247) | def generation_info(self) -> Dict[str, Any]:
    method _prepare_script_and_voices (line 250) | def _prepare_script_and_voices(self) -> tuple:
    method _convert_narration_to_script (line 288) | def _convert_narration_to_script(self, text: str) -> str:
    method run_inference (line 322) | def run_inference(self):
    method finalize (line 395) | def finalize(self):
  class QuickGenerateInferenceEngine (line 426) | class QuickGenerateInferenceEngine(QuickGenerateInferenceBase):
    method __init__ (line 429) | def __init__(self, quick_gen: QuickGenerate, voice_paths: List[str], o...
    method _load_model (line 435) | def _load_model(self, dtype: torch.dtype):
    method _save_audio (line 465) | def _save_audio(self, outputs, processor, generation_time: float, inpu...
  class FakeQuickGenerateInferenceEngine (line 491) | class FakeQuickGenerateInferenceEngine(QuickGenerateInferenceBase):
    method __init__ (line 494) | def __init__(self, quick_gen: QuickGenerate, voice_paths: List[str], o...
    method _load_model (line 499) | def _load_model(self, dtype: torch.dtype):
    method _save_audio (line 506) | def _save_audio(self, outputs, processor, generation_time: float, inpu...
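
  Example (illustrative sketch, not from the repo): the quick-generate engines report progress through QuickGenerateVisitor's visit_* callbacks. Below is a minimal Python version of that visitor-style progress reporting; all names here (ProgressVisitor, PrintingVisitor, run_inference) are hypothetical, not the repo's actual classes.

    from abc import ABC, abstractmethod

    class ProgressVisitor(ABC):
        @abstractmethod
        def visit_step(self, current: int, total: int): ...
        @abstractmethod
        def visit_completed(self, message: str = None): ...

    class PrintingVisitor(ProgressVisitor):
        def visit_step(self, current, total):
            print(f"step {current}/{total}")
        def visit_completed(self, message=None):
            print(message or "done")

    def run_inference(visitor: ProgressVisitor, total_steps: int = 3):
        # The engine drives the visitor; the visitor decides how progress
        # is surfaced (console, task state, websocket, ...).
        for step in range(1, total_steps + 1):
            visitor.visit_step(step, total_steps)
        visitor.visit_completed("generation finished")

    run_inference(PrintingVisitor())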

FILE: backend/run.py
  function main (line 20) | def main():

FILE: backend/scripts/generate_cantonese_training_dataset.py
  function parse_metadata_file (line 19) | def parse_metadata_file(file_path: str) -> List[Dict[str, str]]:
  function group_by_speaker (line 56) | def group_by_speaker(utterances: List[Dict[str, str]]) -> Dict[str, List...
  function select_voice_prompts (line 75) | def select_voice_prompts(
  function get_wav_path (line 102) | def get_wav_path(meta_file_path: str, speaker_id: str, uttrans_id: str) ...
  function generate_dataset (line 118) | def generate_dataset(
  function write_dataset_jsonl (line 213) | def write_dataset_jsonl(dataset: List[Dict], output_path: str):
  function generate_copy_script (line 222) | def generate_copy_script(
  function main (line 275) | def main():

FILE: backend/scripts/generate_mcv_cantonese_training_dataset.py
  function parse_tsv_file (line 19) | def parse_tsv_file(file_path: str) -> List[Dict[str, str]]:
  function group_by_speaker (line 60) | def group_by_speaker(utterances: List[Dict[str, str]]) -> Dict[str, List...
  function select_voice_prompts (line 79) | def select_voice_prompts(
  function generate_dataset (line 129) | def generate_dataset(
  function write_dataset_jsonl (line 234) | def write_dataset_jsonl(dataset: List[Dict], output_path: str):
  function generate_copy_script (line 243) | def generate_copy_script(
  function main (line 295) | def main():

FILE: backend/scripts/generate_training_dataset.py
  function parse_metadata_file (line 17) | def parse_metadata_file(file_path: str, num_columns: int = 2) -> Dict[st...
  function parse_filename (line 40) | def parse_filename(filename: str) -> Tuple[str, str]:
  function group_by_person_and_dialect (line 56) | def group_by_person_and_dialect(
  function select_voice_prompts (line 89) | def select_voice_prompts(
  function generate_dataset (line 130) | def generate_dataset(
  function write_dataset_jsonl (line 249) | def write_dataset_jsonl(dataset: List[Dict], output_path: str):
  function generate_copy_script (line 258) | def generate_copy_script(
  function main (line 303) | def main():
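
  Example (illustrative sketch, not from the repo): each of these generator scripts ends by emitting a JSONL file via write_dataset_jsonl. A minimal JSONL writer follows; the record fields ("text", "audio_path", "voice_prompt") are assumptions for illustration, not the scripts' real schema.

    import json

    def write_dataset_jsonl(dataset, output_path):
        # One JSON object per line; ensure_ascii=False keeps CJK text readable.
        with open(output_path, "w", encoding="utf-8") as f:
            for record in dataset:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

    write_dataset_jsonl(
        [{"text": "hello", "audio_path": "wavs/0001.wav", "voice_prompt": "prompts/a.wav"}],
        "dataset.jsonl",
    )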

FILE: backend/scripts/migrate_dataset_paths.py
  function migrate_dataset_file (line 15) | def migrate_dataset_file(jsonl_path: Path) -> None:
  function main (line 86) | def main():

FILE: backend/services/dataset_service.py
  class DatasetService (line 15) | class DatasetService:
    method __init__ (line 24) | def __init__(self, project_datasets_dir: Path):
    method _load_metadata (line 42) | def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
    method _save_metadata (line 56) | def _save_metadata(self, metadata: Dict[str, Dict[str, Any]]) -> None:
    method _generate_dataset_id (line 68) | def _generate_dataset_id(self, name: str) -> str:
    method _get_dataset_dir (line 89) | def _get_dataset_dir(self, dataset_id: str) -> Path:
    method _get_items_file_path (line 93) | def _get_items_file_path(self, dataset_id: str) -> Path:
    method _get_audio_dir (line 97) | def _get_audio_dir(self, dataset_id: str) -> Path:
    method _get_voice_prompts_dir (line 101) | def _get_voice_prompts_dir(self, dataset_id: str) -> Path:
    method _load_items (line 105) | def _load_items(self, dataset_id: str) -> List[DatasetItem]:
    method _save_items (line 136) | def _save_items(self, dataset_id: str, items: List[DatasetItem]) -> None:
    method _sync_item_count (line 162) | def _sync_item_count(self, dataset_id: str) -> None:
    method _validate_audio_file (line 184) | def _validate_audio_file(self, filename: str) -> bool:
    method _extract_filename_from_path (line 197) | def _extract_filename_from_path(self, path: str) -> str:
    method list_datasets (line 209) | def list_datasets(self) -> List[Dataset]:
    method get_dataset (line 219) | def get_dataset(self, dataset_id: str) -> Optional[Dataset]:
    method create_dataset (line 236) | def create_dataset(self, name: str, description: str = "") -> Dataset:
    method update_dataset (line 284) | def update_dataset(self, dataset_id: str, name: Optional[str] = None,
    method delete_dataset (line 313) | def delete_dataset(self, dataset_id: str) -> bool:
    method list_items (line 341) | def list_items(self, dataset_id: str, offset: int = 0, limit: Optional...
    method add_item (line 379) | def add_item(self, dataset_id: str, text: str, audio_file: FileStorage,
    method update_item (line 472) | def update_item(self, dataset_id: str, item_index: int, text: Optional...
    method delete_item (line 598) | def delete_item(self, dataset_id: str, item_index: int) -> bool:
    method export_dataset (line 647) | def export_dataset(self, dataset_id: str, export_path: Path) -> None:
    method import_dataset (line 683) | def import_dataset(self, import_file: FileStorage, dataset_name: Optio...
    method import_to_existing_dataset (line 785) | def import_to_existing_dataset(self, dataset_id: str, import_file: Fil...

FILE: backend/services/dialog_session_service.py
  class DialogSessionService (line 16) | class DialogSessionService:
    method __init__ (line 21) | def __init__(self, project_scripts_dir: Path, speaker_service=None):
    method _load_metadata (line 41) | def _load_metadata(self) -> List[Dict[str, Any]]:
    method _save_metadata (line 59) | def _save_metadata(self, sessions: List[Dict[str, Any]]) -> None:
    method _get_valid_speaker_ids (line 71) | def _get_valid_speaker_ids(self) -> Set[str]:
    method list_sessions (line 85) | def list_sessions(self) -> List[DialogSession]:
    method get_session (line 95) | def get_session(self, session_id: str) -> Optional[DialogSession]:
    method create_session (line 111) | def create_session(self, name: str, description: str, dialog_text: str,
    method update_session (line 198) | def update_session(self, session_id: str, name: Optional[str] = None,
    method delete_session (line 293) | def delete_session(self, session_id: str) -> bool:
    method get_session_text (line 333) | def get_session_text(self, session_id: str) -> Optional[str]:
    method get_text_file_path (line 357) | def get_text_file_path(self, session_id: str) -> Optional[Path]:
    method parse_session_txt_script (line 372) | def parse_session_txt_script(self, session_id: str) -> Tuple[str, List...
    method parse_narration_script (line 428) | def parse_narration_script(self, session_id: str) -> Tuple[str, List[s...

FILE: backend/services/openai_compat_service.py
  class OpenAICompatService (line 47) | class OpenAICompatService:
    method __init__ (line 50) | def __init__(self, workspace_dir: Path, preset_dir: Path, fake_model: ...
    method validate_api_key (line 57) | def validate_api_key(self, auth_header: Optional[str]) -> bool:
    method resolve_voice (line 79) | def resolve_voice(self, voice_name: str) -> Tuple[Optional[str], Optio...
    method resolve_model (line 101) | def resolve_model(self, model_name: str) -> Tuple[Optional[str], Optio...
    method generate_speech (line 115) | def generate_speech(self, text: str, voice_filename: str, model_dtype:...
    method _convert_audio (line 191) | def _convert_audio(self, source_path: Path, target_format: str) -> Opt...
    method get_available_voices (line 236) | def get_available_voices(self) -> list:
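
  Example (illustrative sketch, not from the repo): validate_api_key takes a raw Authorization header. A hedged sketch of Bearer-token checking follows; the OPENAI_COMPAT_API_KEY environment variable and the allow-when-unset policy are assumptions, not the service's documented behavior.

    import hmac
    import os
    from typing import Optional

    def validate_api_key(auth_header: Optional[str]) -> bool:
        expected = os.environ.get("OPENAI_COMPAT_API_KEY")  # hypothetical variable
        if not expected:
            return True  # no key configured: auth disabled (assumed policy)
        if not auth_header or not auth_header.startswith("Bearer "):
            return False
        token = auth_header[len("Bearer "):].strip()
        # Constant-time comparison avoids leaking key prefixes via timing.
        return hmac.compare_digest(token, expected)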

FILE: backend/services/preset_voice_service.py
  class PresetVoiceService (line 19) | class PresetVoiceService:
    method __init__ (line 33) | def __init__(self, preset_dir: Path):
    method _validate_audio_file (line 44) | def _validate_audio_file(self, filename: str) -> bool:
    method _validate_language (line 49) | def _validate_language(self, language: str) -> bool:
    method _validate_gender (line 53) | def _validate_gender(self, gender: str) -> bool:
    method _validate_name (line 57) | def _validate_name(self, name: str) -> bool:
    method _scan_presets (line 61) | def _scan_presets(self, locale: str = 'en') -> List[PresetVoice]:
    method list_presets (line 85) | def list_presets(
    method get_preset (line 128) | def get_preset(self, filename: str, locale: str = 'en') -> Optional[Pr...
    method get_preset_path (line 144) | def get_preset_path(self, filename: str) -> Optional[Path]:
    method add_preset (line 159) | def add_preset(
    method _convert_to_wav (line 245) | def _convert_to_wav(self, source_path: Path, target_path: Path) -> None:
    method delete_preset (line 273) | def delete_preset(self, filename: str) -> bool:
    method batch_delete_presets (line 299) | def batch_delete_presets(self, filenames: List[str]) -> Tuple[List[str...
    method get_available_languages (line 323) | def get_available_languages(self, locale: str = 'en') -> List[Dict[str...

FILE: backend/services/project_service.py
  class ProjectService (line 12) | class ProjectService:
    method __init__ (line 15) | def __init__(self, workspace_dir: Path, meta_file_name: str = 'project...
    method _load_metadata (line 34) | def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
    method _save_metadata (line 48) | def _save_metadata(self, metadata: Dict[str, Dict[str, Any]]) -> None:
    method _generate_project_id (line 60) | def _generate_project_id(self, name: str) -> str:
    method list_projects (line 81) | def list_projects(self) -> List[Project]:
    method get_project (line 91) | def get_project(self, project_id: str) -> Optional[Project]:
    method create_project (line 108) | def create_project(self, name: str, description: str = "") -> Project:
    method update_project (line 153) | def update_project(self, project_id: str, name: Optional[str] = None,
    method delete_project (line 182) | def delete_project(self, project_id: str) -> bool:
    method get_project_path (line 210) | def get_project_path(self, project_id: str) -> Optional[Path]:

FILE: backend/services/quick_generate_service.py
  class QuickGenerateService (line 25) | class QuickGenerateService:
    method __init__ (line 32) | def __init__(self, workspace_dir: Path, fake_model: bool = False):
    method _ensure_directories (line 51) | def _ensure_directories(self):
    method _load_history (line 61) | def _load_history(self) -> List[Dict[str, Any]]:
    method _save_history (line 74) | def _save_history(self, history: List[Dict[str, Any]]) -> None:
    method _add_to_history (line 81) | def _add_to_history(self, quick_gen: QuickGenerate) -> None:
    method _update_history (line 87) | def _update_history(self, request_id: str, updates: Dict[str, Any]) ->...
    method save_voice_file (line 97) | def save_voice_file(self, file_data: bytes, original_filename: str) ->...
    method start_generation (line 123) | def start_generation(self, voice_files: List[str], text: str,
    method get_generation (line 197) | def get_generation(self, request_id: str) -> Optional[QuickGenerate]:
    method list_history (line 226) | def list_history(self, limit: int = 20, offset: int = 0) -> Dict[str, ...
    method delete_generation (line 263) | def delete_generation(self, request_id: str) -> bool:
    method get_audio_path (line 309) | def get_audio_path(self, request_id: str, item_index: int = 0) -> Opti...
    method get_voice_path (line 353) | def get_voice_path(self, voice_filename: str) -> Optional[Path]:
    method cleanup_old_data (line 368) | def cleanup_old_data(self, voice_days: int = 7, output_days: int = 30,...

FILE: backend/services/speaker_service.py
  class SpeakerService (line 16) | class SpeakerService:
    method __init__ (line 22) | def __init__(self, project_voices_dir: Path):
    method _load_metadata (line 40) | def _load_metadata(self) -> List[Dict[str, Any]]:
    method _save_metadata (line 58) | def _save_metadata(self, speakers: List[Dict[str, Any]]) -> None:
    method _generate_speaker_id (line 70) | def _generate_speaker_id(self, index: int) -> str:
    method _reindex_speakers (line 82) | def _reindex_speakers(self, speakers: List[SpeakerRole]) -> List[Speak...
    method _validate_audio_file (line 96) | def _validate_audio_file(self, filename: str) -> bool:
    method list_speakers (line 109) | def list_speakers(self) -> List[SpeakerRole]:
    method get_speaker (line 119) | def get_speaker(self, speaker_id: str) -> Optional[SpeakerRole]:
    method add_speaker (line 135) | def add_speaker(self, description: str, voice_file: FileStorage) -> Sp...
    method update_speaker (line 188) | def update_speaker(self, speaker_id: str, description: Optional[str] =...
    method update_voice_file (line 222) | def update_voice_file(self, speaker_id: str, voice_file: FileStorage) ...
    method trim_voice_file (line 289) | def trim_voice_file(self, speaker_id: str, start_time: float, end_time...
    method delete_speaker (line 393) | def delete_speaker(self, speaker_id: str) -> bool:
    method get_voice_file_path (line 436) | def get_voice_file_path(self, speaker_id: str) -> Optional[Path]:
    method get_speakers_filepath (line 451) | def get_speakers_filepath(self, speaker_names: List[str]) -> List[str]:
    method add_speaker_from_preset (line 468) | def add_speaker_from_preset(

FILE: backend/services/training_service.py
  class TrainingService (line 21) | class TrainingService:
    method __init__ (line 26) | def __init__(self, project_training_dir: Path, project_id: str = None,...
    method _save_metadata (line 48) | def _save_metadata(self, data: dict) -> None:
    method _load_metadata (line 52) | def _load_metadata(self) -> dict:
    method _is_job_name_unique (line 70) | def _is_job_name_unique(self, job_name: str) -> bool:
    method create_training_job (line 86) | def create_training_job(self, job_name: str, train_config: TrainConfig,
    method list_jobs (line 154) | def list_jobs(self) -> List[TrainingState]:
    method get_job (line 204) | def get_job(self, job_id: str) -> Optional[TrainingState]:
    method get_current_job (line 226) | def get_current_job(self) -> Optional[TrainingState]:
    method get_lora_file_path (line 284) | def get_lora_file_path(self, job_id: str, filename: str) -> Optional[P...
    method delete_job (line 311) | def delete_job(self, job_id: str) -> bool:
    method list_available_lora_files (line 354) | def list_available_lora_files(self) -> List[dict]:
    method delete_jobs_batch (line 396) | def delete_jobs_batch(self, job_ids: List[str]) -> dict:

FILE: backend/services/voice_gerneration_service.py
  class VoiceGenerationService (line 12) | class VoiceGenerationService:
    method __init__ (line 16) | def __init__(self, project_generation_dir: Path, speaker_service: Spea...
    method _load_metadata (line 38) | def _load_metadata(self) -> List[Dict[str, Any]]:
    method _save_metadata (line 56) | def _save_metadata(self, generations: List[Dict[str, Any]]) -> None:
    method list_generations (line 68) | def list_generations(self) -> List[Generation]:
    method delete_generation (line 78) | def delete_generation(self, request_id: str) -> bool:
    method delete_generations_batch (line 111) | def delete_generations_batch(self, request_ids: List[str]) -> Dict[str...
    method generation (line 156) | def generation(self, dialog_session_id: str, request_id: str,

FILE: backend/task_manager/inference_task.py
  class InferenceTask (line 12) | class InferenceTask(Task):
    method __init__ (line 14) | def __init__(self, inference: InferenceBase, file_handler: FileHandler...
    method from_inference (line 21) | def from_inference(cls, inference: InferenceBase, file_handler: FileHa...
    method run (line 24) | def run(self):
    method task_failure (line 29) | def task_failure(self, error_msg: str, failure_type: str = FAILURE_TYP...
    method task_success (line 33) | def task_success(self, message: str):
    method unwrap (line 37) | def unwrap(self) -> InferenceBase:
    method task_appended (line 40) | def task_appended(self, message: str):
    method _task_finalize (line 45) | def _task_finalize(self):
    method _load_metadata (line 54) | def _load_metadata(self) -> List[Dict[str, Any]]:
    method _save_metadata (line 72) | def _save_metadata(self, generations: List[Dict[str, Any]]) -> None:
    method _update_metadata (line 84) | def _update_metadata(self, generation_dict: Dict[str, Any]) -> None:

FILE: backend/task_manager/quick_generate_task.py
  class QuickGenerateTask (line 15) | class QuickGenerateTask(Task):
    method __init__ (line 18) | def __init__(self, inference: QuickGenerateInferenceBase, file_handler...
    method from_inference (line 25) | def from_inference(cls, inference: QuickGenerateInferenceBase, file_ha...
    method run (line 29) | def run(self):
    method task_failure (line 33) | def task_failure(self, error_msg: str, failure_type: str = FAILURE_TYP...
    method task_success (line 37) | def task_success(self, message: str):
    method unwrap (line 41) | def unwrap(self) -> QuickGenerateInferenceBase:
    method task_appended (line 44) | def task_appended(self, message: str):
    method _task_finalize (line 49) | def _task_finalize(self):
    method _load_history (line 58) | def _load_history(self) -> List[Dict[str, Any]]:
    method _save_history (line 70) | def _save_history(self, history: List[Dict[str, Any]]) -> None:
    method _update_history (line 77) | def _update_history(self, gen_dict: Dict[str, Any]) -> None:

FILE: backend/task_manager/task.py
  class Task (line 16) | class Task(ABC):
    method __init__ (line 18) | def __init__(self, task_id: str = None):
    method id (line 21) | def id(self) -> str:
    method run (line 25) | def run(self):
    method task_failure (line 29) | def task_failure(self, error_msg: str, failure_type: str = FAILURE_TYP...
    method task_success (line 33) | def task_success(self, message: str):
    method task_appended (line 37) | def task_appended(self, message: str):
    method unwrap (line 41) | def unwrap(self) -> Any:
    method task_finalize (line 44) | def task_finalize(self):
    method _task_finalize (line 52) | def _task_finalize(self):
  class Manager (line 56) | class Manager:
    method __init__ (line 57) | def __init__(self):
    method task_run_loop (line 60) | def task_run_loop(self):
    method add_task (line 81) | def add_task(self, task: Task) -> bool:
    method get_current_task (line 92) | def get_current_task(self) -> Task:
    method has_task (line 97) | def has_task(self) -> bool:
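
  Example (illustrative sketch, not from the repo): Task is an ABC and Manager exposes task_run_loop/add_task/has_task, which suggests a single-worker queue that rejects new work while a task is pending. A minimal version under those assumptions:

    import queue
    import threading
    import uuid
    from abc import ABC, abstractmethod

    class Task(ABC):
        def __init__(self, task_id: str = None):
            self._id = task_id or uuid.uuid4().hex
        @property
        def id(self) -> str:
            return self._id
        @abstractmethod
        def run(self): ...

    class Manager:
        def __init__(self):
            self._queue: "queue.Queue[Task]" = queue.Queue(maxsize=1)
            threading.Thread(target=self._run_loop, daemon=True).start()
        def add_task(self, task: Task) -> bool:
            try:
                self._queue.put_nowait(task)  # reject when a task is already queued
                return True
            except queue.Full:
                return False
        def _run_loop(self):
            # Worker thread: execute tasks one at a time, forever.
            while True:
                task = self._queue.get()
                try:
                    task.run()
                finally:
                    self._queue.task_done()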

FILE: backend/task_manager/training_task.py
  class TrainingTask (line 9) | class TrainingTask(Task):
    method __init__ (line 11) | def __init__(self, task_id: str, training_engine: BaseTrainingEngine):
    method from_engine (line 16) | def from_engine(cls, task_id: str, training_engine: BaseTrainingEngine...
    method run (line 19) | def run(self):
    method task_failure (line 22) | def task_failure(self, error_msg: str, failure_type: str = FAILURE_TYP...
    method task_success (line 26) | def task_success(self, message: str):
    method unwrap (line 30) | def unwrap(self) -> BaseTrainingEngine:
    method task_appended (line 33) | def task_appended(self, message: str):
    method _task_finalize (line 36) | def _task_finalize(self):

FILE: backend/training/engine.py
  class BaseTrainingEngine (line 13) | class BaseTrainingEngine(TrainerVisitor):
    method __init__ (line 15) | def __init__(self, trainer: Trainer, task_id: str, state_writer: Train...
    method train (line 27) | def train(self):
    method visit_training_begin (line 30) | def visit_training_begin(self, timestamp: float, batch_size: int, tota...
    method visit_training_end (line 41) | def visit_training_end(self, timestamp: float, loss: float, diffusion_...
    method visit_step_begin (line 53) | def visit_step_begin(self, timestamp: float, step: int, epoch: int,
    method visit_step_end (line 61) | def visit_step_end(self, timestamp: float, step: int, epoch: int, step...
    method visit_epoch_begin (line 85) | def visit_epoch_begin(self, timestamp: float, epoch: int, lr: float):
    method visit_epoch_end (line 90) | def visit_epoch_end(self, timestamp: float, epoch: int, lr: float, avg...
    method visit_training_failed (line 108) | def visit_training_failed(self, timestamp, error_msg: str, failure_typ...
    method get_state (line 114) | def get_state(self) -> TrainingState:
    method visit_lora_file_saved (line 117) | def visit_lora_file_saved(self, lora_file):
    method visit_final_lora_file_saved (line 120) | def visit_final_lora_file_saved(self, lora_file):
    method finalize (line 123) | def finalize(self):
  class TrainingEngine (line 127) | class TrainingEngine(BaseTrainingEngine):
    method __init__ (line 137) | def __init__(self,
    method finalize (line 155) | def finalize(self):
  class FakeTrainingEngine (line 160) | class FakeTrainingEngine(BaseTrainingEngine):
    method __init__ (line 165) | def __init__(self,

FILE: backend/training/state.py
  class TrainingState (line 9) | class TrainingState:
    method get_all_lora_files (line 60) | def get_all_lora_files(self) -> List[str]:
    method to_dict (line 72) | def to_dict(self) -> Dict[str, Any]:
    method from_dict (line 84) | def from_dict(cls, data: Dict[str, Any]) -> 'TrainingState':
  class TrainingStateWriter (line 108) | class TrainingStateWriter:
    method __init__ (line 110) | def __init__(self, file_handler: FileHandler, task_id: str, path: str):
    method update_state (line 115) | def update_state(self, train_state: TrainingState) -> None:

FILE: backend/utils/dialog_validator.py
  class DialogValidator (line 9) | class DialogValidator:
    method parse_narration_text (line 16) | def parse_narration_text(text: str) -> List[str]:
    method validate_narration_text (line 52) | def validate_narration_text(text: str, narrator_speaker_id: str,
    method convert_narration_to_dialog (line 80) | def convert_narration_to_dialog(text: str, narrator_speaker_id: str) -...
    method parse_dialog_text (line 95) | def parse_dialog_text(text: str) -> List[Tuple[str, str]]:
    method extract_speaker_ids (line 146) | def extract_speaker_ids(text: str) -> Set[str]:
    method validate_speaker_ids (line 163) | def validate_speaker_ids(text: str, valid_speaker_ids: Set[str]) -> Tu...
    method format_dialog_text (line 187) | def format_dialog_text(dialogs: List[Tuple[str, str]]) -> str:
    method read_and_validate_file (line 206) | def read_and_validate_file(file_path: Path, valid_speaker_ids: Set[str...
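
  Example (illustrative sketch, not from the repo): parse_dialog_text returns (speaker, text) pairs. The exact "Speaker N: text" line format below is an assumption inferred from the method names, not a confirmed spec.

    import re
    from typing import List, Tuple

    LINE_RE = re.compile(r"^\s*(Speaker\s+\d+)\s*:\s*(.+)$")

    def parse_dialog_text(text: str) -> List[Tuple[str, str]]:
        # Keep only lines that match the speaker-prefixed format.
        dialogs = []
        for line in text.splitlines():
            m = LINE_RE.match(line)
            if m:
                dialogs.append((m.group(1), m.group(2).strip()))
        return dialogs

    print(parse_dialog_text("Speaker 1: Hello!\nSpeaker 2: Hi there."))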

FILE: backend/utils/file_handler.py
  class DateTimeEncoder (line 11) | class DateTimeEncoder(json.JSONEncoder):
    method default (line 14) | def default(self, obj):
  class FileHandler (line 20) | class FileHandler:
    method ensure_directory (line 24) | def ensure_directory(path: Path) -> None:
    method read_json (line 34) | def read_json(file_path: Path) -> Dict[str, Any]:
    method write_json (line 52) | def write_json(file_path: Path, data: Dict[str, Any], indent: int = 2)...
    method write_json_atomic (line 68) | def write_json_atomic(file_path: Path, data: Dict[str, Any], indent: i...
    method delete_directory (line 108) | def delete_directory(path: Path, ignore_errors: bool = False) -> None:
    method list_directories (line 120) | def list_directories(parent_path: Path) -> List[str]:
    method sanitize_filename (line 139) | def sanitize_filename(filename: str) -> str:
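
  Example (illustrative sketch, not from the repo): write_json_atomic suggests the temp-file-then-rename idiom, so readers never observe a half-written file. Using default=str in place of the repo's DateTimeEncoder is a simplification for this sketch.

    import json
    import os
    import tempfile
    from pathlib import Path

    def write_json_atomic(file_path: Path, data, indent: int = 2) -> None:
        # Write to a temp file in the same directory, then atomically swap it in.
        fd, tmp = tempfile.mkstemp(dir=file_path.parent, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=indent, default=str)
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp, file_path)  # atomic on POSIX and Windows
        except BaseException:
            os.unlink(tmp)
            raise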

FILE: backend/utils/tensorboard_reader.py
  class TensorBoardReader (line 9) | class TensorBoardReader:
    method __init__ (line 14) | def __init__(self, logdir: str):
    method _load_events (line 27) | def _load_events(self):
    method get_scalar_tags (line 43) | def get_scalar_tags(self) -> List[str]:
    method get_scalar_data (line 49) | def get_scalar_data(self, tag: str, max_points: Optional[int] = None) ...
    method get_loss_metrics (line 88) | def get_loss_metrics(self, max_points: Optional[int] = 500) -> Dict:
    method get_learning_rate (line 108) | def get_learning_rate(self, max_points: Optional[int] = 500) -> List[D...
    method get_timing_metrics (line 112) | def get_timing_metrics(self, max_points: Optional[int] = 500) -> Dict:
    method get_all_metrics (line 121) | def get_all_metrics(self, max_points: Optional[int] = 500) -> Dict:
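
  Example (illustrative sketch, not from the repo): reading scalar curves from TensorBoard event files with the standard EventAccumulator API; the stride-based downsampling to max_points is an assumed policy, not necessarily what TensorBoardReader does.

    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

    def get_scalar_data(logdir: str, tag: str, max_points: int = 500):
        acc = EventAccumulator(logdir)
        acc.Reload()  # parse event files on disk
        if tag not in acc.Tags().get("scalars", []):
            return []
        points = [{"step": e.step, "value": e.value, "wall_time": e.wall_time}
                  for e in acc.Scalars(tag)]
        # Thin dense curves so the frontend chart stays responsive.
        if max_points and len(points) > max_points:
            stride = len(points) // max_points
            points = points[::stride]
        return points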

FILE: config/configuration_vibevoice.py
  class InferencePhase (line 11) | class InferencePhase:
  class QwenConfig (line 20) | class QwenConfig:
    method __init__ (line 26) | def __init__(
    method from_config (line 77) | def from_config(cls, config):
  class VibeVoiceAcousticTokenizerConfig (line 80) | class VibeVoiceAcousticTokenizerConfig:
    method __init__ (line 83) | def __init__(
  class VibeVoiceSemanticTokenizerConfig (line 142) | class VibeVoiceSemanticTokenizerConfig:
    method __init__ (line 145) | def __init__(
  class VibeVoiceDiffusionHeadConfig (line 195) | class VibeVoiceDiffusionHeadConfig:
    method __init__ (line 198) | def __init__(
  class VibeVoiceConfig (line 227) | class VibeVoiceConfig:
    method __init__ (line 249) | def __init__(
    method from_dict (line 309) | def from_dict(cls, config_dict: Dict, **kwargs):
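
  Example (illustrative sketch, not from the repo): VibeVoiceConfig.from_dict implies a top-level config that instantiates nested sub-configs (tokenizer, diffusion head, ...) from a plain dict. All field names below are illustrative, not the real schema.

    class SubConfig:
        def __init__(self, hidden_size: int = 768, **kwargs):
            self.hidden_size = hidden_size

    class TopConfig:
        def __init__(self, acoustic_config: SubConfig = None, **kwargs):
            self.acoustic_config = acoustic_config or SubConfig()
        @classmethod
        def from_dict(cls, config_dict: dict, **kwargs):
            # Build each nested sub-config from its own sub-dict.
            sub = SubConfig(**config_dict.get("acoustic_config", {}))
            return cls(acoustic_config=sub, **kwargs)

    cfg = TopConfig.from_dict({"acoustic_config": {"hidden_size": 1024}})
    print(cfg.acoustic_config.hidden_size)  # 1024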

FILE: demo/audio_denoise_deepfilter.py
  function parse_args (line 27) | def parse_args():
  function get_device (line 138) | def get_device(device_arg: str) -> str:
  function load_audio (line 145) | def load_audio(file_path: str) -> Tuple[torch.Tensor, int]:
  function save_audio (line 159) | def save_audio(
  function load_deepfilter_model (line 201) | def load_deepfilter_model(model_name: str, device: str, post_filter: boo...
  function denoise_audio_deepfilter (line 240) | def denoise_audio_deepfilter(
  function process_directory (line 349) | def process_directory(
  function main (line 403) | def main():

FILE: demo/audio_denose.py
  function parse_args (line 23) | def parse_args():
  function get_device (line 117) | def get_device(device_arg: str) -> str:
  function load_audio (line 124) | def load_audio(file_path: str, target_sr: int = 16000, resample: bool = ...
  function save_audio (line 159) | def save_audio(
  function denoise_audio (line 201) | def denoise_audio(
  function main (line 283) | def main():

FILE: demo/convert_model.py
  function parse_args (line 12) | def parse_args():
  function main (line 20) | def main():

FILE: demo/list_modules.py
  function list_moduels (line 7) | def list_moduels(model_path: str):

FILE: demo/local_file_inference.py
  class VoiceMapper (line 16) | class VoiceMapper:
    method __init__ (line 19) | def __init__(self):
    method setup_voice_presets (line 36) | def setup_voice_presets(self):
    method get_voice_path (line 74) | def get_voice_path(self, speaker_name: str) -> str:
  function parse_txt_script (line 91) | def parse_txt_script(txt_content: str) -> Tuple[List[str], List[str]]:
  function parse_args (line 137) | def parse_args():
  function load_model (line 209) | def load_model(model_file: str = None,
  function main (line 233) | def main():

FILE: demo/train.py
  function main (line 12) | def main():

FILE: demo/verify_dataset.py
  class Colors (line 21) | class Colors:
  function parse_metadata_file (line 31) | def parse_metadata_file(file_path: str) -> Dict[str, str]:
  function extract_audio_id (line 45) | def extract_audio_id(file_path: str) -> str:
  function load_dataset (line 50) | def load_dataset(dataset_path: str) -> List[Dict]:
  function verify_dataset (line 67) | def verify_dataset(
  function verify_copy_script (line 200) | def verify_copy_script(script_path: str, dataset_path: str) -> Tuple[int...
  function main (line 276) | def main():

FILE: demo/view_tensorfile.py
  function print_metadata (line 14) | def print_metadata(metadata: dict, format_type: str = "plain"):
  function list_tensor_keys (line 34) | def list_tensor_keys(file_path: str, key_pattern: str = None, show_shape...
  function get_file_stats (line 73) | def get_file_stats(file_path: str):
  function main (line 108) | def main():

FILE: demo/vram_offload_animation.py
  function t (line 155) | def t(key: str, **kwargs) -> str:
  class LayerOffloadingDemo (line 163) | class LayerOffloadingDemo(Scene):
    method construct (line 169) | def construct(self):
    method create_memory_regions (line 197) | def create_memory_regions(self):
    method create_layers (line 260) | def create_layers(self):
    method show_initial_state (line 284) | def show_initial_state(self, all_layers, gpu_region, cpu_region):
    method demonstrate_inference_flow (line 383) | def demonstrate_inference_flow(self, all_layers, gpu_region, cpu_regio...
    method show_summary (line 542) | def show_summary(self):
  class SimpleLayerFlow (line 604) | class SimpleLayerFlow(Scene):
    method construct (line 609) | def construct(self):
  class InferenceTimeline (line 779) | class InferenceTimeline(Scene):
    method construct (line 784) | def construct(self):

FILE: frontend/app/dataset/detail/page.tsx
  function DatasetDetailContent (line 14) | function DatasetDetailContent({ datasetId }: { datasetId: string }) {
  function DatasetDetailPageContent (line 356) | function DatasetDetailPageContent() {
  function DatasetDetailPage (line 397) | function DatasetDetailPage() {

FILE: frontend/app/dataset/page.tsx
  function DatasetPageContent (line 12) | function DatasetPageContent() {
  function DatasetPage (line 162) | function DatasetPage() {

FILE: frontend/app/fine-tuning/page.tsx
  function TrainingContent (line 13) | function TrainingContent() {
  function TrainingPage (line 89) | function TrainingPage() {

FILE: frontend/app/generate-voice/page.tsx
  function GenerateVoiceContent (line 13) | function GenerateVoiceContent() {
  function GenerateVoicePage (line 89) | function GenerateVoicePage() {

FILE: frontend/app/layout.tsx
  function RootLayout (line 39) | function RootLayout({

FILE: frontend/app/page.tsx
  function Home (line 3) | function Home() {

FILE: frontend/app/quick-generate/page.tsx
  constant PRESET_INFO (line 13) | const PRESET_INFO = {
  function QuickGenerateContent (line 31) | function QuickGenerateContent() {
  function QuickGeneratePage (line 1291) | function QuickGeneratePage() {

FILE: frontend/app/speaker-role/page.tsx
  function SpeakerRolePage (line 10) | function SpeakerRolePage() {

FILE: frontend/app/voice-editor/page.tsx
  function VoiceEditorContent (line 16) | function VoiceEditorContent() {
  function VoiceEditorPage (line 223) | function VoiceEditorPage() {

FILE: frontend/components/AudioPlayer.tsx
  type AudioPlayerProps (line 8) | interface AudioPlayerProps {
  function AudioPlayer (line 16) | function AudioPlayer({ voiceFileUrl, voiceFileName, onChangeVoice, onTri...

FILE: frontend/components/AudioUploader.tsx
  type AudioUploaderProps (line 6) | interface AudioUploaderProps {
  function AudioUploader (line 10) | function AudioUploader({ onUpload }: AudioUploaderProps) {

FILE: frontend/components/CreateDatasetModal.tsx
  type CreateDatasetModalProps (line 6) | interface CreateDatasetModalProps {
  function CreateDatasetModal (line 11) | function CreateDatasetModal({ onClose, onCreate }: CreateDatasetModalPro...

FILE: frontend/components/CurrentGeneration.tsx
  function CurrentGeneration (line 17) | function CurrentGeneration() {

FILE: frontend/components/CurrentTraining.tsx
  constant METRICS_REFRESH_INTERVAL_OPTIONS (line 12) | const METRICS_REFRESH_INTERVAL_OPTIONS = [5, 10, 15, 30, 60, 120];
  constant STATE_REFRESH_INTERVAL_OPTIONS (line 13) | const STATE_REFRESH_INTERVAL_OPTIONS = [2, 3, 5, 10, 15, 30];
  function extractDatasetIdFromPath (line 17) | function extractDatasetIdFromPath(datasetPath: string | null): string | ...
  function CurrentTraining (line 23) | function CurrentTraining() {

FILE: frontend/components/DatasetCard.tsx
  type Dataset (line 7) | interface Dataset {
  type DatasetCardProps (line 16) | interface DatasetCardProps {
  function DatasetCard (line 24) | function DatasetCard({ dataset, onDelete, onImport, onExport, onViewDeta...

FILE: frontend/components/DatasetItemModal.tsx
  type DatasetItemModalProps (line 7) | interface DatasetItemModalProps {
  function DatasetItemModal (line 14) | function DatasetItemModal({ onClose, onSave, initialText = "", mode }: D...

FILE: frontend/components/DatasetItemRow.tsx
  type DatasetItemRowProps (line 7) | interface DatasetItemRowProps {
  function DatasetItemRow (line 18) | function DatasetItemRow({

FILE: frontend/components/DialogEditor.tsx
  type DialogEditorProps (line 8) | interface DialogEditorProps {
  function DialogEditor (line 23) | function DialogEditor({

FILE: frontend/components/DialogPreview.tsx
  type DialogPreviewProps (line 7) | interface DialogPreviewProps {
  function DialogPreview (line 12) | function DialogPreview({ dialogLines, speakers }: DialogPreviewProps) {

FILE: frontend/components/GenerationForm.tsx
  constant PRESET_INFO (line 12) | const PRESET_INFO = {
  function GenerationForm (line 33) | function GenerationForm() {

FILE: frontend/components/GenerationHistory.tsx
  function GenerationHistory (line 18) | function GenerationHistory() {

FILE: frontend/components/ImportDatasetModal.tsx
  type ImportDatasetModalProps (line 6) | interface ImportDatasetModalProps {
  function ImportDatasetModal (line 12) | function ImportDatasetModal({ datasetName, onClose, onImport }: ImportDa...

FILE: frontend/components/InlineAudioPlayer.tsx
  type InlineAudioPlayerProps (line 5) | interface InlineAudioPlayerProps {
  function InlineAudioPlayer (line 11) | function InlineAudioPlayer({ audioUrl, filename, onPlay }: InlineAudioPl...

FILE: frontend/components/LayoutWrapper.tsx
  function LayoutWrapper (line 8) | function LayoutWrapper({ children }: { children: React.ReactNode }) {

FILE: frontend/components/Navigation.tsx
  type MenuItem (line 12) | interface MenuItem {
  type MenuGroup (line 19) | interface MenuGroup {
  function Navigation (line 90) | function Navigation() {

FILE: frontend/components/PresetVoiceManager.tsx
  type PresetVoiceManagerProps (line 10) | interface PresetVoiceManagerProps {
  function PresetVoiceManagerContent (line 15) | function PresetVoiceManagerContent({ onClose }: { onClose: () => void }) {
  function PresetVoiceManager (line 539) | function PresetVoiceManager({ isOpen, onClose }: PresetVoiceManagerProps) {

FILE: frontend/components/PresetVoiceSelector.tsx
  type PresetVoiceSelectorProps (line 8) | interface PresetVoiceSelectorProps {
  function PresetVoiceSelector (line 12) | function PresetVoiceSelector({ onSelect }: PresetVoiceSelectorProps) {

FILE: frontend/components/ProjectSelector.tsx
  function validateProjectName (line 18) | function validateProjectName(name: string, t: (key: string) => string): ...
  function ProjectSelector (line 41) | function ProjectSelector() {

FILE: frontend/components/QuickGenerateHistory.tsx
  type QuickGenerateHistoryProps (line 9) | interface QuickGenerateHistoryProps {
  function QuickGenerateHistory (line 15) | function QuickGenerateHistory({ onSelectGeneration, currentGenerationId,...

FILE: frontend/components/QuickGenerateNavigation.tsx
  function QuickGenerateNavigation (line 10) | function QuickGenerateNavigation() {

FILE: frontend/components/SessionManager.tsx
  function SessionManager (line 11) | function SessionManager() {

FILE: frontend/components/SpeakerList.tsx
  type SpeakerListProps (line 5) | interface SpeakerListProps {
  function SpeakerList (line 13) | function SpeakerList({

FILE: frontend/components/SpeakerRoleManager.tsx
  function SpeakerRoleManager (line 15) | function SpeakerRoleManager() {

FILE: frontend/components/SpeakerSelector.tsx
  type SpeakerSelectorProps (line 6) | interface SpeakerSelectorProps {
  function SpeakerSelector (line 13) | function SpeakerSelector({

FILE: frontend/components/TextEditor.tsx
  type TextEditorProps (line 5) | interface TextEditorProps {
  function TextEditor (line 10) | function TextEditor({ speaker, onContentChange }: TextEditorProps) {

FILE: frontend/components/TrainingForm.tsx
  constant PRESET_INFO (line 12) | const PRESET_INFO = {
  function TrainingForm (line 33) | function TrainingForm() {

FILE: frontend/components/TrainingHistory.tsx
  function extractDatasetIdFromPath (line 15) | function extractDatasetIdFromPath(datasetPath: string | null): string | ...
  function TrainingHistory (line 21) | function TrainingHistory() {

FILE: frontend/components/TrainingMetricsChart.tsx
  type TrainingMetricsChartProps (line 9) | interface TrainingMetricsChartProps {
  function TrainingMetricsChart (line 16) | function TrainingMetricsChart({

FILE: frontend/components/VoicePreview.tsx
  type VoicePreviewProps (line 6) | interface VoicePreviewProps {
  function VoicePreview (line 11) | function VoicePreview({ speaker, onVoiceFileChange }: VoicePreviewProps) {

FILE: frontend/components/VoiceRecorder.tsx
  type VoiceRecorderProps (line 6) | interface VoiceRecorderProps {
  function VoiceRecorder (line 10) | function VoiceRecorder({ onSave }: VoiceRecorderProps) {

FILE: frontend/lib/DatasetContext.tsx
  type DatasetContextType (line 6) | interface DatasetContextType {
  function DatasetProvider (line 21) | function DatasetProvider({ children, projectId }: { children: React.Reac...
  function useDataset (line 157) | function useDataset(): DatasetContextType {

FILE: frontend/lib/DatasetItemsContext.tsx
  constant ITEMS_PER_PAGE_KEY_PREFIX (line 6) | const ITEMS_PER_PAGE_KEY_PREFIX = "vibevoice-dataset-items-per-page-";
  constant DEFAULT_ITEMS_PER_PAGE (line 7) | const DEFAULT_ITEMS_PER_PAGE = 20;
  constant MIN_ITEMS_PER_PAGE (line 9) | const MIN_ITEMS_PER_PAGE = 1;
  constant MAX_ITEMS_PER_PAGE (line 10) | const MAX_ITEMS_PER_PAGE = 500;
  function getSavedItemsPerPage (line 13) | function getSavedItemsPerPage(datasetId: string): number {
  function saveItemsPerPage (line 26) | function saveItemsPerPage(datasetId: string, count: number): void {
  type DatasetItemsContextType (line 31) | interface DatasetItemsContextType {
  function DatasetItemsProvider (line 51) | function DatasetItemsProvider({
  function useDatasetItems (line 212) | function useDatasetItems(): DatasetItemsContextType {

FILE: frontend/lib/GenerationContext.tsx
  type GenerationContextType (line 8) | interface GenerationContextType {
  type GenerationProviderProps (line 26) | interface GenerationProviderProps {
  function GenerationProvider (line 31) | function GenerationProvider({ children, projectId }: GenerationProviderP...
  function useGeneration (line 294) | function useGeneration() {

FILE: frontend/lib/GlobalTaskContext.tsx
  type GlobalTaskContextType (line 7) | interface GlobalTaskContextType {
  function GlobalTaskProvider (line 14) | function GlobalTaskProvider({ children }: { children: React.ReactNode }) {
  function useGlobalTask (line 56) | function useGlobalTask() {

FILE: frontend/lib/PresetVoiceContext.tsx
  type PresetVoiceContextType (line 7) | interface PresetVoiceContextType {
  constant DEFAULT_LIMIT (line 48) | const DEFAULT_LIMIT = 10;
  function PresetVoiceProvider (line 50) | function PresetVoiceProvider({ children }: { children: ReactNode }) {
  function usePresetVoice (line 191) | function usePresetVoice() {

FILE: frontend/lib/ProjectContext.tsx
  function ProjectProvider (line 10) | function ProjectProvider({ children }: { children: React.ReactNode }) {
  function useProject (line 159) | function useProject() {

FILE: frontend/lib/SessionContext.tsx
  type CreateSessionOptions (line 9) | interface CreateSessionOptions {
  type SessionContextType (line 16) | interface SessionContextType {
  function SessionProvider (line 30) | function SessionProvider({ children }: { children: React.ReactNode }) {
  function useSession (line 398) | function useSession() {

FILE: frontend/lib/SpeakerRoleContext.tsx
  type SpeakerRoleContextType (line 8) | interface SpeakerRoleContextType {
  function SpeakerRoleProvider (line 23) | function SpeakerRoleProvider({ children, projectId }: { children: React....
  function useSpeakerRole (line 272) | function useSpeakerRole() {

FILE: frontend/lib/TrainingContext.tsx
  type TrainingContextType (line 7) | interface TrainingContextType {
  type TrainingProviderProps (line 29) | interface TrainingProviderProps {
  function TrainingProvider (line 34) | function TrainingProvider({ children, projectId }: TrainingProviderProps) {
  function useTraining (line 336) | function useTraining() {

FILE: frontend/lib/api.ts
  constant API_BASE_URL (line 45) | const API_BASE_URL = process.env.NODE_ENV === 'development'
  type Project (line 49) | interface Project {
  type Speaker (line 57) | interface Speaker {
  type SessionMode (line 65) | type SessionMode = 'dialogue' | 'narration';
  type DialogSession (line 67) | interface DialogSession {
  type Dataset (line 78) | interface Dataset {
  type LoRAFile (line 87) | interface LoRAFile {
  type DatasetItem (line 94) | interface DatasetItem {
  class ApiClient (line 100) | class ApiClient {
    method constructor (line 103) | constructor(baseUrl: string = API_BASE_URL) {
    method fetch (line 107) | private async fetch<T>(
    method listProjects (line 147) | async listProjects(): Promise<{ projects: Project[]; count: number }> {
    method getProject (line 151) | async getProject(projectId: string): Promise<Project> {
    method createProject (line 155) | async createProject(data: {
    method updateProject (line 165) | async updateProject(
    method deleteProject (line 175) | async deleteProject(projectId: string): Promise<{ message: string; pro...
    method listSpeakers (line 183) | async listSpeakers(projectId: string): Promise<{ speakers: Speaker[]; ...
    method getSpeaker (line 187) | async getSpeaker(projectId: string, speakerId: string): Promise<Speake...
    method createSpeaker (line 193) | async createSpeaker(
    method updateSpeaker (line 223) | async updateSpeaker(
    method deleteSpeaker (line 237) | async deleteSpeaker(
    method updateVoiceFile (line 249) | async updateVoiceFile(
    method trimVoiceFile (line 274) | async trimVoiceFile(
    method getVoiceFileUrl (line 289) | getVoiceFileUrl(projectId: string, speakerId: string): string {
    method createSpeakerFromPreset (line 293) | async createSpeakerFromPreset(
    method listPresetVoices (line 311) | async listPresetVoices(options?: {
    method getPresetVoice (line 335) | async getPresetVoice(filename: string): Promise<PresetVoice> {
    method createPresetVoice (line 339) | async createPresetVoice(data: {
    method deletePresetVoice (line 377) | async deletePresetVoice(filename: string): Promise<DeletePresetRespons...
    method batchDeletePresetVoices (line 383) | async batchDeletePresetVoices(
    method listPresetLanguages (line 392) | async listPresetLanguages(): Promise<{ languages: PresetLanguage[] }> {
    method getPresetPreviewUrl (line 396) | getPresetPreviewUrl(filename: string): string {
    method listSessions (line 402) | async listSessions(projectId: string): Promise<{ sessions: DialogSessi...
    method getSession (line 406) | async getSession(projectId: string, sessionId: string): Promise<Dialog...
    method createSession (line 412) | async createSession(
    method updateSession (line 428) | async updateSession(
    method deleteSession (line 448) | async deleteSession(
    method getSessionText (line 460) | async getSessionText(
    method getSessionDownloadUrl (line 469) | getSessionDownloadUrl(projectId: string, sessionId: string): string {
    method getCurrentTask (line 475) | async getCurrentTask(): Promise<CurrentTaskResponse> {
    method createGeneration (line 481) | async createGeneration(
    method getCurrentGeneration (line 491) | async getCurrentGeneration(): Promise<CurrentGenerationResponse> {
    method getCurrentGenerationForProject (line 495) | async getCurrentGenerationForProject(projectId: string): Promise<Curre...
    method listGenerations (line 499) | async listGenerations(projectId: string): Promise<ListGenerationsRespo...
    method getGeneration (line 503) | async getGeneration(projectId: string, requestId: string): Promise<Get...
    method deleteGeneration (line 507) | async deleteGeneration(
    method batchDeleteGenerations (line 519) | async batchDeleteGenerations(
    method getGenerationDownloadUrl (line 538) | getGenerationDownloadUrl(projectId: string, requestId: string): string {
    method getGenerationItemDownloadUrl (line 542) | getGenerationItemDownloadUrl(projectId: string, requestId: string, ite...
    method listDatasets (line 548) | async listDatasets(projectId: string): Promise<{ datasets: Dataset[]; ...
    method getDataset (line 552) | async getDataset(projectId: string, datasetId: string): Promise<Datase...
    method createDataset (line 558) | async createDataset(
    method updateDataset (line 571) | async updateDataset(
    method deleteDataset (line 585) | async deleteDataset(
    method getDatasetExportUrl (line 597) | getDatasetExportUrl(projectId: string, datasetId: string): string {
    method importToExistingDataset (line 601) | async importToExistingDataset(
    method importDataset (line 633) | async importDataset(
    method listDatasetItems (line 672) | async listDatasetItems(
    method createDatasetItem (line 700) | async createDatasetItem(
    method updateDatasetItem (line 740) | async updateDatasetItem(
    method deleteDatasetItem (line 787) | async deleteDatasetItem(
    method createTrainingJob (line 802) | async createTrainingJob(
    method listTrainingStates (line 812) | async listTrainingStates(
    method getCurrentTrainingState (line 818) | async getCurrentTrainingState(
    method getTrainingState (line 824) | async getTrainingState(
    method deleteTrainingJob (line 833) | async deleteTrainingJob(
    method batchDeleteTrainingJobs (line 845) | async batchDeleteTrainingJobs(
    method getTrainingMetrics (line 862) | async getTrainingMetrics(
    method downloadLoRAFile (line 913) | downloadLoRAFile(projectId: string, jobId: string, filename: string): ...
    method listLoRAFiles (line 928) | async listLoRAFiles(projectId: string): Promise<{ lora_files: LoRAFile...
    method startQuickGeneration (line 937) | async startQuickGeneration(data: {
    method getQuickGeneration (line 1002) | async getQuickGeneration(requestId: string): Promise<QuickGenerate> {
    method getCurrentQuickGeneration (line 1009) | async getCurrentQuickGeneration(): Promise<CurrentQuickGenerateRespons...
    method listQuickGenerationHistory (line 1016) | async listQuickGenerationHistory(options?: {
    method deleteQuickGeneration (line 1035) | async deleteQuickGeneration(requestId: string): Promise<{ message: str...
    method getQuickGenerationDownloadUrl (line 1044) | getQuickGenerationDownloadUrl(requestId: string): string {
    method getQuickGenerationItemDownloadUrl (line 1051) | getQuickGenerationItemDownloadUrl(requestId: string, itemIndex: number...
    method getQuickGenerationVoicePreviewUrl (line 1058) | getQuickGenerationVoicePreviewUrl(requestId: string): string {
    method getQuickGenerationVoicePreviewByIndexUrl (line 1065) | getQuickGenerationVoicePreviewByIndexUrl(requestId: string, voiceIndex...

FILE: frontend/lib/audioUtils.ts
  function convertToWav (line 10) | async function convertToWav(audioBlob: Blob): Promise<Blob> {
  function audioBufferToWav (line 35) | function audioBufferToWav(audioBuffer: AudioBuffer): Blob {
  function writeString (line 82) | function writeString(view: DataView, offset: number, string: string): vo...

FILE: frontend/lib/i18n/LanguageContext.tsx
  type Messages (line 9) | type Messages = typeof enMessages;
  type LanguageContextType (line 11) | interface LanguageContextType {
  function getBrowserLocale (line 25) | function getBrowserLocale(): Locale {
  function getSavedLocale (line 38) | function getSavedLocale(): Locale | null {
  function LanguageProvider (line 49) | function LanguageProvider({ children }: { children: React.ReactNode }) {
  function useLanguage (line 119) | function useLanguage() {

FILE: frontend/lib/i18n/config.ts
  type Locale (line 6) | type Locale = (typeof i18n)['locales'][number];

FILE: frontend/next.config.ts
  method rewrites (line 9) | async rewrites() {

FILE: frontend/scripts/generate-version.js
  function getGitVersion (line 14) | function getGitVersion() {
  function main (line 33) | function main() {

FILE: frontend/types/dialog.ts
  type DialogLine (line 1) | interface DialogLine {
  type SpeakerInfo (line 7) | interface SpeakerInfo {
  type SessionMode (line 14) | type SessionMode = 'dialogue' | 'narration';
  type DialogSession (line 16) | interface DialogSession {

FILE: frontend/types/generation.ts
  type InferencePhase (line 9) | enum InferencePhase {
  type ModelDtype (line 21) | type ModelDtype = 'bf16' | 'float8_e4m3fn';
  type GenerationItem (line 27) | interface GenerationItem {
  type GenerationDetails (line 44) | interface GenerationDetails {
  type Generation (line 70) | interface Generation {
  type OffloadingMode (line 101) | type OffloadingMode = 'preset' | 'manual';
  type OffloadingPreset (line 106) | type OffloadingPreset = 'balanced' | 'aggressive' | 'extreme';
  type OffloadingConfig (line 111) | interface OffloadingConfig {
  type OffloadingTimeBreakdown (line 121) | interface OffloadingTimeBreakdown {
  type OffloadingMetrics (line 130) | interface OffloadingMetrics {
  type CreateGenerationRequest (line 145) | interface CreateGenerationRequest {
  type CreateGenerationResponse (line 160) | interface CreateGenerationResponse {
  type CurrentGenerationResponse (line 170) | interface CurrentGenerationResponse {
  type ListGenerationsResponse (line 178) | interface ListGenerationsResponse {
  type GetGenerationResponse (line 186) | interface GetGenerationResponse {
  function getOffloadingConfig (line 193) | function getOffloadingConfig(generation: Generation): OffloadingConfig |...
  function getOffloadingMetrics (line 200) | function getOffloadingMetrics(generation: Generation): OffloadingMetrics...
  function getLoraDisplayName (line 208) | function getLoraDisplayName(loraPath: string | null | undefined): string...
  function isMultiGeneration (line 230) | function isMultiGeneration(generation: Generation): boolean {
  function getGenerationItems (line 238) | function getGenerationItems(generation: Generation): GenerationItem[] {
  function getCompletedItemsCount (line 245) | function getCompletedItemsCount(generation: Generation): number {
  function getMultiGenerationStats (line 253) | function getMultiGenerationStats(generation: Generation): {
  function getAudioFilename (line 295) | function getAudioFilename(audioPath: string): string {

FILE: frontend/types/preset.ts
  type PresetVoice (line 12) | interface PresetVoice {
  type PresetLanguage (line 21) | interface PresetLanguage {
  type ListPresetsResponse (line 27) | interface ListPresetsResponse {
  type ListPresetLanguagesResponse (line 35) | interface ListPresetLanguagesResponse {
  type CreatePresetRequest (line 39) | interface CreatePresetRequest {
  type DeletePresetResponse (line 47) | interface DeletePresetResponse {
  type BatchDeletePresetsResponse (line 52) | interface BatchDeletePresetsResponse {

FILE: frontend/types/project.ts
  type Project (line 1) | interface Project {
  type ProjectContextType (line 9) | interface ProjectContextType {

FILE: frontend/types/quickGenerate.ts
  type QuickGenerateMode (line 10) | type QuickGenerateMode = 'dialogue' | 'narration';
  type QuickGenerateItem (line 15) | interface QuickGenerateItem {
  type QuickGenerateDetails (line 29) | interface QuickGenerateDetails {
  type QuickGenerate (line 38) | interface QuickGenerate {
  type StartQuickGenerateRequest (line 66) | interface StartQuickGenerateRequest {
  type StartQuickGenerateResponse (line 80) | interface StartQuickGenerateResponse {
  type CurrentQuickGenerateResponse (line 90) | interface CurrentQuickGenerateResponse {
  type QuickGenerateHistoryItem (line 98) | interface QuickGenerateHistoryItem {
  type QuickGenerateHistoryResponse (line 111) | interface QuickGenerateHistoryResponse {
  function isQuickMultiGeneration (line 120) | function isQuickMultiGeneration(quickGen: QuickGenerate): boolean {
  function getQuickCompletedItemsCount (line 127) | function getQuickCompletedItemsCount(quickGen: QuickGenerate): number {
  function getQuickMultiGenerationStats (line 137) | function getQuickMultiGenerationStats(quickGen: QuickGenerate): {

FILE: frontend/types/speaker.ts
  type Speaker (line 1) | interface Speaker {
  type SpeakerData (line 8) | interface SpeakerData {
  type SpeakerRole (line 14) | interface SpeakerRole {
  type VoiceFile (line 24) | interface VoiceFile {
  type AudioFileExtension (line 32) | type AudioFileExtension = '.wav' | '.mp3' | '.m4a' | '.flac' | '.webm';
  constant ACCEPTED_AUDIO_TYPES (line 33) | const ACCEPTED_AUDIO_TYPES = ['audio/wav', 'audio/mpeg', 'audio/mp4', 'a...
  constant ACCEPTED_AUDIO_EXTENSIONS (line 34) | const ACCEPTED_AUDIO_EXTENSIONS: AudioFileExtension[] = ['.wav', '.mp3',...

FILE: frontend/types/task.ts
  type TaskType (line 12) | type TaskType = 'inference' | 'training' | 'quick_generation' | null;
  type TaskData (line 17) | type TaskData = Generation | TrainingState | QuickGenerate | null;
  type CurrentTask (line 22) | interface CurrentTask {
  type CurrentTaskResponse (line 31) | interface CurrentTaskResponse {
  function isInferenceTask (line 39) | function isInferenceTask(task: CurrentTask): task is CurrentTask & { typ...
  function isTrainingTask (line 46) | function isTrainingTask(task: CurrentTask): task is CurrentTask & { type...
  function isQuickGenerationTask (line 53) | function isQuickGenerationTask(task: CurrentTask): task is CurrentTask &...
  function hasActiveTask (line 60) | function hasActiveTask(task: CurrentTask | null): boolean {

FILE: frontend/types/training.ts
  type TrainingStatus (line 8) | type TrainingStatus = 'Prepare' | 'Training' | 'Completed' | 'Failed';
  type OptimizerType (line 13) | type OptimizerType = 'AdamW' | 'AdamW8bit';
  type TrainingDtype (line 18) | type TrainingDtype = 'bfloat16' | 'float8_e4m3fn';
  type TrainingState (line 23) | interface TrainingState {
  type TrainConfig (line 82) | interface TrainConfig {
  type CreateTrainingRequest (line 114) | interface CreateTrainingRequest {
  type CreateTrainingResponse (line 122) | interface CreateTrainingResponse {
  type CurrentTrainingResponse (line 131) | interface CurrentTrainingResponse {
  type ListTrainingStatesResponse (line 139) | interface ListTrainingStatesResponse {
  type GetTrainingStateResponse (line 147) | interface GetTrainingStateResponse {
  type DeleteTrainingResponse (line 154) | interface DeleteTrainingResponse {
  type BatchDeleteTrainingResponse (line 162) | interface BatchDeleteTrainingResponse {
  type MetricDataPoint (line 173) | interface MetricDataPoint {
  type LossMetrics (line 182) | interface LossMetrics {
  type TimingMetrics (line 194) | interface TimingMetrics {
  type TrainingMetrics (line 203) | interface TrainingMetrics {
  type GetTrainingMetricsResponse (line 213) | interface GetTrainingMetricsResponse {
  constant DEFAULT_TRAIN_CONFIG (line 222) | const DEFAULT_TRAIN_CONFIG: Partial<TrainConfig> = {

FILE: test_generation_offloading.py
  class GenerationMetrics (line 61) | class GenerationMetrics:
  function parse_txt_script (line 81) | def parse_txt_script(txt_content: str) -> Tuple[List[str], List[str]]:
  function get_vram_usage (line 121) | def get_vram_usage() -> Tuple[float, float]:
  function load_model (line 132) | def load_model(model_file: str, config_path: str, dtype: torch.dtype,
  function run_generation (line 177) | def run_generation(args, offload_config: Optional[OffloadConfig] = None)...
  function print_summary (line 347) | def print_summary(metrics: GenerationMetrics, config_name: str = "Defaul...
  function benchmark_configurations (line 399) | def benchmark_configurations(args):
  function parse_args (line 471) | def parse_args():
  function main (line 597) | def main():
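
  Example (illustrative sketch, not from the repo): VRAM measurement in the spirit of get_vram_usage, using standard torch.cuda calls; returning (allocated_gb, reserved_gb) is an assumed convention, not the script's verified return shape.

    import torch

    def get_vram_usage():
        if not torch.cuda.is_available():
            return 0.0, 0.0
        allocated = torch.cuda.memory_allocated() / 1024**3  # GiB in use by tensors
        reserved = torch.cuda.memory_reserved() / 1024**3    # GiB held by the allocator
        return allocated, reserved

    alloc_gb, reserved_gb = get_vram_usage()
    print(f"VRAM allocated: {alloc_gb:.2f} GiB, reserved: {reserved_gb:.2f} GiB")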

FILE: test_offloading.py
  function get_memory_usage (line 35) | def get_memory_usage():
  function test_model_loading (line 44) | def test_model_loading(offload_config=None, model_path="models/converted...
  function test_inference_speed (line 119) | def test_inference_speed(model, num_iterations=5):
  function main (line 193) | def main():

FILE: tests/test_logging.py
  function test_basic_usage (line 12) | def test_basic_usage():
  function test_custom_level (line 23) | def test_custom_level():
  function test_custom_format (line 32) | def test_custom_format():
  function test_file_logging (line 44) | def test_file_logging():
  function test_custom_handler (line 64) | def test_custom_handler():
  function test_logger_reuse (line 94) | def test_logger_reuse():
  function test_environment_levels (line 104) | def test_environment_levels():
  function test_reset_logger (line 119) | def test_reset_logger():
  function test_hierarchical_loggers (line 133) | def test_hierarchical_loggers():
  function test_no_propagation (line 148) | def test_no_propagation():
  function demo_real_world_usage (line 161) | def demo_real_world_usage():

FILE: tests/test_lora_network.py
  class TestLoRANetwork (line 6) | class TestLoRANetwork:
    method test_includes_layers (line 9) | def test_includes_layers(self):
    method test_includes_layers_specific_patterns (line 76) | def test_includes_layers_specific_patterns(self):

FILE: tests/test_training_service.py
  class TestDateTimeEncoder (line 16) | class TestDateTimeEncoder:
    method test_datetime_serialization (line 19) | def test_datetime_serialization(self):
    method test_nested_datetime_serialization (line 29) | def test_nested_datetime_serialization(self):
    method test_non_datetime_objects_still_work (line 46) | def test_non_datetime_objects_still_work(self):
  class TestFileHandlerJsonSerialization (line 62) | class TestFileHandlerJsonSerialization:
    method setup_method (line 65) | def setup_method(self):
    method teardown_method (line 70) | def teardown_method(self):
    method test_write_json_with_datetime (line 74) | def test_write_json_with_datetime(self):
    method test_write_json_atomic_with_datetime (line 94) | def test_write_json_atomic_with_datetime(self):
  class TestTrainingStateJsonSerialization (line 115) | class TestTrainingStateJsonSerialization:
    method setup_method (line 118) | def setup_method(self):
    method teardown_method (line 123) | def teardown_method(self):
    method test_training_state_round_trip (line 127) | def test_training_state_round_trip(self):
    method test_training_state_with_datetime_objects_in_dict (line 158) | def test_training_state_with_datetime_objects_in_dict(self):
    method test_fallback_datetime_encoder_handles_raw_datetime (line 191) | def test_fallback_datetime_encoder_handles_raw_datetime(self):
  class TestTrainingServiceListJobs (line 216) | class TestTrainingServiceListJobs:
    method setup_method (line 219) | def setup_method(self):
    method teardown_method (line 224) | def teardown_method(self):
    method test_list_jobs_with_orphaned_training_status (line 228) | def test_list_jobs_with_orphaned_training_status(self):
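
The tests above exercise a `DateTimeEncoder` that keeps `datetime` values serializable when training state is written to JSON. The encoder's body is not shown in this index; the sketch below is the standard `json.JSONEncoder` override pattern those test names imply, not the repo's exact code.

```python
# Sketch of a datetime-aware JSON encoder like the DateTimeEncoder
# these tests target. The real implementation may differ in details
# (e.g. timezone handling); only the override pattern is standard.
import json
from datetime import datetime

class DateTimeEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, datetime):
            return o.isoformat()  # e.g. "2025-01-01T12:00:00"
        return super().default(o)

state = {"job_id": "abc", "created_at": datetime(2025, 1, 1, 12, 0)}
print(json.dumps(state, cls=DateTimeEncoder))
# {"job_id": "abc", "created_at": "2025-01-01T12:00:00"}
```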

FILE: util/float8_scale.py
  function cast_bias_weight (line 12) | def cast_bias_weight(module, input :torch.Tensor=None, dtype: torch.dtyp...
  class ResetParametersMixin (line 38) | class ResetParametersMixin:
    method reset_parameters (line 39) | def reset_parameters(self):
  class AutoCast (line 42) | class AutoCast:
    class Linear (line 43) | class Linear(nn.Linear, ResetParametersMixin):
      method fp8_linear (line 45) | def fp8_linear(self, input):
      method forward_comfy_cast_weights (line 90) | def forward_comfy_cast_weights(self, input):
      method forward (line 98) | def forward(self, *args, **kwargs):
    class Conv1d (line 103) | class Conv1d(torch.nn.Conv1d, ResetParametersMixin):
      method reset_parameters (line 104) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 107) | def forward_comfy_cast_weights(self, input):
      method forward (line 111) | def forward(self, *args, **kwargs):
    class Conv2d (line 117) | class Conv2d(torch.nn.Conv2d, ResetParametersMixin):
      method reset_parameters (line 118) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 121) | def forward_comfy_cast_weights(self, input):
      method forward (line 125) | def forward(self, *args, **kwargs):
    class GroupNorm (line 131) | class GroupNorm(torch.nn.GroupNorm, ResetParametersMixin):
      method reset_parameters (line 132) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 135) | def forward_comfy_cast_weights(self, input):
      method forward (line 139) | def forward(self, *args, **kwargs):
    class LayerNorm (line 145) | class LayerNorm(torch.nn.LayerNorm, ResetParametersMixin):
      method reset_parameters (line 146) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 149) | def forward_comfy_cast_weights(self, input):
      method forward (line 157) | def forward(self, *args, **kwargs):
    class QwenRMSNorm (line 163) | class QwenRMSNorm(QwenRMSNorm, ResetParametersMixin):
      method reset_parameters (line 164) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 168) | def forward_comfy_cast_weights(self, input):
      method forward (line 176) | def forward(self, *args, **kwargs):
    class ConvTranspose2d (line 182) | class ConvTranspose2d(torch.nn.ConvTranspose2d, ResetParametersMixin):
      method reset_parameters (line 183) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 186) | def forward_comfy_cast_weights(self, input, output_size=None):
      method forward (line 197) | def forward(self, *args, **kwargs):
    class ConvTranspose1d (line 203) | class ConvTranspose1d(torch.nn.ConvTranspose1d, ResetParametersMixin):
      method reset_parameters (line 204) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 207) | def forward_comfy_cast_weights(self, input, output_size=None):
      method forward (line 218) | def forward(self, *args, **kwargs):
    class Embedding (line 224) | class Embedding(torch.nn.Embedding, ResetParametersMixin):
      method reset_parameters (line 225) | def reset_parameters(self):
      method forward_comfy_cast_weights (line 229) | def forward_comfy_cast_weights(self, input, out_dtype=None):
      method forward (line 234) | def forward(self, *args, **kwargs):
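
The `AutoCast` classes wrap common torch layers so that weights stored in a low-precision dtype (e.g. `float8_e4m3fn`) are cast to the compute dtype at call time instead of being held in full precision. Below is a minimal sketch of that cast-on-forward idea for `Linear` only; it is illustrative, not the repo's code, which also carries a dedicated fp8 matmul path and `forward_comfy_cast_weights`.

```python
# Illustrative cast-on-forward Linear, loosely modeled on
# util/float8_scale.py's AutoCast.Linear. Not the repo's code:
# the real version adds fp8-specific scaling and matmul paths.
import torch
import torch.nn as nn
import torch.nn.functional as F

class CastLinear(nn.Linear):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Cast storage-dtype weights to the input's compute dtype.
        weight = self.weight.to(dtype=x.dtype, device=x.device)
        bias = None if self.bias is None else self.bias.to(dtype=x.dtype, device=x.device)
        return F.linear(x, weight, bias)

layer = CastLinear(8, 4)
# bfloat16 stands in for a float8 storage dtype in this demo.
layer.weight.data = layer.weight.data.to(torch.bfloat16)
out = layer(torch.randn(2, 8))  # compute happens in float32
print(out.dtype)                # torch.float32
```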

FILE: util/logger.py
  function _get_log_level (line 50) | def _get_log_level(level: Optional[Union[int, str]] = None) -> int:
  function _create_console_handler (line 80) | def _create_console_handler(
  function _create_file_handler (line 102) | def _create_file_handler(
  function get_logger (line 153) | def get_logger(
  function configure_root_logger (line 271) | def configure_root_logger(
  function reset_logger (line 298) | def reset_logger(name: str) -> None:
  function reset_all_loggers (line 311) | def reset_all_loggers() -> None:

FILE: util/logger_examples.py
  class VoiceGenerator (line 43) | class VoiceGenerator:
    method __init__ (line 44) | def __init__(self):
    method generate (line 47) | def generate(self, text):

FILE: util/model_utils.py
  function addnet_hash_legacy (line 11) | def addnet_hash_legacy(b):
  function addnet_hash_safetensors (line 19) | def addnet_hash_safetensors(b):
  function precalculate_safetensors_hashes (line 36) | def precalculate_safetensors_hashes(tensors, metadata):
  function merge_lora_weights (line 53) | def merge_lora_weights(model: nn.Module, lora_path: str, lora_weight: fl...
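
`merge_lora_weights(model, lora_path, lora_weight)` folds trained LoRA deltas back into base weights. The repo's key-matching logic is not visible here, but the arithmetic of a standard LoRA merge, which the signature suggests, is W' = W + weight · (alpha / rank) · (up @ down). A sketch of just that arithmetic; the "lora_down"/"lora_up"/"alpha" naming is an assumption.

```python
# Standard LoRA merge arithmetic, as suggested by the
# merge_lora_weights(model, lora_path, lora_weight) signature.
# Tensor naming ("down"/"up"/"alpha") is an assumption here.
import torch

def merge_one(base_w: torch.Tensor, down: torch.Tensor,
              up: torch.Tensor, alpha: float, weight: float) -> torch.Tensor:
    rank = down.shape[0]
    scale = alpha / rank
    # up @ down has the same shape as the base weight matrix.
    return base_w + weight * scale * (up @ down)

base = torch.zeros(16, 32)
down = torch.randn(4, 32)   # rank 4
up = torch.randn(16, 4)
merged = merge_one(base, down, up, alpha=4.0, weight=1.0)
print(merged.shape)  # torch.Size([16, 32])
```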

FILE: util/rand_init.py
  function get_generator (line 7) | def get_generator(seeds: int = 42, force_set: bool = False) -> torch.Gen...

FILE: util/safetensors_util.py
  class SafetensorFileInfo (line 7) | class SafetensorFileInfo:
    method __init__ (line 8) | def __init__(self, filename:  str):
    method _read_header (line 13) | def _read_header(self, file):
    method keys (line 25) | def keys(self):
  class MemoryEfficientSafeOpen (line 34) | class MemoryEfficientSafeOpen:
    method __init__ (line 41) | def __init__(self, filename):
    method __enter__ (line 51) | def __enter__(self):
    method __exit__ (line 55) | def __exit__(self, exc_type, exc_val, exc_tb):
    method keys (line 59) | def keys(self):
    method metadata (line 67) | def metadata(self) -> Dict[str, str]:
    method _read_header (line 75) | def _read_header(self):
    method get_tensor (line 87) | def get_tensor(self, key: str, device: Optional[torch.device] = None, ...
    method _deserialize_tensor (line 162) | def _deserialize_tensor(self, byte_tensor: torch.Tensor, metadata: Dict):
    method _get_torch_dtype (line 183) | def _get_torch_dtype(dtype_str):
    method _convert_float8 (line 213) | def _convert_float8(byte_tensor, dtype_str, shape):
  class MultipleSafetensorLoader (line 236) | class MultipleSafetensorLoader:
    method __init__ (line 238) | def __init__(self, model_index_file: str):
    method load_dict (line 252) | def load_dict(self) -> Dict:
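
`MemoryEfficientSafeOpen` reads individual tensors from a safetensors file without loading the whole file. The on-disk layout is public: an 8-byte little-endian header length, then a JSON header mapping tensor names to dtype/shape/byte offsets, then raw tensor data. An independent sketch of that header read, in the spirit of `_read_header` (not the repo's code):

```python
# Minimal safetensors header reader, following the documented file
# layout (8-byte little-endian length prefix + JSON header). This is
# an independent sketch, not the repo's MemoryEfficientSafeOpen.
import json
import struct

def read_safetensors_header(path: str) -> dict:
    with open(path, "rb") as f:
        (header_len,) = struct.unpack("<Q", f.read(8))
        header = json.loads(f.read(header_len))
    # "__metadata__" holds free-form string metadata; every other key
    # maps a tensor name to {"dtype", "shape", "data_offsets"}.
    return header

# Example (path is hypothetical):
# header = read_safetensors_header("model.safetensors")
# print([k for k in header if k != "__metadata__"][:5])
```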

FILE: util/vibevoice_norm.py
  class QwenRMSNorm (line 6) | class QwenRMSNorm(nn.Module):
    method __init__ (line 7) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 15) | def forward(self, hidden_states, weights=None):
    method extra_repr (line 26) | def extra_repr(self):
  class RMSNorm (line 29) | class RMSNorm(nn.Module):
    method __init__ (line 30) | def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=Tru...
    method _norm (line 41) | def _norm(self, x):
    method forward (line 44) | def forward(self, x):
    method extra_repr (line 50) | def extra_repr(self) -> str:
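
Both classes implement root-mean-square layer normalization: y = x / sqrt(mean(x²) + eps) · weight. A compact reference implementation of that formula, matching the signatures above in spirit (the float32 upcast is a common stability choice and an assumption here):

```python
# Reference RMSNorm matching the formula these classes implement:
#   y = x / sqrt(mean(x^2, dim=-1) + eps) * weight
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        dtype = x.dtype
        x = x.float()  # upcast for the reduction (assumed choice)
        rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
        return (x * rms).to(dtype) * self.weight

x = torch.randn(2, 5, 8)
print(RMSNorm(8)(x).shape)  # torch.Size([2, 5, 8])
```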

FILE: vibevoice/generation/visitor.py
  class GenerationVisitor (line 5) | class GenerationVisitor(ABC):
    method visit_preprocessing (line 8) | def visit_preprocessing(self, timestamp: float = None):
    method visit_inference_start (line 12) | def visit_inference_start(self, scripts: List[str] = None,
    method visit_inference_batch_start (line 19) | def visit_inference_batch_start(self, batch_index: int, seeds: int):
    method visit_inference_batch_end (line 23) | def visit_inference_batch_end(self, batch_index: int):
    method visit_inference_save_audio_file (line 27) | def visit_inference_save_audio_file(self, output_audio_path: str = None,
    method visit_inference_step_start (line 38) | def visit_inference_step_start(self, current_step: int, total_steps: i...
    method visit_inference_step_end (line 42) | def visit_inference_step_end(self, current_step: int, total_steps: int):
    method visit_completed (line 46) | def visit_completed(self, message: str = None):
    method visit_failed (line 50) | def visit_failed(self, message: str, failure_type: str):

FILE: vibevoice/lora/lora_network.py
  class LoRAModule (line 13) | class LoRAModule(nn.Module):
    method __init__ (line 25) | def __init__(self,
    method apply_to (line 57) | def apply_to(self):
    method forward (line 63) | def forward(self, x):
  class LoRANetwork (line 94) | class LoRANetwork(nn.Module):
    method __init__ (line 96) | def __init__(
    method _includes_layers (line 183) | def _includes_layers(self) -> List[re.Pattern]:
    method prepare_network (line 214) | def prepare_network(self, args):
    method set_multiplier (line 220) | def set_multiplier(self, multiplier):
    method set_enabled (line 225) | def set_enabled(self, is_enabled):
    method load_weights (line 229) | def load_weights(self, file):
    method apply_to (line 240) | def apply_to(self):
    method is_mergeable (line 249) | def is_mergeable(self):
    method merge_to (line 252) | def merge_to(self, weights_sd, dtype=None, device=None, non_blocking=F...
    method set_loraplus_lr_ratio (line 273) | def set_loraplus_lr_ratio(self, loraplus_lr_ratio):
    method prepare_optimizer_params (line 278) | def prepare_optimizer_params(self, learning_rate: float = 1e-4, **kwar...
    method enable_gradient_checkpointing (line 323) | def enable_gradient_checkpointing(self):
    method prepare_grad_etc (line 327) | def prepare_grad_etc(self, unet):
    method on_epoch_start (line 330) | def on_epoch_start(self, unet):
    method on_step_start (line 333) | def on_step_start(self):
    method get_trainable_params (line 336) | def get_trainable_params(self):
    method save_weights (line 339) | def save_weights(self, file: str, dtype: torch.dtype, metadata: Dict[s...
    method apply_max_norm_regularization (line 367) | def apply_max_norm_regularization(self, max_norm_value, device):
  function create_network (line 411) | def create_network(original_model: nn.Module,
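
`LoRAModule` follows the usual LoRA decomposition: it shadows a frozen base layer and adds a low-rank residual, out = base(x) + multiplier · (alpha / rank) · up(down(x)). A minimal sketch of that forward path; the repo's module also handles dropout, regex-based layer selection, weight loading, and merging, and its constructor differs.

```python
# Minimal LoRA module showing the forward path the index suggests:
#   out = base(x) + multiplier * (alpha / rank) * up(down(x))
# Constructor arguments are simplified relative to the repo's class.
import torch
import torch.nn as nn

class TinyLoRA(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 4,
                 alpha: float = 4.0, multiplier: float = 1.0):
        super().__init__()
        self.base = base
        self.down = nn.Linear(base.in_features, rank, bias=False)
        self.up = nn.Linear(rank, base.out_features, bias=False)
        nn.init.zeros_(self.up.weight)  # start as a zero delta
        self.scale = alpha / rank
        self.multiplier = multiplier

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.multiplier * self.scale * self.up(self.down(x))

mod = TinyLoRA(nn.Linear(8, 8))
print(mod(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
```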

FILE: vibevoice/modular/adaptive_offload.py
  class AdaptiveOffloadManager (line 20) | class AdaptiveOffloadManager:
    method estimate_vram_usage (line 43) | def estimate_vram_usage(
    method recommend_offload_config (line 104) | def recommend_offload_config(
    method get_available_vram_gb (line 157) | def get_available_vram_gb(device: Optional[torch.device] = None) -> fl...
    method get_total_vram_gb (line 186) | def get_total_vram_gb(device: Optional[torch.device] = None) -> float:
    method auto_configure (line 208) | def auto_configure(
    method print_vram_table (line 285) | def print_vram_table(use_float8: bool = True, logger: Optional[logging...
    method get_preset_config (line 335) | def get_preset_config(preset: str) -> OffloadConfig:

FILE: vibevoice/modular/custom_offloading_utils.py
  class OffloadConfig (line 28) | class OffloadConfig:
  class LayerOffloader (line 63) | class LayerOffloader:
    method __init__ (line 79) | def __init__(
    method _setup_offloading (line 136) | def _setup_offloading(self):
    method _initialize_offloaded_layer (line 182) | def _initialize_offloaded_layer(self, layer_idx: int):
    method _copy_layer_gpu_to_staging_buffer (line 227) | def _copy_layer_gpu_to_staging_buffer(self, layer_idx: int):
    method _release_layer_gpu_memory (line 279) | def _release_layer_gpu_memory(self, layer_idx: int):
    method _async_move_to_gpu (line 319) | def _async_move_to_gpu(self, layer_idx: int):
    method _async_move_to_cpu (line 332) | def _async_move_to_cpu(self, layer_idx: int):
    method _ensure_layer_on_gpu (line 348) | def _ensure_layer_on_gpu(self, layer_idx: int):
    method _register_hooks (line 397) | def _register_hooks(self):
    method _pre_forward_transfer (line 420) | def _pre_forward_transfer(self, layer_idx: int, _module: nn.Module, ar...
    method _post_forward_transfer (line 455) | def _post_forward_transfer(self, layer_idx: int, _module: nn.Module, o...
    method _start_async_prefetch (line 514) | def _start_async_prefetch(self, layer_idx: int):
    method _prefetch_layer_sync (line 557) | def _prefetch_layer_sync(self, layer_idx: int):
    method synchronize (line 574) | def synchronize(self):
    method get_memory_stats (line 579) | def get_memory_stats(self) -> Dict[str, Any]:
    method get_stats (line 607) | def get_stats(self) -> Dict[str, Any]:
    method _print_profile_summary (line 658) | def _print_profile_summary(self):
    method print_stats (line 742) | def print_stats(self):
    method cleanup (line 765) | def cleanup(self):
    method __del__ (line 785) | def __del__(self):
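
`LayerOffloader` keeps only some layers resident on the GPU and uses pre-/post-forward hooks to page the rest in and out, with async prefetch and pinned staging buffers in the full version. The bare hook mechanics look roughly like the simplified sketch below; the real class layers CUDA streams, prefetch, and profiling on top of this pattern.

```python
# Simplified layer-offloading sketch: forward hooks move each layer
# to the GPU just before it runs and back to the CPU afterwards.
# The repo's LayerOffloader adds async prefetch, staging buffers and
# stream synchronization on top of this basic pattern.
import torch
import torch.nn as nn

def attach_offload_hooks(layers: nn.ModuleList, device: torch.device):
    for layer in layers:
        layer.to("cpu")

        def pre_hook(module, args, _dev=device):
            module.to(_dev, non_blocking=True)

        def post_hook(module, args, output):
            module.to("cpu", non_blocking=True)

        layer.register_forward_pre_hook(pre_hook)
        layer.register_forward_hook(post_hook)

layers = nn.ModuleList(nn.Linear(8, 8) for _ in range(4))
# On a GPU box use torch.device("cuda") and keep activations on GPU;
# "cpu" here just makes the demo runnable anywhere.
attach_offload_hooks(layers, torch.device("cpu"))
x = torch.randn(2, 8)
for layer in layers:
    x = layer(x)
print(x.shape)  # torch.Size([2, 8])
```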

FILE: vibevoice/modular/modeling_vibevoice.py
  class VibeVoiceCausalLMOutputWithPast (line 23) | class VibeVoiceCausalLMOutputWithPast:
  class LlamaRMSNorm (line 32) | class LlamaRMSNorm(nn.Module):
    method __init__ (line 33) | def __init__(self, hidden_size, eps=1e-6):
    method forward (line 41) | def forward(self, hidden_states):
    method extra_repr (line 51) | def extra_repr(self):
  class VibeVoiceGenerationOutput (line 57) | class VibeVoiceGenerationOutput(ModelOutput):
  class SpeechConnector (line 71) | class SpeechConnector(nn.Module):
    method __init__ (line 72) | def __init__(self, input_dim, output_dim, dtype: torch.dtype = torch.b...
    method forward (line 79) | def forward(self, features, **kwargs):
  class VibeVoicePreTrainedModel (line 87) | class VibeVoicePreTrainedModel(nn.Module):
    method _init_weights (line 101) | def _init_weights(self, module):
  class VibeVoiceModel (line 123) | class VibeVoiceModel(VibeVoicePreTrainedModel):
    method __init__ (line 124) | def __init__(self, config: VibeVoiceConfig):
    method get_input_embeddings (line 163) | def get_input_embeddings(self):
    method set_input_embeddings (line 173) | def set_input_embeddings(self, value):
    method set_speech_tokenizers (line 176) | def set_speech_tokenizers(self, acoustic_tokenizer=None, semantic_toke...
    method forward (line 188) | def forward(self,
  class VibeVoiceForConditionalGeneration (line 227) | class VibeVoiceForConditionalGeneration(VibeVoicePreTrainedModel):
    method __init__ (line 231) | def __init__(self, config):
    method get_input_embeddings (line 238) | def get_input_embeddings(self):
    method set_input_embeddings (line 241) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 244) | def get_output_embeddings(self):
    method set_decoder (line 247) | def set_decoder(self, decoder):
    method get_decoder (line 250) | def get_decoder(self):
    method tie_weights (line 253) | def tie_weights(self):
    method set_output_embeddings (line 282) | def set_output_embeddings(self, new_embeddings):
    method forward_speech_features (line 287) | def forward_speech_features(self, speech_tensors=None,
    method forward (line 342) | def forward(self, input_ids: torch.LongTensor = None,

FILE: vibevoice/modular/modeling_vibevoice_inference.py
  class VibeVoiceCausalLMOutputWithPast (line 34) | class VibeVoiceCausalLMOutputWithPast(BaseModelOutputWithPast):
  class VibeVoiceGenerationOutput (line 45) | class VibeVoiceGenerationOutput(ModelOutput):
  class VibeVoiceTokenConstraintProcessor (line 59) | class VibeVoiceTokenConstraintProcessor(LogitsProcessor):
    method __init__ (line 62) | def __init__(self, valid_token_ids: List[int], device: torch.device = ...
    method __call__ (line 65) | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTen...
  class VibeVoiceForConditionalInference (line 74) | class VibeVoiceForConditionalInference(nn.Module):
    method __init__ (line 78) | def __init__(self, config: VibeVoiceConfig):
    method generate_config_from_dict (line 108) | def generate_config_from_dict(self, config_dict: Dict) -> GenerationCo...
    method __del__ (line 113) | def __del__(self):
    method noise_scheduler (line 122) | def noise_scheduler(self):
    method prediction_head (line 126) | def prediction_head(self):
    method speech_scaling_factor (line 130) | def speech_scaling_factor(self):
    method speech_bias_factor (line 134) | def speech_bias_factor(self):
    method acoustic_tokenizer (line 138) | def acoustic_tokenizer(self):
    method semantic_tokenizer (line 142) | def semantic_tokenizer(self):
    method acoustic_connector (line 146) | def acoustic_connector(self):
    method semantic_connector (line 150) | def semantic_connector(self):
    method tie_weights (line 153) | def tie_weights(self):
    method get_input_embeddings (line 160) | def get_input_embeddings(self):
    method set_input_embeddings (line 163) | def set_input_embeddings(self, value):
    method get_output_embeddings (line 166) | def get_output_embeddings(self):
    method set_output_embeddings (line 169) | def set_output_embeddings(self, new_embeddings):
    method set_speech_tokenizers (line 172) | def set_speech_tokenizers(self, acoustic_tokenizer=None, semantic_toke...
    method set_ddpm_inference_steps (line 176) | def set_ddpm_inference_steps(self, num_steps=None):
    method _process_speech_inputs (line 179) | def _process_speech_inputs(self, speech_tensors, speech_masks, speech_...
    method forward (line 209) | def forward(
    method _build_generate_config_model_kwargs (line 284) | def _build_generate_config_model_kwargs(self, generation_config, input...
    method generate (line 353) | def generate(
    method _ensure_prediction_head_on_gpu (line 832) | def _ensure_prediction_head_on_gpu(self):
    method _move_prediction_head_to_cpu (line 838) | def _move_prediction_head_to_cpu(self):
    method sample_speech_tokens (line 847) | def sample_speech_tokens(self, condition, neg_condition, cfg_scale=3.0...
    method _prepare_generation_config (line 872) | def _prepare_generation_config(self, generation_config: Optional[Gener...
    method _prepare_special_tokens (line 906) | def _prepare_special_tokens(
    method from_pretrain (line 965) | def from_pretrain(cls, model_path: str, config: VibeVoiceConfig, devic...
    method _prepare_model_inputs (line 1050) | def _prepare_model_inputs(
    method _get_stopping_criteria (line 1075) | def _get_stopping_criteria(
    method prepare_inputs_for_generation (line 1097) | def prepare_inputs_for_generation(
    method _prepare_cache_for_generation (line 1156) | def _prepare_cache_for_generation(
    method _update_model_kwargs_for_generation (line 1177) | def _update_model_kwargs_for_generation(
    method _cache_dependant_input_preparation (line 1213) | def _cache_dependant_input_preparation(
    method forward_speech_features (line 1242) | def forward_speech_features(self,
    method call_for_train (line 1290) | def call_for_train(self,
    method mask_for_ce (line 1431) | def mask_for_ce(self, labels: torch.Tensor, attention_mask: torch.Tens...

FILE: vibevoice/modular/modular_vibevoice_diffusion_head.py
  class RMSNorm (line 13) | class RMSNorm(nn.Module):
    method __init__ (line 14) | def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=Tru...
    method _norm (line 24) | def _norm(self, x):
    method forward (line 27) | def forward(self, x):
    method extra_repr (line 36) | def extra_repr(self) -> str:
  function modulate (line 39) | def modulate(x, shift, scale):
  class TimestepEmbedder (line 44) | class TimestepEmbedder(nn.Module):
    method __init__ (line 52) | def __init__(self, hidden_size, frequency_embedding_size=256):
    method timestep_embedding (line 63) | def timestep_embedding(t, dim, max_period=10000):
    method forward (line 86) | def forward(self, t):
  class FeedForwardNetwork (line 92) | class FeedForwardNetwork(nn.Module):
    method __init__ (line 100) | def __init__(
    method forward (line 112) | def forward(self, x):
  class HeadLayer (line 121) | class HeadLayer(nn.Module):
    method __init__ (line 131) | def __init__(
    method forward (line 153) | def forward(self, x, c):
  class FinalLayer (line 158) | class FinalLayer(nn.Module):
    method __init__ (line 168) | def __init__(self, hidden_size, output_size, cond_size, norm_eps=1e-5):
    method forward (line 178) | def forward(self, x, c):
  class VibeVoiceDiffusionHead (line 184) | class VibeVoiceDiffusionHead(nn.Module):
    method __init__ (line 198) | def __init__(self, config, dtype: torch.dtype = torch.bfloat16):
    method initialize_weights (line 232) | def initialize_weights(self):
    method forward (line 246) | def forward(
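
The head follows the DiT-style adaLN recipe: a sinusoidal timestep embedding conditions each layer through `modulate(x, shift, scale) = x * (1 + scale) + shift`. Both pieces are standard; the reference definitions below match the names in this file and are assumed (not copied) from it.

```python
# Standard DiT-style conditioning pieces matching the names in this
# file. Definitions are the usual ones and assumed to match the repo.
import math
import torch

def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return x * (1 + scale) + shift

def timestep_embedding(t: torch.Tensor, dim: int, max_period: int = 10000) -> torch.Tensor:
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    args = t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

emb = timestep_embedding(torch.tensor([0, 250, 999]), dim=256)
print(emb.shape)  # torch.Size([3, 256])
```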

FILE: vibevoice/modular/modular_vibevoice_qwen.py
  function dynamic_rope_update (line 16) | def dynamic_rope_update(rope_forward):
  function default_rope_parameters (line 78) | def default_rope_parameters(
  function rotate_half (line 119) | def rotate_half(x):
  function apply_rotary_pos_emb (line 125) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_di...
  function repeat_kv (line 151) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  function eager_attention_forward (line 162) | def eager_attention_forward(
  function sdpa_attention_forward (line 186) | def sdpa_attention_forward(
  class QwenRotaryEmbedding (line 236) | class QwenRotaryEmbedding(nn.Module):
    method __init__ (line 237) | def __init__(self, config: QwenConfig, device=None):
    method forward (line 255) | def forward(self, x, position_ids):
  class Qwen2MLP (line 269) | class Qwen2MLP(nn.Module):
    method __init__ (line 270) | def __init__(self, config: QwenConfig):
    method forward (line 280) | def forward(self, x):
  class Qwen2Attention (line 285) | class Qwen2Attention(nn.Module):
    method __init__ (line 286) | def __init__(self, config: QwenConfig, layer_idx: int):
    method forward (line 301) | def forward(
  class QwenDecoderLayer (line 347) | class QwenDecoderLayer(nn.Module):
    method __init__ (line 348) | def __init__(self, config: QwenConfig, layer_idx: int):
    method forward (line 359) | def forward(
  class QwenModel (line 397) | class QwenModel(nn.Module):
    method __init__ (line 405) | def __init__(self, config: QwenConfig, dtype: torch.dtype = torch.bflo...
    method forward (line 422) | def forward(
  class Qwen2ForCausalLM (line 496) | class Qwen2ForCausalLM(nn.Module):
    method __init__ (line 497) | def __init__(self, config):
    method forward (line 505) | def forward(
    method from_pretrained (line 541) | def from_pretrained(cls, pretrained_model_name_or_path: str, config: Q...
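
`rotate_half` and `apply_rotary_pos_emb` are the standard rotary position embedding helpers: each head-dimension pair is rotated by a position-dependent angle, so q' = q·cos + rotate_half(q)·sin. Reference definitions (the common ones, assumed to match this file):

```python
# Standard RoPE helpers as named in this file; these are the common
# definitions and are assumed to match the repo's versions.
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim: int = 1):
    cos = cos.unsqueeze(unsqueeze_dim)  # broadcast over heads
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

q = k = torch.randn(1, 2, 4, 8)   # (batch, heads, seq, head_dim)
cos = sin = torch.randn(1, 4, 8)  # (batch, seq, head_dim)
print(apply_rotary_pos_emb(q, k, cos, sin)[0].shape)  # (1, 2, 4, 8)
```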

FILE: vibevoice/modular/modular_vibevoice_text_tokenizer.py
  class VibeVoiceTextTokenizer (line 10) | class VibeVoiceTextTokenizer(Qwen2Tokenizer):
    method __init__ (line 35) | def __init__(
    method _add_vibevoice_special_tokens (line 64) | def _add_vibevoice_special_tokens(self):
    method eos_id (line 85) | def eos_id(self) -> int:
    method speech_start_id (line 90) | def speech_start_id(self) -> int:
    method speech_end_id (line 95) | def speech_end_id(self) -> int:
    method speech_diffusion_id (line 100) | def speech_diffusion_id(self) -> int:
    method pad_id (line 105) | def pad_id(self) -> int:
  class VibeVoiceTextTokenizerFast (line 110) | class VibeVoiceTextTokenizerFast(Qwen2TokenizerFast):
    method __init__ (line 134) | def __init__(
    method _add_vibevoice_special_tokens (line 161) | def _add_vibevoice_special_tokens(self):
    method eos_id (line 184) | def eos_id(self) -> int:
    method speech_start_id (line 189) | def speech_start_id(self) -> int:
    method speech_end_id (line 194) | def speech_end_id(self) -> int:
    method speech_diffusion_id (line 199) | def speech_diffusion_id(self) -> int:
    method pad_id (line 204) | def pad_id(self) -> int:

FILE: vibevoice/modular/modular_vibevoice_tokenizer.py
  class ConvLayerNorm (line 26) | class ConvLayerNorm(nn.LayerNorm):
    method __init__ (line 31) | def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch...
    method forward (line 34) | def forward(self, x):
  class RMSNorm (line 44) | class RMSNorm(nn.Module):
    method __init__ (line 45) | def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=Tru...
    method _norm (line 56) | def _norm(self, x):
    method forward (line 59) | def forward(self, x):
    method extra_repr (line 68) | def extra_repr(self) -> str:
  class ConvRMSNorm (line 71) | class ConvRMSNorm(RMSNorm):
    method __init__ (line 72) | def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=Tru...
    method forward (line 75) | def forward(self, x):
  function apply_parametrization_norm (line 92) | def apply_parametrization_norm(module: nn.Module, norm: str = 'none') ->...
  function get_norm_module (line 104) | def get_norm_module(module: nn.Module, causal: bool = False, norm: str =...
  function get_extra_padding_for_conv1d (line 120) | def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stri...
  function pad1d (line 129) | def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'ze...
  function unpad1d (line 147) | def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
  class NormConv1d (line 156) | class NormConv1d(nn.Module):
    method __init__ (line 158) | def __init__(self, *args, causal: bool = False, norm: str = 'none',
    method forward (line 165) | def forward(self, x):
  class NormConvTranspose1d (line 170) | class NormConvTranspose1d(nn.Module):
    method __init__ (line 172) | def __init__(self, *args, causal: bool = False, norm: str = 'none',
    method forward (line 179) | def forward(self, x):
  class VibeVoiceTokenizerStreamingCache (line 184) | class VibeVoiceTokenizerStreamingCache:
    method __init__ (line 186) | def __init__(self):
    method get (line 189) | def get(self, layer_id: str, sample_indices: torch.Tensor) -> Optional...
    method set (line 219) | def set(self, layer_id: str, sample_indices: torch.Tensor, states: tor...
    method set_to_zero (line 225) | def set_to_zero(self, sample_indices: torch.Tensor):
    method clear (line 234) | def clear(self, layer_id: Optional[str] = None, sample_indices: Option...
  class SConv1d (line 249) | class SConv1d(nn.Module):
    method __init__ (line 251) | def __init__(self, in_channels: int, out_channels: int,
    method layer_id (line 282) | def layer_id(self):
    method forward (line 287) | def forward(self, x: torch.Tensor,
    method _forward_streaming (line 318) | def _forward_streaming(self, x: torch.Tensor,
    method _forward_non_streaming (line 375) | def _forward_non_streaming(self, x: torch.Tensor, debug: bool = False)...
  class SConvTranspose1d (line 411) | class SConvTranspose1d(nn.Module):
    method __init__ (line 413) | def __init__(self, in_channels: int, out_channels: int,
    method layer_id (line 443) | def layer_id(self):
    method forward (line 448) | def forward(self, x: torch.Tensor,
    method _forward_streaming (line 468) | def _forward_streaming(self, x: torch.Tensor,
    method _forward_non_streaming (line 541) | def _forward_non_streaming(self, x: torch.Tensor, debug: bool = False)...
  class FFN (line 569) | class FFN(nn.Module):
    method __init__ (line 570) | def __init__(self,
    method forward (line 580) | def forward(self, x):
  class Convlayer (line 586) | class Convlayer(nn.Module):
    method __init__ (line 587) | def __init__(self,
    method forward (line 603) | def forward(self, x):
  class Block1D (line 606) | class Block1D(nn.Module):
    method __init__ (line 607) | def __init__(self, dim, kernel_size=7, drop_path=0., mixer_layer='conv',
    method forward (line 647) | def forward(self, x):
  class TokenizerEncoder (line 674) | class TokenizerEncoder(nn.Module):
    method __init__ (line 681) | def __init__(self, config):
    method forward_features (line 769) | def forward_features(self, x, cache=None, sample_indices=None, use_cac...
    method forward (line 809) | def forward(self, x, cache=None, sample_indices=None, use_cache=False,...
  class TokenizerDecoder (line 814) | class TokenizerDecoder(nn.Module):
    method __init__ (line 821) | def __init__(self, config):
    method forward_features (line 920) | def forward_features(self, x, cache=None, sample_indices=None, use_cac...
    method forward (line 960) | def forward(self, x, cache=None, sample_indices=None, use_cache=False,...
  class VibeVoiceTokenizerEncoderOutput (line 966) | class VibeVoiceTokenizerEncoderOutput:
    method sample (line 977) | def sample(self, dist_type='fix'):
    method kl (line 1010) | def kl(self):
    method mode (line 1015) | def mode(self):
  class VibeVoiceAcousticTokenizerModel (line 1019) | class VibeVoiceAcousticTokenizerModel(nn.Module):
    method __init__ (line 1029) | def __init__(self, config, dtype: torch.dtype = torch.bfloat16):
    method _init_weights (line 1087) | def _init_weights(self, module):
    method encode (line 1102) | def encode(self, audio, cache=None, sample_indices=None, use_cache=Fal...
    method sampling (line 1108) | def sampling(self, encoder_output, dist_type=None):
    method decode (line 1120) | def decode(self, latents, cache=None, sample_indices=None, use_cache=F...
    method forward (line 1130) | def forward(self, audio, cache=None, sample_indices=None, use_cache=Fa...
  class VibeVoiceSemanticTokenizerModel (line 1137) | class VibeVoiceSemanticTokenizerModel(nn.Module):
    method __init__ (line 1147) | def __init__(self, config, dtype: torch.dtype = torch.bfloat16):
    method _init_weights (line 1180) | def _init_weights(self, module):
    method encode (line 1195) | def encode(self, audio, cache=None, sample_indices=None, use_cache=Fal...
    method sampling (line 1201) | def sampling(self, encoder_output, dist_type=None):
    method forward (line 1205) | def forward(self, audio, cache=None, sample_indices=None, use_cache=Fa...

FILE: vibevoice/modular/streamer.py
  class AudioStreamer (line 13) | class AudioStreamer(BaseStreamer):
    method __init__ (line 27) | def __init__(
    method put (line 42) | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor):
    method end (line 57) | def end(self, sample_indices: Optional[torch.Tensor] = None):
    method __iter__ (line 78) | def __iter__(self):
    method get_stream (line 82) | def get_stream(self, sample_idx: int):
  class AudioSampleIterator (line 89) | class AudioSampleIterator:
    method __init__ (line 92) | def __init__(self, streamer: AudioStreamer, sample_idx: int):
    method __iter__ (line 96) | def __iter__(self):
    method __next__ (line 99) | def __next__(self):
  class AudioBatchIterator (line 106) | class AudioBatchIterator:
    method __init__ (line 109) | def __init__(self, streamer: AudioStreamer):
    method __iter__ (line 113) | def __iter__(self):
    method __next__ (line 116) | def __next__(self):
  class AsyncAudioStreamer (line 150) | class AsyncAudioStreamer(AudioStreamer):
    method __init__ (line 155) | def __init__(
    method put (line 166) | def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor):
    method end (line 176) | def end(self, sample_indices: Optional[torch.Tensor] = None):
    method get_stream (line 190) | async def get_stream(self, sample_idx: int):
    method __aiter__ (line 201) | def __aiter__(self):
  class AsyncAudioBatchIterator (line 206) | class AsyncAudioBatchIterator:
    method __init__ (line 209) | def __init__(self, streamer: AsyncAudioStreamer):
    method __aiter__ (line 213) | def __aiter__(self):
    method __anext__ (line 216) | async def __anext__(self):
    method _get_chunk (line 262) | async def _get_chunk(self, idx):
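
The streamer implements the `BaseStreamer` protocol: generation pushes audio chunks via `put`, signals completion via `end`, and consumers iterate per-sample streams. A minimal queue-backed sketch of that producer/consumer shape, independent of the repo's per-sample batching and async variants:

```python
# Minimal producer/consumer streamer sketch in the spirit of
# AudioStreamer: put() enqueues chunks, end() signals completion,
# and iteration yields until it sees the end sentinel.
import queue
import threading

class MiniStreamer:
    _END = object()

    def __init__(self):
        self.q: "queue.Queue" = queue.Queue()

    def put(self, chunk):
        self.q.put(chunk)

    def end(self):
        self.q.put(self._END)

    def __iter__(self):
        while True:
            item = self.q.get()
            if item is self._END:
                return
            yield item

s = MiniStreamer()
threading.Thread(target=lambda: ([s.put(i) for i in range(3)], s.end())).start()
print(list(s))  # [0, 1, 2]
```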

FILE: vibevoice/processor/vibevoice_processor.py
  class VibeVoiceProcessor (line 19) | class VibeVoiceProcessor:
    method __init__ (line 37) | def __init__(self, tokenizer=None, audio_processor=None, speech_tok_co...
    method from_pretrained (line 46) | def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    method save_pretrained (line 130) | def save_pretrained(self, save_directory: Union[str, os.PathLike], **k...
    method __call__ (line 164) | def __call__(
    method _process_single (line 247) | def _process_single(
    method _batch_encode (line 307) | def _batch_encode(
    method _create_voice_prompt (line 407) | def _create_voice_prompt(
    method prepare_speech_inputs (line 462) | def prepare_speech_inputs(
    method _convert_json_to_script (line 512) | def _convert_json_to_script(self, json_file: str) -> str:
    method _convert_text_to_script (line 559) | def _convert_text_to_script(self, text_file: str) -> str:
    method _parse_script (line 597) | def _parse_script(self, script: str) -> List[Tuple[int, str]]:
    method _merge_inputs (line 634) | def _merge_inputs(self, text_inputs: BatchEncoding, audio_inputs: Dict...
    method batch_decode (line 647) | def batch_decode(self, *args, **kwargs):
    method decode (line 654) | def decode(self, *args, **kwargs):
    method model_input_names (line 662) | def model_input_names(self):
    method save_audio (line 670) | def save_audio(self, audio: Union[torch.Tensor, np.ndarray, List[Union...

FILE: vibevoice/processor/vibevoice_tokenizer_processor.py
  class AudioNormalizer (line 17) | class AudioNormalizer:
    method __init__ (line 25) | def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6):
    method tailor_dB_FS (line 36) | def tailor_dB_FS(self, audio: np.ndarray) -> tuple:
    method avoid_clipping (line 51) | def avoid_clipping(self, audio: np.ndarray, scalar: Optional[float] = ...
    method __call__ (line 71) | def __call__(self, audio: np.ndarray) -> np.ndarray:
  class VibeVoiceTokenizerProcessor (line 89) | class VibeVoiceTokenizerProcessor(FeatureExtractionMixin):
    method __init__ (line 106) | def __init__(
    method _ensure_mono (line 133) | def _ensure_mono(self, audio: np.ndarray) -> np.ndarray:
    method _process_single_audio (line 161) | def _process_single_audio(self, audio: Union[np.ndarray, List[float]])...
    method __call__ (line 186) | def __call__(
    method _load_audio_from_path (line 269) | def _load_audio_from_path(self, audio_path: str) -> np.ndarray:
    method preprocess_audio (line 309) | def preprocess_audio(
    method to_dict (line 344) | def to_dict(self) -> Dict[str, Any]:
    method save_audio (line 350) | def save_audio(
    method _prepare_audio_for_save (line 457) | def _prepare_audio_for_save(self, audio: np.ndarray, normalize: bool) ...
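
`AudioNormalizer.tailor_dB_FS` scales audio to a target RMS level in dB FS, gain = 10^(target_dB / 20) / (rms + eps), and `avoid_clipping` rescales if the peak exceeds 1.0. The arithmetic below is a sketch consistent with those method names; the repo's exact return values and edge handling may differ.

```python
# dB-FS loudness normalization arithmetic consistent with
# AudioNormalizer.tailor_dB_FS / avoid_clipping. A sketch; the
# repo's exact return values and edge handling may differ.
import numpy as np

def tailor_db_fs(audio: np.ndarray, target_db_fs: float = -25.0,
                 eps: float = 1e-6) -> np.ndarray:
    rms = np.sqrt(np.mean(audio ** 2))
    gain = 10 ** (target_db_fs / 20) / (rms + eps)
    return audio * gain

def avoid_clipping(audio: np.ndarray) -> np.ndarray:
    peak = np.max(np.abs(audio))
    return audio / peak if peak > 1.0 else audio

x = 0.5 * np.sin(np.linspace(0, 100, 24000))
y = avoid_clipping(tailor_db_fs(x))
print(round(20 * np.log10(np.sqrt(np.mean(y ** 2))), 1))  # -25.0
```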

FILE: vibevoice/schedule/dpm_solver.py
  function betas_for_alpha_bar (line 28) | def betas_for_alpha_bar(
  function rescale_zero_terminal_snr (line 87) | def rescale_zero_terminal_snr(betas):
  class DPMSolverMultistepScheduler (line 122) | class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
    method __init__ (line 203) | def __init__(
    method step_index (line 298) | def step_index(self):
    method begin_index (line 305) | def begin_index(self):
    method set_begin_index (line 311) | def set_begin_index(self, begin_index: int = 0):
    method set_timesteps (line 321) | def set_timesteps(
    method _threshold_sample (line 426) | def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
    method _sigma_to_t (line 460) | def _sigma_to_t(self, sigma, log_sigmas):
    method _sigma_to_alpha_sigma_t (line 483) | def _sigma_to_alpha_sigma_t(self, sigma):
    method _convert_to_karras (line 490) | def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_st...
    method _convert_to_lu (line 515) | def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps...
    method convert_model_output (line 528) | def convert_model_output(
    method dpm_solver_first_order_update (line 627) | def dpm_solver_first_order_update(
    method multistep_dpm_solver_second_order_update (line 696) | def multistep_dpm_solver_second_order_update(
    method multistep_dpm_solver_third_order_update (line 819) | def multistep_dpm_solver_third_order_update(
    method index_for_timestep (line 904) | def index_for_timestep(self, timestep, schedule_timesteps=None):
    method _init_step_index (line 923) | def _init_step_index(self, timestep):
    method step (line 935) | def step(
    method add_noise (line 1024) | def add_noise(
    method get_velocity (line 1046) | def get_velocity(self, original_samples: torch.Tensor, noise: torch.Te...
    method __len__ (line 1064) | def __len__(self):
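
`betas_for_alpha_bar` is the standard diffusers-style schedule builder: given a cumulative-product function alpha_bar(t), the discrete betas are beta_i = min(1 - alpha_bar((i+1)/T) / alpha_bar(i/T), beta_max). A compact version of that construction; the cosine alpha_bar shown is the common default and an assumption for this repo.

```python
# Standard betas_for_alpha_bar construction (diffusers-style):
#   beta_i = min(1 - alpha_bar((i+1)/T) / alpha_bar(i/T), max_beta)
# The cosine alpha_bar below is the usual default, assumed here.
import math
import torch

def betas_for_alpha_bar(num_steps: int, max_beta: float = 0.999) -> torch.Tensor:
    def alpha_bar(t: float) -> float:
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_steps):
        t1, t2 = i / num_steps, (i + 1) / num_steps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return torch.tensor(betas)

print(betas_for_alpha_bar(1000)[:3])
```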

FILE: vibevoice/schedule/timestep_sampler.py
  class UniformSampler (line 4) | class UniformSampler:
    method __init__ (line 5) | def __init__(self, timesteps: int = 1000):
    method sample (line 8) | def sample(self, batch_size, device):
  class LogitNormalSampler (line 11) | class LogitNormalSampler:
    method __init__ (line 12) | def __init__(self, timesteps: int = 1000, m: int = 0, s: int = 1):
    method sample (line 18) | def sample(self, batch_size: int, device: torch.device):
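
The two samplers draw training timesteps: `UniformSampler` picks integers uniformly in [0, T), while `LogitNormalSampler`, judging by its `m`/`s` parameters, presumably draws t = sigmoid(m + s·z) with z ~ N(0, 1), concentrating samples mid-schedule. A sketch of those distributions; the mapping from the unit interval to integer timesteps is an assumption.

```python
# Sketch of the two timestep distributions suggested by the
# signatures above. The logit-normal form t = sigmoid(m + s*z) is
# the standard one for these parameter names; scaling to integer
# timesteps is an assumption.
import torch

def uniform_sample(batch: int, timesteps: int = 1000) -> torch.Tensor:
    return torch.randint(0, timesteps, (batch,))

def logit_normal_sample(batch: int, timesteps: int = 1000,
                        m: float = 0.0, s: float = 1.0) -> torch.Tensor:
    t = torch.sigmoid(m + s * torch.randn(batch))  # in (0, 1)
    return (t * timesteps).long().clamp(0, timesteps - 1)

print(uniform_sample(4), logit_normal_sample(4))
```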

FILE: vibevoice/scripts/convert_nnscaler_checkpoint_to_transformers.py
  function convert_vibevoice_nnscaler_checkpoint_to_hf (line 20) | def convert_vibevoice_nnscaler_checkpoint_to_hf(
  function main (line 133) | def main():

FILE: vibevoice/training/dataset.py
  function _resample_if_needed (line 21) | def _resample_if_needed(wav: np.ndarray, orig_sr: int, target_sr: int) -...
  class VibeVoiceDataset (line 36) | class VibeVoiceDataset:
    method __init__ (line 37) | def __init__(
    method __len__ (line 49) | def __len__(self) -> int:
    method __getitem__ (line 52) | def __getitem__(self, idx: int) -> Dict[str, Any]:
  function _apply_silence_with_crossfade (line 101) | def _apply_silence_with_crossfade(
  function _load_audio_to_24k (line 164) | def _load_audio_to_24k(
  class VibeVoiceCollator (line 195) | class VibeVoiceCollator:
    method __call__ (line 209) | def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, Any]:

FILE: vibevoice/training/fake_trainer.py
  class FakeTrainer (line 13) | class FakeTrainer(Trainer):
    method __init__ (line 14) | def __init__(self,
    method _train (line 21) | def _train(self):
    method mock_save_model (line 192) | def mock_save_model(self, epoch_no: int, steps: int, is_final: bool = ...
    method training_cleanup (line 218) | def training_cleanup(self):

FILE: vibevoice/training/summary_visitor.py
  class SummaryVisitor (line 6) | class SummaryVisitor(TrainerVisitor):
    method __init__ (line 17) | def __init__(self, log_prefix: str = None, step_loss_interval: int = 1...
    method visit_training_begin (line 27) | def visit_training_begin(self, timestamp: float, batch_size: int, tota...
    method visit_training_end (line 45) | def visit_training_end(self, timestamp: float, loss: float, diffusion_...
    method visit_step_begin (line 67) | def visit_step_begin(self, timestamp: float, step: int, epoch: int, st...
    method visit_step_end (line 72) | def visit_step_end(self, timestamp: float, step: int, epoch: int, step...
    method visit_epoch_begin (line 98) | def visit_epoch_begin(self, timestamp: float, epoch: int, lr: float):
    method visit_epoch_end (line 107) | def visit_epoch_end(self, timestamp: float, epoch: int, epoch_elapsed:...
    method visit_training_failed (line 126) | def visit_training_failed(self, timestamp, error_msg):
    method visit_lora_file_saved (line 129) | def visit_lora_file_saved(self, lora_file: str):
    method visit_final_lora_file_saved (line 132) | def visit_final_lora_file_saved(self, lora_file: str):

FILE: vibevoice/training/trainer.py
  class TrainConfig (line 28) | class TrainConfig:
    method from_dict (line 57) | def from_dict(cls, config_dict: dict) -> "TrainConfig":
    method from_toml (line 88) | def from_toml(cls, toml_path: str) -> "TrainConfig":
    method to_metadata (line 94) | def to_metadata(self) -> Dict[str, Any]:
    method to_dict (line 122) | def to_dict(self) -> Dict[str, Any]:
  class Trainer (line 125) | class Trainer(ABC):
    method __init__ (line 127) | def __init__(self, train_config: TrainConfig, visitor: Optional[Traine...
    method train (line 133) | def train(self):
    method _train (line 137) | def _train(self):
    method training_cleanup (line 141) | def training_cleanup(self):
  class VibeVoiceTrainer (line 144) | class VibeVoiceTrainer(Trainer):
    method __init__ (line 146) | def __init__(self, train_config: TrainConfig, visitor: Optional[Traine...
    method _train (line 162) | def _train(self):
    method save_model (line 337) | def save_model(self, metadata: Dict[str, str], network: LoRANetwork, s...
    method _preprocess_inputs (line 346) | def _preprocess_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    method _to_device (line 372) | def _to_device(self, inputs: dict) -> dict:
    method get_model_config (line 378) | def get_model_config(self) -> dict:
    method _load_model (line 396) | def _load_model(self, model_file: Path,
    method _get_optimizer (line 421) | def _get_optimizer(self, trainable_params: list[torch.nn.Parameter]) -...
    method _get_dataloader (line 478) | def _get_dataloader(self, processor: VibeVoiceProcessor, model: VibeVo...
    method _patch_acoustic_encode_for_legacy_indexing (line 513) | def _patch_acoustic_encode_for_legacy_indexing(self, model_obj):
    method training_cleanup (line 548) | def training_cleanup(self):

FILE: vibevoice/training/trainer_visitor.py
  class TrainerVisitor (line 4) | class TrainerVisitor(ABC):
    method visit_training_begin (line 6) | def visit_training_begin(self, timestamp: float, batch_size: int, tota...
    method visit_training_end (line 10) | def visit_training_end(self, timestamp: float, loss: float, diffusion_...
    method visit_step_begin (line 14) | def visit_step_begin(self, timestamp: float, step: int, epoch: int, st...
    method visit_step_end (line 18) | def visit_step_end(self, timestamp: float, step: int, epoch: int, step...
    method visit_epoch_begin (line 22) | def visit_epoch_begin(self, timestamp: float, epoch: int, lr: float):
    method visit_epoch_end (line 26) | def visit_epoch_end(self, timestamp: float, epoch: int, epoch_elapsed:...
    method visit_training_failed (line 30) | def visit_training_failed(self, timestamp: float, error_msg: str):
    method visit_lora_file_saved (line 34) | def visit_lora_file_saved(self, lora_file: str):
    method visit_final_lora_file_saved (line 38) | def visit_final_lora_file_saved(self, lora_file: str):
  class VisitorManager (line 42) | class VisitorManager(TrainerVisitor):
    method __init__ (line 43) | def __init__(self):
    method register_visitor (line 46) | def register_visitor(self, visitor: TrainerVisitor):
    method visit_training_begin (line 49) | def visit_training_begin(self, timestamp: float, batch_size: int, tota...
    method visit_training_end (line 53) | def visit_training_end(self, timestamp: float, loss: float, diffusion_...
    method visit_step_begin (line 57) | def visit_step_begin(self, timestamp: float, step: int, epoch: int, st...
    method visit_step_end (line 61) | def visit_step_end(self, timestamp: float, step: int, epoch: int, step...
    method visit_epoch_begin (line 65) | def visit_epoch_begin(self, timestamp: float, epoch: int, lr: float):
    method visit_epoch_end (line 69) | def visit_epoch_end(self, timestamp: float, epoch: int, epoch_elapsed:...
    method visit_training_failed (line 73) | def visit_training_failed(self, timestamp: float, error_msg: str):
    method visit_lora_file_saved (line 77) | def visit_lora_file_saved(self, lora_file: str):
    method visit_final_lora_file_saved (line 81) | def visit_final_lora_file_saved(self, lora_file: str):
Condensed preview — 202 files, each showing path, character count, and a content snippet.
[
  {
    "path": ".dockerignore",
    "chars": 370,
    "preview": "# This .dockerignore is mostly irrelevant since we don't COPY from local context\n# The Dockerfile clones everything from"
  },
  {
    "path": ".gitignore",
    "chars": 5042,
    "preview": ".claude\nCLAUDE.md\n.vscode/\nuv.lock\nmedia/\n\n\ndemo/example/\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod"
  },
  {
    "path": "CHANGELOG.md",
    "chars": 5751,
    "preview": "# Changelog\n\nAll notable changes to VibeVoice will be documented in this file.\n\nThe format is based on [Keep a Changelog"
  },
  {
    "path": "CHANGELOG_zh.md",
    "chars": 2696,
    "preview": "# 更新日志\n\nVibeVoice 的所有重要更改都将记录在此文件中。\n\n本文档格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/),\n并且本项目遵循 [语义化版本"
  },
  {
    "path": "Dockerfile",
    "chars": 5765,
    "preview": "# Multi-stage Dockerfile for VibeVoice\n# This Dockerfile is completely self-contained and requires no local source code\n"
  },
  {
    "path": "README.md",
    "chars": 27132,
    "preview": "# VibeVoiceFusion\n\n<div align=\"center\">\n\n<img src=\"frontend/public/icon-rect-pulse.svg\" alt=\"VibeVoiceFusion Logo\" width"
  },
  {
    "path": "README_zh.md",
    "chars": 16065,
    "preview": "# VibeVoiceFusion\n\n<div align=\"center\">\n\n<img src=\"frontend/public/icon-rect-pulse.svg\" alt=\"VibeVoiceFusion Logo\" width"
  },
  {
    "path": "backend/.gitignore",
    "chars": 5,
    "preview": ".env\n"
  },
  {
    "path": "backend/README.md",
    "chars": 9678,
    "preview": "# VibeVoice Backend\n\nFlask-based REST API backend for VibeVoice speech generation system.\n\n## Architecture\n\n```\nbackend/"
  },
  {
    "path": "backend/__init__.py",
    "chars": 105,
    "preview": "\"\"\"\nVibeVoice Backend API\nFlask-based backend for VibeVoice speech generation\n\"\"\"\n\n__version__ = \"0.0.1\"\n"
  },
  {
    "path": "backend/api/__init__.py",
    "chars": 823,
    "preview": "\"\"\"\nAPI Blueprint for VibeVoice backend\n\"\"\"\nfrom flask import Blueprint, jsonify\n\n# Create main API blueprint\napi_bp = B"
  },
  {
    "path": "backend/api/datasets.py",
    "chars": 21948,
    "preview": "\"\"\"\nDatasets API endpoints (project-scoped)\n\"\"\"\nfrom flask import request, jsonify, current_app, send_file\nfrom pathlib "
  },
  {
    "path": "backend/api/dialog_sessions.py",
    "chars": 10305,
    "preview": "\"\"\"\nDialog Sessions API endpoints\n\"\"\"\nfrom flask import request, jsonify, current_app, send_file\nfrom backend.api import"
  },
  {
    "path": "backend/api/generation.py",
    "chars": 27423,
    "preview": "\"\"\"\nGeneration API endpoints\n\"\"\"\nfrom uuid import uuid4\nfrom typing import Dict, Any\nfrom flask import request, jsonify,"
  },
  {
    "path": "backend/api/openai_compat.py",
    "chars": 5861,
    "preview": "\"\"\"\nOpenAI-Compatible TTS API endpoint\n\nImplements POST /v1/audio/speech for drop-in compatibility with OpenAI TTS clien"
  },
  {
    "path": "backend/api/preset_voices.py",
    "chars": 8470,
    "preview": "\"\"\"\nPreset voices API endpoints\n\nFilename convention: {language}-{name}_{gender}[_bgm].wav\nThe filename serves as the un"
  },
  {
    "path": "backend/api/projects.py",
    "chars": 6784,
    "preview": "\"\"\"\nProjects API endpoints\n\"\"\"\nimport re\nfrom flask import request, jsonify, current_app\nfrom backend.api import api_bp\n"
  },
  {
    "path": "backend/api/quick_generate.py",
    "chars": 16214,
    "preview": "\"\"\"\nQuick Generate API endpoints\n\"\"\"\nimport json\nfrom flask import request, jsonify, current_app, send_file\nfrom werkzeu"
  },
  {
    "path": "backend/api/speakers.py",
    "chars": 13143,
    "preview": "\"\"\"\nSpeakers API endpoints\n\"\"\"\nfrom flask import request, jsonify, current_app, send_file\nfrom backend.api import api_bp"
  },
  {
    "path": "backend/api/tasks.py",
    "chars": 5231,
    "preview": "\"\"\"\nUnified Tasks API endpoints\nProvides a single endpoint to check all running tasks (inference, training, and quick ge"
  },
  {
    "path": "backend/api/training.py",
    "chars": 16321,
    "preview": "\"\"\"\nTraining API endpoints\n\"\"\"\nfrom typing import Dict, Any\nfrom flask import request, jsonify, current_app, send_file\nf"
  },
  {
    "path": "backend/app.py",
    "chars": 4522,
    "preview": "\"\"\"\nFlask application factory for VibeVoice backend\n\"\"\"\nimport os\nfrom flask import Flask, jsonify, send_from_directory\n"
  },
  {
    "path": "backend/config.py",
    "chars": 2275,
    "preview": "\"\"\"\nConfiguration management for VibeVoice backend\n\"\"\"\nimport os\nfrom pathlib import Path\nfrom dataclasses import datacl"
  },
  {
    "path": "backend/i18n/__init__.py",
    "chars": 3942,
    "preview": "\"\"\"\ni18n utilities for backend API\n\"\"\"\nimport json\nfrom pathlib import Path\nfrom flask import request, g\nfrom functools "
  },
  {
    "path": "backend/i18n/en.json",
    "chars": 5011,
    "preview": "{\n  \"errors\": {\n    \"bad_request\": \"Bad Request\",\n    \"not_found\": \"Not Found\",\n    \"resource_not_found\": \"The requested"
  },
  {
    "path": "backend/i18n/zh.json",
    "chars": 3387,
    "preview": "{\n  \"errors\": {\n    \"bad_request\": \"错误的请求\",\n    \"not_found\": \"未找到\",\n    \"resource_not_found\": \"未找到请求的资源\",\n    \"internal_"
  },
  {
    "path": "backend/inference/inference.py",
    "chars": 24304,
    "preview": "import base64\nimport copy\nfrom datetime import datetime\nimport gc\nimport random\nfrom typing import Union\nimport time\nimp"
  },
  {
    "path": "backend/inference/quick_generate_inference.py",
    "chars": 21074,
    "preview": "\"\"\"\nQuick Generate Inference Engine - handles voice generation without project setup\n\"\"\"\nimport base64\nimport copy\nimpor"
  },
  {
    "path": "backend/run.py",
    "chars": 1547,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nDevelopment server for VibeVoice backend\n\nUsage:\n    python backend/run.py\n    or\n    python "
  },
  {
    "path": "backend/scripts/generate_cantonese_training_dataset.py",
    "chars": 12388,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nGenerate training dataset for VibeVoice from ASR-SCCantDuSC Cantonese audio files and metadat"
  },
  {
    "path": "backend/scripts/generate_mcv_cantonese_training_dataset.py",
    "chars": 14037,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nGenerate training dataset for VibeVoice from Mozilla Common Voice Cantonese (yue) audio files"
  },
  {
    "path": "backend/scripts/generate_training_dataset.py",
    "chars": 13777,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nGenerate training dataset for VibeVoice from audio files and metadata.\n\nThis script processes"
  },
  {
    "path": "backend/scripts/migrate_dataset_paths.py",
    "chars": 3940,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nMigration script to fix audio and voice_prompts paths in existing datasets.jsonl files.\n\nThis"
  },
  {
    "path": "backend/services/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "backend/services/dataset_service.py",
    "chars": 33203,
    "preview": "\"\"\"\nDataset management service - handles business logic for datasets\n\"\"\"\nimport uuid\nimport json\nimport shutil\nfrom path"
  },
  {
    "path": "backend/services/dialog_session_service.py",
    "chars": 17468,
    "preview": "\"\"\"\nDialog session management service - handles business logic for dialog sessions\n\"\"\"\nimport uuid\nimport re\n\nfrom typin"
  },
  {
    "path": "backend/services/openai_compat_service.py",
    "chars": 9081,
    "preview": "\"\"\"\nOpenAI-Compatible TTS Service\n\nProvides a synchronous wrapper over VibeVoice's async quick generation engine,\nimplem"
  },
  {
    "path": "backend/services/preset_voice_service.py",
    "chars": 11324,
    "preview": "\"\"\"\nPreset voice management service - metadata parsed from filenames\n\nFilename convention: {language}-{name}_{gender}[_b"
  },
  {
    "path": "backend/services/project_service.py",
    "chars": 6691,
    "preview": "\"\"\"\nProject management service - handles business logic for projects\n\"\"\"\nimport uuid\nfrom pathlib import Path\nfrom typin"
  },
  {
    "path": "backend/services/quick_generate_service.py",
    "chars": 15196,
    "preview": "\"\"\"\nQuick Generate Service - handles voice generation without project setup\n\"\"\"\nimport json\nimport shutil\nfrom datetime "
  },
  {
    "path": "backend/services/speaker_service.py",
    "chars": 17071,
    "preview": "\"\"\"\nSpeaker role management service - handles business logic for speaker roles\n\"\"\"\nimport uuid\nimport wave\nimport shutil"
  },
  {
    "path": "backend/services/training_service.py",
    "chars": 15334,
    "preview": "from flask import current_app\n\nfrom pathlib import Path\nfrom uuid import uuid4\nfrom typing import List, Optional\nfrom da"
  },
  {
    "path": "backend/services/voice_gerneration_service.py",
    "chars": 8001,
    "preview": "import threading\n\nfrom typing import List, Dict, Any, Optional\nfrom pathlib import Path\nfrom utils.file_handler import F"
  },
  {
    "path": "backend/task_manager/inference_task.py",
    "chars": 3351,
    "preview": "from backend.task_manager.task import FAILURE_TYPE_GENERAL, Task\nfrom typing import Any, Dict, List\nfrom backend.inferen"
  },
  {
    "path": "backend/task_manager/quick_generate_task.py",
    "chars": 3348,
    "preview": "\"\"\"\nQuick Generate Task - wraps quick generate inference for task manager\n\"\"\"\nfrom pathlib import Path\nfrom typing impor"
  },
  {
    "path": "backend/task_manager/task.py",
    "chars": 2767,
    "preview": "import threading\n\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Dict, List\n\nfrom time import sleep\n\nimport"
  },
  {
    "path": "backend/task_manager/training_task.py",
    "chars": 1359,
    "preview": "\nfrom datetime import datetime\nfrom backend.task_manager.task import FAILURE_TYPE_GENERAL, Task\nfrom backend.training.en"
  },
  {
    "path": "backend/training/engine.py",
    "chars": 8275,
    "preview": "\nfrom datetime import datetime, timezone\nfrom backend.training.state import TrainingState, TrainingStateWriter\n\nfrom vib"
  },
  {
    "path": "backend/training/state.py",
    "chars": 4056,
    "preview": "from dataclasses import dataclass, asdict, field\nfrom datetime import datetime, timezone\nfrom pathlib import Path\n\nfrom "
  },
  {
    "path": "backend/utils/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "backend/utils/dialog_validator.py",
    "chars": 7456,
    "preview": "\"\"\"\nDialog text parser and validator utilities\n\"\"\"\nimport re\nfrom typing import List, Set, Tuple, Optional\nfrom pathlib "
  },
  {
    "path": "backend/utils/file_handler.py",
    "chars": 4496,
    "preview": "\"\"\"\nFile and directory handling utilities\n\"\"\"\nimport json\nimport shutil\nfrom datetime import datetime\nfrom pathlib impor"
  },
  {
    "path": "backend/utils/tensorboard_reader.py",
    "chars": 4685,
    "preview": "\"\"\"\nUtility to read and parse TensorBoard event files\n\"\"\"\nimport os\nfrom typing import Dict, List, Optional\nfrom tensorb"
  },
  {
    "path": "compose.yml",
    "chars": 678,
    "preview": "services:\n  model-downloader:\n    build:\n      target: model-downloader\n      context: .\n    image: vibevoicefusion:mode"
  },
  {
    "path": "config/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "config/configuration_vibevoice.py",
    "chars": 15875,
    "preview": "\"\"\" VibeVoice_AcousticTokenizer model configuration\"\"\"\nfrom enum import Enum\nfrom typing import Dict, List, Optional\n\nfr"
  },
  {
    "path": "demo/README_AUDIO_DENOISE.md",
    "chars": 6521,
    "preview": "# Audio Denoising Tools\n\nThis directory contains two audio denoising scripts using state-of-the-art deep learning models"
  },
  {
    "path": "demo/audio_denoise_deepfilter.py",
    "chars": 13857,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nAudio Denoising Script using DeepFilterNet.\n\nDeepFilterNet is a state-of-the-art deep learnin"
  },
  {
    "path": "demo/audio_denose.py",
    "chars": 9517,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nAudio Denoising Script using SpeechBrain SepFormer model.\n\nThis script uses the speechbrain/s"
  },
  {
    "path": "demo/convert_model.py",
    "chars": 2250,
    "preview": "import argparse\nimport os\n\nimport torch\n\nfrom safetensors.torch import save_model\n\nfrom config.configuration_vibevoice i"
  },
  {
    "path": "demo/list_modules.py",
    "chars": 990,
    "preview": "import torch\nfrom torch import nn\nfrom util.safetensors_util import MemoryEfficientSafeOpen\nfrom config.configuration_vi"
  },
  {
    "path": "demo/local_file_inference.py",
    "chars": 14513,
    "preview": "import argparse\nimport os\nimport re\nfrom typing import List, Tuple\nimport time\nimport torch\n\nfrom vibevoice.modular.mode"
  },
  {
    "path": "demo/train.py",
    "chars": 812,
    "preview": "import argparse\nfrom uuid import uuid4\nfrom vibevoice.training.trainer import TrainConfig, VibeVoiceTrainer\nfrom vibevoi"
  },
  {
    "path": "demo/verify_dataset.py",
    "chars": 12838,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nVerify generated dataset for VibeVoice training.\n\nThis script validates the generated dataset"
  },
  {
    "path": "demo/view_tensorfile.py",
    "chars": 6739,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nTool to inspect safetensors files.\nCan display metadata and list tensor keys with optional re"
  },
  {
    "path": "demo/vram_offload_animation.py",
    "chars": 31765,
    "preview": "\"\"\"\nVRAM Layer Offloading Animation for VibeVoice\n\nDemonstrates the layer offloading mechanism during inference:\n- Layer"
  },
  {
    "path": "docs/APIs.md",
    "chars": 52812,
    "preview": "# VibeVoice API Documentation\n\nThis document provides complete API documentation for VibeVoice backend services.\n\n## Bas"
  },
  {
    "path": "docs/DATASET_PATH_FIX.md",
    "chars": 4114,
    "preview": "# Dataset Path Format Fix\n\n## Problem\nThe dataset management system was storing bare filenames in the `audio` and `voice"
  },
  {
    "path": "docs/DOCKER_REBUILD.md",
    "chars": 4209,
    "preview": "# Docker Rebuild Workflow for Code Updates\n\n## Problem\n\nSince this Dockerfile clones code from GitHub (no local context "
  },
  {
    "path": "docs/develop_thoughts.md",
    "chars": 12198,
    "preview": "\n## VibeVoiceFusion 开发感悟\n\n写在前面的话, 这篇文章主要是我开发[VibeVoiceFusion](https://github.com/zhao-kun/VibeVoiceFusion) 这个项目的一些体会, 经验"
  },
  {
    "path": "docs/model_components_analysis.md",
    "chars": 19343,
    "preview": "# VibeVoice Model Components Analysis\n\n## Overview\n\nThis document provides a detailed analysis of the VibeVoice model co"
  },
  {
    "path": "docs/multi-generation-ui-design.md",
    "chars": 17654,
    "preview": "# Multi-Generation Frontend UI Design\n\nThis document outlines the frontend UI design for supporting multiple voice gener"
  },
  {
    "path": "docs/offloading.md",
    "chars": 19425,
    "preview": "# Layer Offloading for VRAM Optimization\n\n## Table of Contents\n- [Overview](#overview)\n- [Architecture](#architecture)\n-"
  },
  {
    "path": "docs/openai-compatible-api.md",
    "chars": 12778,
    "preview": "# OpenAI-Compatible TTS API Design Document\n\n## Overview\n\nThis document describes the design of an OpenAI-compatible Tex"
  },
  {
    "path": "docs/preset-voice-feature.md",
    "chars": 9506,
    "preview": "# Preset Voice Feature\n\nThis document describes the preset voice feature that allows users to quickly create speakers us"
  },
  {
    "path": "docs/processor.md",
    "chars": 9955,
    "preview": "# VibeVoiceProcessor Documentation\n\n## Overview\n\nThe `VibeVoiceProcessor` is a unified processor that combines text toke"
  },
  {
    "path": "docs/quick-generate-feature.md",
    "chars": 23463,
    "preview": "# Quick Generate Feature Design\n\n## Problem Statement\n\nThe current VibeVoice workflow requires users to:\n\n1. Create a pr"
  },
  {
    "path": "docs/vibevoice_inference_architecture.md",
    "chars": 73285,
    "preview": "# VibeVoice Inference Architecture Documentation\n\nThis document provides a comprehensive guide to understanding the `Vib"
  },
  {
    "path": "frontend/.gitignore",
    "chars": 480,
    "preview": "# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.\n\n# dependencies\n/node_modules\n/.pn"
  },
  {
    "path": "frontend/README.md",
    "chars": 2166,
    "preview": "# VibeVoice Frontend\n\nThis is the web frontend for VibeVoice, a speech generation model built with Next.js, React, and T"
  },
  {
    "path": "frontend/app/dataset/detail/page.tsx",
    "chars": 16986,
    "preview": "\"use client\";\n\nimport { useEffect, useState, useCallback, Suspense } from \"react\";\nimport { useRouter, useSearchParams }"
  },
  {
    "path": "frontend/app/dataset/page.tsx",
    "chars": 7205,
    "preview": "\"use client\";\n\nimport { useEffect, useState } from \"react\";\nimport { useRouter } from \"next/navigation\";\nimport { usePro"
  },
  {
    "path": "frontend/app/fine-tuning/page.tsx",
    "chars": 5001,
    "preview": "'use client';\n\nimport { useEffect } from 'react';\nimport { useRouter } from 'next/navigation';\nimport { useProject } fro"
  },
  {
    "path": "frontend/app/generate-voice/page.tsx",
    "chars": 5032,
    "preview": "'use client';\n\nimport { useEffect } from 'react';\nimport { useRouter } from 'next/navigation';\nimport { useProject } fro"
  },
  {
    "path": "frontend/app/globals.css",
    "chars": 2058,
    "preview": "@import \"tailwindcss\";\n\n:root {\n  --background: #ffffff;\n  --foreground: #171717;\n}\n\n@theme inline {\n  --color-backgroun"
  },
  {
    "path": "frontend/app/layout.tsx",
    "chars": 1937,
    "preview": "import type { Metadata } from \"next\";\nimport { Geist, Geist_Mono } from \"next/font/google\";\nimport \"./globals.css\";\nimpo"
  },
  {
    "path": "frontend/app/page.tsx",
    "chars": 126,
    "preview": "import ProjectSelector from \"@/components/ProjectSelector\";\n\nexport default function Home() {\n  return <ProjectSelector "
  },
  {
    "path": "frontend/app/quick-generate/page.tsx",
    "chars": 57876,
    "preview": "'use client';\n\nimport React, { useState, useRef, useEffect, useCallback, Suspense } from 'react';\nimport { useRouter, us"
  },
  {
    "path": "frontend/app/speaker-role/page.tsx",
    "chars": 2536,
    "preview": "\"use client\";\n\nimport { useEffect } from \"react\";\nimport { useRouter } from \"next/navigation\";\nimport { useProject } fro"
  },
  {
    "path": "frontend/app/voice-editor/page.tsx",
    "chars": 9848,
    "preview": "\"use client\";\n\nimport { useState, useEffect, useMemo, Suspense } from \"react\";\nimport { useRouter, useSearchParams } fro"
  },
  {
    "path": "frontend/components/AudioPlayer.tsx",
    "chars": 12857,
    "preview": "\"use client\";\n\nimport React, { useRef, useState, useEffect } from \"react\";\nimport WaveSurfer from \"wavesurfer.js\";\nimpor"
  },
  {
    "path": "frontend/components/AudioUploader.tsx",
    "chars": 3921,
    "preview": "\"use client\";\n\nimport React, { useRef, useState } from \"react\";\nimport { ACCEPTED_AUDIO_TYPES, ACCEPTED_AUDIO_EXTENSIONS"
  },
  {
    "path": "frontend/components/CreateDatasetModal.tsx",
    "chars": 3957,
    "preview": "\"use client\";\n\nimport { useState } from \"react\";\nimport { useLanguage } from \"@/lib/i18n/LanguageContext\";\n\ninterface Cr"
  },
  {
    "path": "frontend/components/CurrentGeneration.tsx",
    "chars": 28357,
    "preview": "'use client';\n\nimport React, { useState } from 'react';\nimport { useGeneration } from '@/lib/GenerationContext';\nimport "
  },
  {
    "path": "frontend/components/CurrentTraining.tsx",
    "chars": 26356,
    "preview": "'use client';\n\nimport React, { useState } from 'react';\nimport { useTraining } from '@/lib/TrainingContext';\nimport { us"
  },
  {
    "path": "frontend/components/DatasetCard.tsx",
    "chars": 10232,
    "preview": "\"use client\";\n\nimport { useState } from \"react\";\nimport { useLanguage } from \"@/lib/i18n/LanguageContext\";\nimport Import"
  },
  {
    "path": "frontend/components/DatasetItemModal.tsx",
    "chars": 9619,
    "preview": "\"use client\";\n\nimport { useState, useRef } from \"react\";\nimport { useLanguage } from \"@/lib/i18n/LanguageContext\";\nimpor"
  },
  {
    "path": "frontend/components/DatasetItemRow.tsx",
    "chars": 6018,
    "preview": "\"use client\";\n\nimport { useState } from \"react\";\nimport { useLanguage } from \"@/lib/i18n/LanguageContext\";\nimport Inline"
  },
  {
    "path": "frontend/components/DialogEditor.tsx",
    "chars": 14643,
    "preview": "\"use client\";\n\nimport { useState, useEffect } from \"react\";\nimport { DialogLine, SpeakerInfo, SessionMode } from \"@/type"
  },
  {
    "path": "frontend/components/DialogPreview.tsx",
    "chars": 4271,
    "preview": "\"use client\";\n\nimport { DialogLine, SpeakerInfo } from \"@/types/dialog\";\nimport { useRef } from \"react\";\nimport { useLan"
  },
  {
    "path": "frontend/components/GenerationForm.tsx",
    "chars": 23789,
    "preview": "'use client';\n\nimport React, { useState, useEffect } from 'react';\nimport { useSession } from '@/lib/SessionContext';\nim"
  },
  {
    "path": "frontend/components/GenerationHistory.tsx",
    "chars": 45915,
    "preview": "'use client';\n\nimport React, { useState, useMemo, useCallback } from 'react';\nimport { useProject } from '@/lib/ProjectC"
  },
  {
    "path": "frontend/components/ImportDatasetModal.tsx",
    "chars": 7215,
    "preview": "\"use client\";\n\nimport { useState, useRef } from \"react\";\nimport { useLanguage } from \"@/lib/i18n/LanguageContext\";\n\ninte"
  },
  {
    "path": "frontend/components/InlineAudioPlayer.tsx",
    "chars": 2951,
    "preview": "\"use client\";\n\nimport { useState, useRef, useEffect } from \"react\";\n\ninterface InlineAudioPlayerProps {\n  audioUrl: stri"
  },
  {
    "path": "frontend/components/LayoutWrapper.tsx",
    "chars": 972,
    "preview": "\"use client\";\n\nimport { usePathname } from \"next/navigation\";\nimport Navigation from \"@/components/Navigation\";\nimport Q"
  },
  {
    "path": "frontend/components/Navigation.tsx",
    "chars": 16691,
    "preview": "\"use client\";\n\nimport Link from \"next/link\";\nimport Image from \"next/image\";\nimport { usePathname, useRouter } from \"nex"
  },
  {
    "path": "frontend/components/PresetVoiceManager.tsx",
    "chars": 22850,
    "preview": "\"use client\";\n\nimport { useState, useEffect, useRef } from \"react\";\nimport { usePresetVoice, PresetVoiceProvider } from "
  },
  {
    "path": "frontend/components/PresetVoiceSelector.tsx",
    "chars": 9771,
    "preview": "\"use client\";\n\nimport React, { useState, useEffect, useRef } from \"react\";\nimport { api } from \"@/lib/api\";\nimport { use"
  },
  {
    "path": "frontend/components/ProjectSelector.tsx",
    "chars": 13323,
    "preview": "\"use client\";\n\nimport { useState, useMemo } from \"react\";\nimport Image from \"next/image\";\nimport { useProject } from \"@/"
  },
  {
    "path": "frontend/components/QuickGenerateHistory.tsx",
    "chars": 20194,
    "preview": "'use client';\n\nimport React, { useState, useCallback, useEffect } from 'react';\nimport { useLanguage } from '@/lib/i18n/"
  },
  {
    "path": "frontend/components/QuickGenerateNavigation.tsx",
    "chars": 9353,
    "preview": "\"use client\";\n\nimport Link from \"next/link\";\nimport Image from \"next/image\";\nimport { useRouter } from \"next/navigation\""
  },
  {
    "path": "frontend/components/SessionManager.tsx",
    "chars": 18329,
    "preview": "\"use client\";\n\nimport { useState } from \"react\";\nimport { useRouter } from \"next/navigation\";\nimport { useSession } from"
  },
  {
    "path": "frontend/components/SpeakerList.tsx",
    "chars": 2811,
    "preview": "\"use client\";\n\nimport { Speaker } from \"@/types/speaker\";\n\ninterface SpeakerListProps {\n  speakers: Speaker[];\n  activeS"
  },
  {
    "path": "frontend/components/SpeakerRoleManager.tsx",
    "chars": 17795,
    "preview": "\"use client\";\n\nimport React, { useState, useEffect } from \"react\";\nimport { useSpeakerRole } from \"@/lib/SpeakerRoleCont"
  },
  {
    "path": "frontend/components/SpeakerSelector.tsx",
    "chars": 2628,
    "preview": "\"use client\";\n\nimport { SpeakerInfo, SessionMode } from \"@/types/dialog\";\nimport { useLanguage } from \"@/lib/i18n/Langua"
  },
  {
    "path": "frontend/components/TextEditor.tsx",
    "chars": 1999,
    "preview": "\"use client\";\n\nimport { Speaker } from \"@/types/speaker\";\n\ninterface TextEditorProps {\n  speaker: Speaker | null;\n  onCo"
  },
  {
    "path": "frontend/components/TrainingForm.tsx",
    "chars": 28184,
    "preview": "'use client';\n\nimport React, { useState } from 'react';\nimport { useTraining } from '@/lib/TrainingContext';\nimport { us"
  },
  {
    "path": "frontend/components/TrainingHistory.tsx",
    "chars": 32013,
    "preview": "'use client';\n\nimport React, { useState, useMemo, useCallback } from 'react';\nimport { useTraining } from '@/lib/Trainin"
  },
  {
    "path": "frontend/components/TrainingMetricsChart.tsx",
    "chars": 16750,
    "preview": "'use client';\n\nimport React, { useEffect, useState } from 'react';\nimport { LineChart, Line, XAxis, YAxis, CartesianGrid"
  },
  {
    "path": "frontend/components/VoicePreview.tsx",
    "chars": 5294,
    "preview": "\"use client\";\n\nimport { Speaker } from \"@/types/speaker\";\nimport { useRef, useState } from \"react\";\n\ninterface VoicePrev"
  },
  {
    "path": "frontend/components/VoiceRecorder.tsx",
    "chars": 12592,
    "preview": "\"use client\";\n\nimport React, { useState, useRef, useEffect } from \"react\";\nimport { convertToWav } from \"@/lib/audioUtil"
  },
  {
    "path": "frontend/eslint.config.mjs",
    "chars": 524,
    "preview": "import { dirname } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport { FlatCompat } from \"@eslint/eslintrc\";\n\ncon"
  },
  {
    "path": "frontend/lib/DatasetContext.tsx",
    "chars": 5151,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect, useCallback } from \"react\";\nimport { api,"
  },
  {
    "path": "frontend/lib/DatasetItemsContext.tsx",
    "chars": 6527,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect, useCallback } from \"react\";\nimport { api,"
  },
  {
    "path": "frontend/lib/GenerationContext.tsx",
    "chars": 10327,
    "preview": "'use client';\n\nimport React, { createContext, useContext, useState, useEffect, useCallback, useRef } from 'react';\nimpor"
  },
  {
    "path": "frontend/lib/GlobalTaskContext.tsx",
    "chars": 1643,
    "preview": "'use client';\n\nimport React, { createContext, useContext, useState, useEffect, useCallback, useRef } from 'react';\nimpor"
  },
  {
    "path": "frontend/lib/PresetVoiceContext.tsx",
    "chars": 5559,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useCallback, ReactNode } from \"react\";\nimport { api "
  },
  {
    "path": "frontend/lib/ProjectContext.tsx",
    "chars": 4706,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect } from \"react\";\nimport { Project, ProjectC"
  },
  {
    "path": "frontend/lib/SessionContext.tsx",
    "chars": 13481,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect } from \"react\";\nimport { DialogSession, Di"
  },
  {
    "path": "frontend/lib/SpeakerRoleContext.tsx",
    "chars": 8861,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect } from \"react\";\nimport { SpeakerRole } fro"
  },
  {
    "path": "frontend/lib/TrainingContext.tsx",
    "chars": 11386,
    "preview": "'use client';\n\nimport React, { createContext, useContext, useState, useEffect, useCallback, useRef } from 'react';\nimpor"
  },
  {
    "path": "frontend/lib/api.ts",
    "chars": 29739,
    "preview": "/**\n * Backend API client for VibeVoice\n */\n\nimport type {\n  CreateGenerationRequest,\n  CreateGenerationResponse,\n  Curr"
  },
  {
    "path": "frontend/lib/audioUtils.ts",
    "chars": 2743,
    "preview": "/**\n * Audio utility functions for converting and encoding audio\n */\n\n/**\n * Convert an audio blob to WAV format\n * @par"
  },
  {
    "path": "frontend/lib/i18n/LanguageContext.tsx",
    "chars": 3446,
    "preview": "\"use client\";\n\nimport React, { createContext, useContext, useState, useEffect, useCallback } from 'react';\nimport type {"
  },
  {
    "path": "frontend/lib/i18n/config.ts",
    "chars": 138,
    "preview": "export const i18n = {\n  defaultLocale: 'en',\n  locales: ['en', 'zh'],\n} as const;\n\nexport type Locale = (typeof i18n)['l"
  },
  {
    "path": "frontend/lib/i18n/locales/en.json",
    "chars": 35508,
    "preview": "{\n  \"common\": {\n    \"loading\": \"Loading...\",\n    \"save\": \"Save\",\n    \"cancel\": \"Cancel\",\n    \"delete\": \"Delete\",\n    \"ed"
  },
  {
    "path": "frontend/lib/i18n/locales/zh.json",
    "chars": 24399,
    "preview": "{\n  \"common\": {\n    \"loading\": \"加载中...\",\n    \"save\": \"保存\",\n    \"cancel\": \"取消\",\n    \"delete\": \"删除\",\n    \"edit\": \"编辑\",\n   "
  },
  {
    "path": "frontend/next.config.ts",
    "chars": 546,
    "preview": "import type { NextConfig } from \"next\";\n\nconst nextConfig: NextConfig = {\n  /* config options here */\n  // Only use expo"
  },
  {
    "path": "frontend/package.json",
    "chars": 791,
    "preview": "{\n  \"name\": \"frontend\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"version\": \"node scripts/generate-ve"
  },
  {
    "path": "frontend/postcss.config.mjs",
    "chars": 81,
    "preview": "const config = {\n  plugins: [\"@tailwindcss/postcss\"],\n};\n\nexport default config;\n"
  },
  {
    "path": "frontend/public/icon-preview.html",
    "chars": 6077,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width"
  },
  {
    "path": "frontend/public/icon-rect-preview.html",
    "chars": 8571,
    "preview": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width"
  },
  {
    "path": "frontend/public/site.webmanifest",
    "chars": 1070,
    "preview": "{\n  \"name\": \"VibeVoiceFusion - Multi-Speaker Voice Editor\",\n  \"short_name\": \"VibeVoiceFusion\",\n  \"description\": \"Text-to"
  },
  {
    "path": "frontend/scripts/generate-version.js",
    "chars": 1803,
    "preview": "#!/usr/bin/env node\n/**\n * Generate version string from git describe\n * Format: {tag}-g{commit_id} (e.g., v2.0.0-1-gc834"
  },
  {
    "path": "frontend/tsconfig.json",
    "chars": 598,
    "preview": "{\n  \"compilerOptions\": {\n    \"target\": \"ES2017\",\n    \"lib\": [\"dom\", \"dom.iterable\", \"esnext\"],\n    \"allowJs\": true,\n    "
  },
  {
    "path": "frontend/types/dialog.ts",
    "chars": 788,
    "preview": "export interface DialogLine {\n  id: string;\n  speakerId: string;\n  content: string;\n}\n\nexport interface SpeakerInfo {\n  "
  },
  {
    "path": "frontend/types/generation.ts",
    "chars": 9007,
    "preview": "/**\n * Generation types for Voice Generation API\n */\n\n/**\n * Inference phase enum matching backend InferencePhase\n * Not"
  },
  {
    "path": "frontend/types/preset.ts",
    "chars": 1264,
    "preview": "/**\n * Preset voice types for VibeVoice\n *\n * Preset voices use filename-based metadata.\n * Filename convention: {langua"
  },
  {
    "path": "frontend/types/project.ts",
    "chars": 516,
    "preview": "export interface Project {\n  id: string;\n  name: string;\n  description: string;\n  createdAt: Date;\n  updatedAt: Date;\n}\n"
  },
  {
    "path": "frontend/types/quickGenerate.ts",
    "chars": 4448,
    "preview": "/**\n * Quick Generate types for Quick Voice Generation API\n */\n\nimport { InferencePhase, ModelDtype, OffloadingConfig } "
  },
  {
    "path": "frontend/types/speaker.ts",
    "chars": 1284,
    "preview": "export interface Speaker {\n  id: string;\n  name: string;\n  voiceFile: string | null;\n  content: string;\n}\n\nexport interf"
  },
  {
    "path": "frontend/types/task.ts",
    "chars": 1777,
    "preview": "/**\n * Unified Task types for the Task API\n */\n\nimport type { Generation } from './generation';\nimport type { TrainingSt"
  },
  {
    "path": "frontend/types/training.ts",
    "chars": 5625,
    "preview": "/**\n * Training types for Fine-tuning Training API\n */\n\n/**\n * Backend training status values\n */\nexport type TrainingSt"
  },
  {
    "path": "pyproject.toml",
    "chars": 1227,
    "preview": "[build-system]\nrequires = [\"setuptools>=61.0\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"vibevoice\"\nver"
  },
  {
    "path": "rebuild.sh",
    "chars": 969,
    "preview": "#!/bin/bash\n# Helper script for rebuilding Docker images with code updates\n\nset -e\n\necho \"🔄 Rebuilding VibeVoice with fr"
  },
  {
    "path": "test_generation_offloading.py",
    "chars": 23624,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nTest script for voice generation with layer offloading support.\n\nThis script tests the comple"
  },
  {
    "path": "test_offloading.py",
    "chars": 9573,
    "preview": "#!/usr/bin/env python3\n\"\"\"\nTest script for layer offloading functionality.\n\nThis script tests the layer offloading imple"
  },
  {
    "path": "tests/test_logging.py",
    "chars": 6690,
    "preview": "\"\"\"\nTest cases for the logger module.\nDemonstrates the flexibility and ease of use.\n\"\"\"\n\nimport logging\nimport tempfile\n"
  },
  {
    "path": "tests/test_lora_network.py",
    "chars": 6515,
    "preview": "import pytest\nimport re\nfrom vibevoice.lora.lora_network import LoRANetwork\n\n\nclass TestLoRANetwork:\n    \"\"\"Test cases f"
  },
  {
    "path": "tests/test_training_service.py",
    "chars": 13021,
    "preview": "\"\"\"\nUnit tests for TrainingService - specifically testing datetime JSON serialization\n\"\"\"\nimport json\nimport pytest\nimpo"
  },
  {
    "path": "tokenizer/tokenizer.json",
    "chars": 6244779,
    "preview": "{\n  \"version\": \"1.0\",\n  \"truncation\": null,\n  \"padding\": null,\n  \"added_tokens\": [\n    {\n      \"id\": 151643,\n      \"cont"
  },
  {
    "path": "tokenizer/tokenizer_config.json",
    "chars": 7228,
    "preview": "{\n  \"add_bos_token\": false,\n  \"add_prefix_space\": false,\n  \"added_tokens_decoder\": {\n    \"151643\": {\n      \"content\": \"<"
  },
  {
    "path": "tokenizer/vocab.json",
    "chars": 2383319,
    "preview": "{\"!\":0,\"\\\"\":1,\"#\":2,\"$\":3,\"%\":4,\"&\":5,\"'\":6,\"(\":7,\")\":8,\"*\":9,\"+\":10,\",\":11,\"-\":12,\".\":13,\"/\":14,\"0\":15,\"1\":16,\"2\":17,\"3"
  },
  {
    "path": "util/LOGGING_README.md",
    "chars": 10491,
    "preview": "# VibeVoice Logging System\n\nA flexible, production-ready logging system that follows software design principles:\n- **Con"
  },
  {
    "path": "util/__init__.py",
    "chars": 112,
    "preview": "import os\n\nvibevoice_root_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))\n"
  },
  {
    "path": "util/float8_scale.py",
    "chars": 9775,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom typing import Union, List\n\nfrom util.vibevoice_norm import QwenRMSNorm\n\n# These"
  },
  {
    "path": "util/logger.py",
    "chars": 9438,
    "preview": "\"\"\"\nLogging configuration for VibeVoice backend.\n\nThis module provides a flexible logging system that follows:\n- Convent"
  },
  {
    "path": "util/logger_examples.py",
    "chars": 1596,
    "preview": "\"\"\"\nSimple examples demonstrating the logger usage.\n\"\"\"\n\nimport sys\nimport os\nsys.path.insert(0, os.path.dirname(os.path"
  },
  {
    "path": "util/model_utils.py",
    "chars": 7427,
    "preview": "import hashlib\nimport safetensors.torch\nimport torch\n\nfrom io import BytesIO\nfrom torch import nn\nfrom util.logger impor"
  },
  {
    "path": "util/rand_init.py",
    "chars": 465,
    "preview": "import torch\n\nfrom typing import Optional\n\n_random_generator: Optional[torch.Generator] = None\n\ndef get_generator(seeds:"
  },
  {
    "path": "util/safetensors_util.py",
    "chars": 10533,
    "preview": "import numpy as np\nimport torch\nimport json\nimport struct\nfrom typing import Dict, Any, Union, Optional\n\nclass Safetenso"
  },
  {
    "path": "util/vibevoice_norm.py",
    "chars": 1784,
    "preview": "import torch\nimport torch.nn as nn\n\nimport typing as tp\n\nclass QwenRMSNorm(nn.Module):\n    def __init__(self, hidden_siz"
  },
  {
    "path": "vibevoice/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/configs/qwen2.5_1.5b_64k.json",
    "chars": 2692,
    "preview": "{\n  \"_attn_implementation_autoset\": true,\n  \"acoustic_vae_dim\": 64,\n  \"acoustic_tokenizer_config\": {\n    \"causal\": true,"
  },
  {
    "path": "vibevoice/configs/qwen2.5_7b_32k.json",
    "chars": 2730,
    "preview": "{\n  \"_attn_implementation_autoset\": true,\n  \"acoustic_vae_dim\": 64,\n  \"acoustic_tokenizer_config\": {\n    \"causal\": true,"
  },
  {
    "path": "vibevoice/generation/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/generation/visitor.py",
    "chars": 1669,
    "preview": "\nfrom abc import ABC, abstractmethod\nfrom typing import List\n\nclass GenerationVisitor(ABC):\n\n    @abstractmethod\n    def"
  },
  {
    "path": "vibevoice/lora/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/lora/lora_network.py",
    "chars": 17235,
    "preview": "import torch\nimport math\nimport re\nimport os\n\nfrom typing import List, Optional, Union, Type, Dict\nfrom torch import nn\n"
  },
  {
    "path": "vibevoice/modular/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/modular/adaptive_offload.py",
    "chars": 14181,
    "preview": "\"\"\"\nAdaptive offload management with VRAM estimation.\n\nThis module provides intelligent VRAM estimation and automatic co"
  },
  {
    "path": "vibevoice/modular/custom_offloading_utils.py",
    "chars": 34072,
    "preview": "\"\"\"\nLayer-wise CPU<->GPU offloading utilities for VRAM optimization.\n\nThis module provides automatic layer offloading fo"
  },
  {
    "path": "vibevoice/modular/modeling_vibevoice.py",
    "chars": 22359,
    "preview": "from dataclasses import dataclass\nfrom typing import Dict, List, Optional, Tuple, Union\nimport torch\nimport torch.nn as "
  },
  {
    "path": "vibevoice/modular/modeling_vibevoice_inference.py",
    "chars": 73668,
    "preview": "import os\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport inspect\n\nfrom dataclasses import dat"
  },
  {
    "path": "vibevoice/modular/modular_vibevoice_diffusion_head.py",
    "chars": 8961,
    "preview": "import math\n\nimport torch\nimport torch.nn as nn\n\nfrom transformers.activations import ACT2FN\nfrom util.float8_scale impo"
  },
  {
    "path": "vibevoice/modular/modular_vibevoice_qwen.py",
    "chars": 24606,
    "preview": "import torch\nimport torch.nn as nn\nfrom typing import Optional, Tuple, Union\n\nfrom transformers.cache_utils import Cache"
  },
  {
    "path": "vibevoice/modular/modular_vibevoice_text_tokenizer.py",
    "chars": 7014,
    "preview": "\"\"\"Tokenization classes for vibevoice.\"\"\"\n\nfrom transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer\nfrom "
  },
  {
    "path": "vibevoice/modular/modular_vibevoice_tokenizer.py",
    "chars": 51303,
    "preview": "import math\nimport typing as tp\nfrom functools import partial\nfrom dataclasses import dataclass\nfrom typing import Optio"
  },
  {
    "path": "vibevoice/modular/streamer.py",
    "chars": 9479,
    "preview": "from __future__ import annotations\n\nimport torch\n\nimport asyncio\nfrom queue import Queue\nfrom typing import TYPE_CHECKIN"
  },
  {
    "path": "vibevoice/processor/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/processor/vibevoice_processor.py",
    "chars": 30142,
    "preview": "import math\nfrom typing import List, Optional, Union, Dict, Any, Tuple\nimport os\nimport re\n\nimport numpy as np\nimport to"
  },
  {
    "path": "vibevoice/processor/vibevoice_tokenizer_processor.py",
    "chars": 17823,
    "preview": "\"\"\"\nProcessor class for VibeVoice models.\n\"\"\"\n\nimport os\nfrom typing import List, Optional, Union, Dict, Any\n\nimport num"
  },
  {
    "path": "vibevoice/schedule/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/schedule/dpm_solver.py",
    "chars": 49945,
    "preview": "# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Versio"
  },
  {
    "path": "vibevoice/schedule/timestep_sampler.py",
    "chars": 737,
    "preview": "import math\nimport torch\n\nclass UniformSampler:\n    def __init__(self, timesteps: int = 1000):\n        self.timesteps = "
  },
  {
    "path": "vibevoice/scripts/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "vibevoice/scripts/convert_nnscaler_checkpoint_to_transformers.py",
    "chars": 6211,
    "preview": "#!/usr/bin/env python\n# coding=utf-8\n\nimport argparse\nimport json\nimport os\nfrom pathlib import Path\nimport re\nimport to"
  },
  {
    "path": "vibevoice/training/dataset.py",
    "chars": 20728,
    "preview": "import math\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List, Optional, Sequence, Tuple, Union\n\nimpo"
  },
  {
    "path": "vibevoice/training/fake_trainer.py",
    "chars": 9219,
    "preview": "import os\n\nfrom typing import Optional\nfrom datetime import datetime\n\nfrom util.logger import get_logger\n\nfrom vibevoice"
  },
  {
    "path": "vibevoice/training/summary_visitor.py",
    "chars": 6241,
    "preview": "import os\nfrom torch.utils.tensorboard import SummaryWriter\n\nfrom vibevoice.training.trainer_visitor import TrainerVisit"
  }
]

// ... and 2 more files (download for full content)
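
Two of the manifest entries above (backend/services/preset_voice_service.py and frontend/types/preset.ts) note that preset-voice metadata is parsed from filenames using the convention {language}-{name}_{gender} plus an optional suffix that is truncated in both previews. A minimal, hypothetical Python sketch of such a parser, which deliberately leaves the truncated suffix as opaque extra tokens rather than guessing at its grammar:

from pathlib import Path
from typing import Dict

def parse_preset_filename(path: str) -> Dict[str, object]:
    # Drop directory and extension: "en-Alice_female.wav" -> "en-Alice_female"
    stem = Path(path).stem
    language, sep, rest = stem.partition("-")
    if not sep:
        raise ValueError(f"not a preset-voice filename: {path}")
    name, _, tail = rest.partition("_")
    parts = tail.split("_") if tail else [""]
    gender, extras = parts[0], parts[1:]
    return {
        "language": language,
        "name": name,
        "gender": gender,
        # Whatever the truncated optional suffix encodes is kept verbatim here.
        "extras": extras,
    }

# parse_preset_filename("en-Alice_female.wav")
# -> {"language": "en", "name": "Alice", "gender": "female", "extras": []}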

About this extraction

This page contains the full source code of the zhao-kun/VibeVoiceFusion GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 202 files (10.3 MB), approximately 2.7M tokens, and a symbol index with 1324 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
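
Because the listing above is a JSON array of {path, chars, preview} objects, the extraction can also be sliced programmatically before being handed to a model. A minimal sketch, assuming the array has been saved locally under the hypothetical name manifest.json, that totals character counts per top-level path component:

import json
from collections import defaultdict

# Assumption: the JSON array shown above was saved as "manifest.json"
# (a hypothetical filename; the page itself ships a single .txt download).
with open("manifest.json", encoding="utf-8") as f:
    entries = json.load(f)

# Total the per-file character counts by top-level path component so the
# large areas of the repo (backend, frontend, vibevoice, ...) stand out.
sizes = defaultdict(int)
for entry in entries:
    top = entry["path"].split("/", 1)[0]
    sizes[top] += entry["chars"]

for top, chars in sorted(sizes.items(), key=lambda kv: -kv[1]):
    print(f"{top:20s} {chars:>10,d} chars")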

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.
