Repository: zhao-kun/VibeVoiceFusion Branch: main Commit: b3766532d8b0 Files: 202 Total size: 10.3 MB Directory structure: gitextract__vtvi0gi/ ├── .dockerignore ├── .gitignore ├── CHANGELOG.md ├── CHANGELOG_zh.md ├── Dockerfile ├── README.md ├── README_zh.md ├── backend/ │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── api/ │ │ ├── __init__.py │ │ ├── datasets.py │ │ ├── dialog_sessions.py │ │ ├── generation.py │ │ ├── openai_compat.py │ │ ├── preset_voices.py │ │ ├── projects.py │ │ ├── quick_generate.py │ │ ├── speakers.py │ │ ├── tasks.py │ │ └── training.py │ ├── app.py │ ├── config.py │ ├── i18n/ │ │ ├── __init__.py │ │ ├── en.json │ │ └── zh.json │ ├── inference/ │ │ ├── inference.py │ │ └── quick_generate_inference.py │ ├── run.py │ ├── scripts/ │ │ ├── generate_cantonese_training_dataset.py │ │ ├── generate_mcv_cantonese_training_dataset.py │ │ ├── generate_training_dataset.py │ │ └── migrate_dataset_paths.py │ ├── services/ │ │ ├── __init__.py │ │ ├── dataset_service.py │ │ ├── dialog_session_service.py │ │ ├── openai_compat_service.py │ │ ├── preset_voice_service.py │ │ ├── project_service.py │ │ ├── quick_generate_service.py │ │ ├── speaker_service.py │ │ ├── training_service.py │ │ └── voice_gerneration_service.py │ ├── task_manager/ │ │ ├── inference_task.py │ │ ├── quick_generate_task.py │ │ ├── task.py │ │ └── training_task.py │ ├── training/ │ │ ├── engine.py │ │ └── state.py │ └── utils/ │ ├── __init__.py │ ├── dialog_validator.py │ ├── file_handler.py │ └── tensorboard_reader.py ├── compose.yml ├── config/ │ ├── __init__.py │ └── configuration_vibevoice.py ├── demo/ │ ├── README_AUDIO_DENOISE.md │ ├── audio_denoise_deepfilter.py │ ├── audio_denose.py │ ├── convert_model.py │ ├── list_modules.py │ ├── local_file_inference.py │ ├── train.py │ ├── verify_dataset.py │ ├── view_tensorfile.py │ └── vram_offload_animation.py ├── docs/ │ ├── APIs.md │ ├── DATASET_PATH_FIX.md │ ├── DOCKER_REBUILD.md │ ├── develop_thoughts.md │ ├── 
model_components_analysis.md │ ├── multi-generation-ui-design.md │ ├── offloading.md │ ├── openai-compatible-api.md │ ├── preset-voice-feature.md │ ├── processor.md │ ├── quick-generate-feature.md │ └── vibevoice_inference_architecture.md ├── frontend/ │ ├── .gitignore │ ├── README.md │ ├── app/ │ │ ├── dataset/ │ │ │ ├── detail/ │ │ │ │ └── page.tsx │ │ │ └── page.tsx │ │ ├── fine-tuning/ │ │ │ └── page.tsx │ │ ├── generate-voice/ │ │ │ └── page.tsx │ │ ├── globals.css │ │ ├── layout.tsx │ │ ├── page.tsx │ │ ├── quick-generate/ │ │ │ └── page.tsx │ │ ├── speaker-role/ │ │ │ └── page.tsx │ │ └── voice-editor/ │ │ └── page.tsx │ ├── components/ │ │ ├── AudioPlayer.tsx │ │ ├── AudioUploader.tsx │ │ ├── CreateDatasetModal.tsx │ │ ├── CurrentGeneration.tsx │ │ ├── CurrentTraining.tsx │ │ ├── DatasetCard.tsx │ │ ├── DatasetItemModal.tsx │ │ ├── DatasetItemRow.tsx │ │ ├── DialogEditor.tsx │ │ ├── DialogPreview.tsx │ │ ├── GenerationForm.tsx │ │ ├── GenerationHistory.tsx │ │ ├── ImportDatasetModal.tsx │ │ ├── InlineAudioPlayer.tsx │ │ ├── LayoutWrapper.tsx │ │ ├── Navigation.tsx │ │ ├── PresetVoiceManager.tsx │ │ ├── PresetVoiceSelector.tsx │ │ ├── ProjectSelector.tsx │ │ ├── QuickGenerateHistory.tsx │ │ ├── QuickGenerateNavigation.tsx │ │ ├── SessionManager.tsx │ │ ├── SpeakerList.tsx │ │ ├── SpeakerRoleManager.tsx │ │ ├── SpeakerSelector.tsx │ │ ├── TextEditor.tsx │ │ ├── TrainingForm.tsx │ │ ├── TrainingHistory.tsx │ │ ├── TrainingMetricsChart.tsx │ │ ├── VoicePreview.tsx │ │ └── VoiceRecorder.tsx │ ├── eslint.config.mjs │ ├── lib/ │ │ ├── DatasetContext.tsx │ │ ├── DatasetItemsContext.tsx │ │ ├── GenerationContext.tsx │ │ ├── GlobalTaskContext.tsx │ │ ├── PresetVoiceContext.tsx │ │ ├── ProjectContext.tsx │ │ ├── SessionContext.tsx │ │ ├── SpeakerRoleContext.tsx │ │ ├── TrainingContext.tsx │ │ ├── api.ts │ │ ├── audioUtils.ts │ │ └── i18n/ │ │ ├── LanguageContext.tsx │ │ ├── config.ts │ │ └── locales/ │ │ ├── en.json │ │ └── zh.json │ ├── next.config.ts │ ├── 
package.json │ ├── postcss.config.mjs │ ├── public/ │ │ ├── icon-preview.html │ │ ├── icon-rect-preview.html │ │ └── site.webmanifest │ ├── scripts/ │ │ └── generate-version.js │ ├── tsconfig.json │ └── types/ │ ├── dialog.ts │ ├── generation.ts │ ├── preset.ts │ ├── project.ts │ ├── quickGenerate.ts │ ├── speaker.ts │ ├── task.ts │ └── training.ts ├── pyproject.toml ├── rebuild.sh ├── test_generation_offloading.py ├── test_offloading.py ├── tests/ │ ├── test_logging.py │ ├── test_lora_network.py │ └── test_training_service.py ├── tokenizer/ │ ├── tokenizer.json │ ├── tokenizer_config.json │ └── vocab.json ├── util/ │ ├── LOGGING_README.md │ ├── __init__.py │ ├── float8_scale.py │ ├── logger.py │ ├── logger_examples.py │ ├── model_utils.py │ ├── rand_init.py │ ├── safetensors_util.py │ └── vibevoice_norm.py └── vibevoice/ ├── __init__.py ├── configs/ │ ├── qwen2.5_1.5b_64k.json │ └── qwen2.5_7b_32k.json ├── generation/ │ ├── __init__.py │ └── visitor.py ├── lora/ │ ├── __init__.py │ └── lora_network.py ├── modular/ │ ├── __init__.py │ ├── adaptive_offload.py │ ├── custom_offloading_utils.py │ ├── modeling_vibevoice.py │ ├── modeling_vibevoice_inference.py │ ├── modular_vibevoice_diffusion_head.py │ ├── modular_vibevoice_qwen.py │ ├── modular_vibevoice_text_tokenizer.py │ ├── modular_vibevoice_tokenizer.py │ └── streamer.py ├── processor/ │ ├── __init__.py │ ├── vibevoice_processor.py │ └── vibevoice_tokenizer_processor.py ├── schedule/ │ ├── __init__.py │ ├── dpm_solver.py │ └── timestep_sampler.py ├── scripts/ │ ├── __init__.py │ └── convert_nnscaler_checkpoint_to_transformers.py └── training/ ├── dataset.py ├── fake_trainer.py ├── summary_visitor.py ├── trainer.py └── trainer_visitor.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # This .dockerignore is mostly irrelevant since we 
don't COPY from local context # The Dockerfile clones everything from GitHub # However, we keep this to prevent accidentally copying if someone modifies the Dockerfile # Everything - we clone from GitHub instead * # Exception: Allow Dockerfile itself for reference (not actually copied in our Dockerfile) !Dockerfile ================================================ FILE: .gitignore ================================================ .claude CLAUDE.md .vscode/ uv.lock media/ demo/example/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py.cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. 
#Pipfile.lock # UV # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. #uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock #poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. # https://pdm-project.org/en/latest/usage/project/#working-with-version-control #pdm.lock #pdm.toml .pdm-python .pdm-build/ # pixi # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. #pixi.lock # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one # in the .venv directory. It is recommended not to include this directory in version control. .pixi # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .envrc .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # Abstra # Abstra is an AI-powered process automation framework. # Ignore directories containing user credentials, local state, and settings. # Learn more at https://abstra.io/docs .abstra/ # Visual Studio Code # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore # and can be added to the global gitignore or merged into this file. However, if you prefer, # you could uncomment the following to ignore the entire vscode folder # .vscode/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Cursor # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to # exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore .cursorindexingignore # Marimo marimo/_static/ marimo/_lsp/ __marimo__/ models *.txt *.pt *.swp *.safetensors outputs/ # Backend backend/.env backend/uploads/ backend/__pycache__/ backend/**/__pycache__/ !backend/models/ !frontend/lib/ # Workspace workspace/ outputs_offloading/ demo/datasets/ tensorboard_logs/ nohup.out docs/images/vibevoice_architecture_1.svg ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to VibeVoice will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [v2.2.0] - 2026-01-09 ### Added - **Quick Generation**: Simplified voice generation workflow without project context - One-click generation with preset voice selection - Support for multiple voice prompt files - Multiple prompt voices displayed in generation history detail view - Task indicator for quick generation status - Per-item progress tracking for each generating voice - **Navigation Enhancement**: Click logo to return to home page ### Fixed - Task indicator display bugs in quick generation - Card status not updating in time - Styling consistency for deleting generation history --- ## [v2.1.0] - 2025-12-28 ### Added - **Narration Mode Editor**: New editing mode for single-speaker narration content - Support for changing narrator - Plain text editing without speaker prefixes - **Preset Voices Management**: Manage preset voice samples for quick speaker creation - **Auto Version Generation**: Frontend version automatically generated from git ### Fixed - Offloading config save issue during inference - Offloading config error - Duplicated speaker ID issue - Missing tags in Docker repository - Dockerfile dependency errors --- ## [v2.0.0] - 
2025-12-19 ### Added - **Fine-Tuning Support**: Full LoRA training workflow with real-time metrics - Training page with live progress bars and configuration options - Training metrics charts (Loss/LR/Timing) with 5-second auto-refresh - Support for layer offloading presets (Balanced/Aggressive/Extreme) - Gradient accumulation steps and checkpoint saving per epoch - TensorBoard metrics reader for training visualization - **Dataset Management**: Complete dataset CRUD operations - Dataset list and detail pages with pagination - Import/Export functionality for datasets - JSONL format for efficient line-by-line operations - Scripts for generating datasets from Mozilla Common Voice and KeSpeech - **Multi-Generation**: Batch generation with different random seeds - Generate 2-20 audio variations in a single request - Per-item progress tracking with individual audio players - Expandable history view with aggregate statistics - **LoRA Inference**: Apply trained LoRA models during voice generation - Select LoRA model from training output directory - Configurable LoRA weight (0-1] - **Unified Task API**: Single endpoint for checking any running task (inference or training) - **Preset Voice Feature**: Quick speaker creation from preset voice samples - **Audio Denoising**: Scripts for audio denoising with DeepFilter - **Dataset Processing Scripts**: - Script for ASR-SCCantDuSC (Scripted Chinese Cantonese Daily-use Speech Corpus) - Script for Mozilla Common Voice datasets ### Changed - Improved training completion UI with better status display - Enhanced training history list to display all information regardless of success/failure - Better estimated training time calculation - Increased file upload limits (500MB configurable) - Project-scoped current generation and training API endpoints ### Fixed - Training metadata update error - Generated voices having same name in batch generation - Seeds reset issue with multi-generation - CUDA resource cleanup when training finishes or 
fails - Invalid audio and voice_prompts field values in datasets.jsonl - Delete training history validation - OOM error handling with specific error messages - Various npm build errors and UI style issues ## [v1.0.0] - 2025-11-14 ### Added - **Core TTS Model**: AR + diffusion architecture for multi-speaker text-to-speech synthesis - Float8 inference support for optimized performance - Mono model file inference support - **Full-Stack Web Application**: - Next.js frontend with responsive UI - Flask backend with RESTful API - Static export for production deployment - **Project Management**: Create and manage multiple voice projects - **Speaker Voice Management**: - Upload and manage speaker voices - Voice recording directly in browser - Auto-assigned speaker names ("Speaker 1", etc.) - **Dialog Editor**: 4-panel layout for creating and editing dialog sessions - Clickable session names in generation history - Session navigation to voice editor - **Voice Generation**: - Live progress monitoring - Generation history with pagination - Audio playback and download - Task icon notification in navigation - **Layer Offloading**: VRAM optimization for GPU memory constraints - Configurable number of layers for GPU/CPU - Async transfers with ThreadPoolExecutor - Smart cache clearing for performance - **Internationalization (i18n)**: Full bilingual support - English and Chinese languages - Auto-detection via browser settings - Persistence in localStorage - **Docker Support**: - Dockerfile for containerized deployment - GPU support with nvidia-docker - **Documentation**: - Comprehensive API documentation - Architecture diagrams with Mermaid - Offloading configuration guide ### Fixed - Invisible text color in browser dark theme - Frontend project selection issues - Refresh page navigation bugs - Layout issues in various components - Scripts not starting with ID 1 - Various typos and documentation errors [v2.2.0]: https://github.com/zhao-kun/vibevoice/compare/v2.1.0...v2.2.0 
[v2.1.0]: https://github.com/zhao-kun/vibevoice/compare/v2.0.0...v2.1.0 [v2.0.0]: https://github.com/zhao-kun/vibevoice/compare/v1.0.0...v2.0.0 [v1.0.0]: https://github.com/zhao-kun/vibevoice/releases/tag/v1.0.0 ================================================ FILE: CHANGELOG_zh.md ================================================ # 更新日志 VibeVoice 的所有重要更改都将记录在此文件中。 本文档格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/), 并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。 ## [v2.2.0] - 2026-01-09 ### 新增 - **快速生成**: 无需项目上下文的简化语音生成流程 - 一键生成,支持预设音色选择 - 支持多个音色提示文件 - 生成历史详情中显示多个提示音色 - 快速生成任务状态指示器 - 每个生成语音的独立进度跟踪 - **导航增强**: 点击 Logo 返回首页 ### 修复 - 快速生成中的任务指示器显示问题 - 卡片状态未及时更新 - 删除生成历史的样式一致性问题 --- ## [v2.1.0] - 2025-12-28 ### 新增 - **旁白模式编辑器**: 单人朗读内容的新编辑模式 - 支持切换朗读者 - 无需说话人前缀的纯文本编辑 - **预设音色管理**: 管理预设音色样本,快速创建说话人 - **自动版本生成**: 前端版本号自动从 git 生成 ### 修复 - 推理过程中的 Offloading 配置保存问题 - Offloading 配置错误 - 说话人 ID 重复问题 - Docker 仓库缺少标签 - Dockerfile 依赖错误 --- ## [v2.0.0] - 2025-12-19 ### 新增 - **微调支持**: 完整的 LoRA 训练工作流,支持实时指标监控 - 训练页面,带有实时进度条和配置选项 - 训练指标图表(Loss/LR/Timing),5 秒自动刷新 - 支持层卸载预设(均衡/激进/极限) - 梯度累积步数和每轮检查点保存 - TensorBoard 指标读取器用于训练可视化 - **数据集管理**: 完整的数据集 CRUD 操作 - 数据集列表和详情页面,支持分页 - 数据集导入/导出功能 - JSONL 格式,高效逐行操作 - 从 Mozilla Common Voice 和 KeSpeech 生成数据集的脚本 - **批量生成**: 使用不同随机种子批量生成 - 单次请求生成 2-20 个音频变体 - 每个项目的进度跟踪和独立音频播放器 - 可展开的历史视图,显示汇总统计 - **LoRA 推理**: 在语音生成时应用训练好的 LoRA 模型 - 从训练输出目录选择 LoRA 模型 - 可配置的 LoRA 权重 (0-1] - **统一任务 API**: 单一接口检查任何运行中的任务(推理或训练) - **预设音色功能**: 从预设音色样本快速创建说话人 - **音频降噪**: 使用 DeepFilter 的音频降噪脚本 - **数据集处理脚本**: - ASR-SCCantDuSC(粤语日常用语语音语料库)处理脚本 - Mozilla Common Voice 数据集处理脚本 ### 变更 - 改进训练完成 UI,更好的状态显示 - 增强训练历史列表,无论成功或失败都显示所有信息 - 更准确的预估训练时间计算 - 增加文件上传限制(500MB,可配置) - 项目范围的当前生成和训练 API 端点 ### 修复 - 训练元数据更新错误 - 批量生成中生成的语音名称相同 - 批量生成时种子重置问题 - 训练完成或失败时的 CUDA 资源清理 - datasets.jsonl 中无效的音频和 voice_prompts 字段值 - 删除训练历史的验证问题 - OOM 错误处理,提供具体错误信息 - 各种 npm 构建错误和 UI 样式问题 ## [v1.0.0] - 2025-11-14 ### 新增 - **核心 TTS 模型**: AR + 扩散架构的多说话人文本转语音合成 - Float8 推理支持,优化性能 - 单体模型文件推理支持 - **全栈 
Web 应用**: - Next.js 前端,响应式 UI - Flask 后端,RESTful API - 静态导出用于生产部署 - **项目管理**: 创建和管理多个语音项目 - **说话人音色管理**: - 上传和管理说话人音色 - 浏览器内直接录音 - 自动分配说话人名称("说话人 1" 等) - **对话编辑器**: 四面板布局,创建和编辑对话会话 - 生成历史中可点击的会话名称 - 会话导航到语音编辑器 - **语音生成**: - 实时进度监控 - 生成历史,支持分页 - 音频播放和下载 - 导航栏任务图标通知 - **层卸载**: GPU 显存优化 - 可配置的 GPU/CPU 层数 - ThreadPoolExecutor 异步传输 - 智能缓存清理以提升性能 - **国际化 (i18n)**: 完整的双语支持 - 英语和中文 - 通过浏览器设置自动检测 - localStorage 持久化 - **Docker 支持**: - Dockerfile 容器化部署 - nvidia-docker GPU 支持 - **文档**: - 完整的 API 文档 - Mermaid 架构图 - 层卸载配置指南 ### 修复 - 浏览器深色主题中不可见的文字颜色 - 前端项目选择问题 - 页面刷新导航问题 - 各组件布局问题 - 脚本未从 ID 1 开始 - 各种拼写错误和文档错误 [v2.2.0]: https://github.com/zhao-kun/vibevoice/compare/v2.1.0...v2.2.0 [v2.1.0]: https://github.com/zhao-kun/vibevoice/compare/v2.0.0...v2.1.0 [v2.0.0]: https://github.com/zhao-kun/vibevoice/compare/v1.0.0...v2.0.0 [v1.0.0]: https://github.com/zhao-kun/vibevoice/releases/tag/v1.0.0 ================================================ FILE: Dockerfile ================================================ # Multi-stage Dockerfile for VibeVoice # This Dockerfile is completely self-contained and requires no local source code # All source code is cloned from GitHub during build ARG BASE_IMAGE=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 ARG WORKDIR=/workspace/zhao-kun/vibevoice ARG GITHUB_REPO=https://github.com/zhao-kun/vibevoice.git ARG GITHUB_BRANCH=main ############################################# # Stage 1: Download model from HuggingFace ############################################# FROM python:3.10-slim AS model-downloader # Install huggingface-cli RUN pip install --no-cache-dir huggingface-hub[cli] # Set working directory WORKDIR /tmp/models # Download both model variants (float8_e4m3fn and bf16) using huggingface-cli RUN hf download zhaokun/vibevoice-large \ vibevoice7b_float8_e4m3fn.safetensors \ --local-dir /tmp/models/VibeVoice-large RUN hf download zhaokun/vibevoice-large \ vibevoice7b_bf16.safetensors \ --local-dir /tmp/models/VibeVoice-large ############################################# #
Stage 2: Clone Repository and Build Frontend ############################################# FROM node:20-alpine AS source-and-frontend ARG GITHUB_REPO ARG GITHUB_BRANCH ARG CACHE_BUST=unknown # Install git RUN apk add --no-cache git # Cache bust: Force rebuild from here when CACHE_BUST changes RUN echo "Cache bust: ${CACHE_BUST}" # Clone repository (shallow clone, then unshallow to get full history for git describe) WORKDIR /build RUN git clone --depth 1 --branch ${GITHUB_BRANCH} ${GITHUB_REPO} vibevoice && \ cd /build/vibevoice && \ git fetch --unshallow origin && \ git fetch --tags origin && \ git checkout main && \ git rev-parse HEAD > backend/version.txt # Build frontend WORKDIR /build/vibevoice/frontend # Install dependencies RUN npm ci # Build frontend RUN npm run build # Verify build output RUN ls -la out/ ############################################# # Stage 3: Create Python Virtual Environment ############################################# FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS python-builder ARG WORKDIR=/workspace/zhao-kun/vibevoice ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update --allow-releaseinfo-change --yes && \ apt-get upgrade --yes && \ apt install --yes --no-install-recommends \ bash \ libgl1 \ software-properties-common \ ffmpeg \ zip \ unzip \ iputils-ping \ libtcmalloc-minimal4 \ net-tools \ vim \ p7zip-full && \ rm -rf /var/lib/apt/lists/* RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update --allow-releaseinfo-change --yes && \ apt install python3.10-dev python3.10-venv python3-pip \ build-essential git curl -y --no-install-recommends && \ ln -s /usr/bin/python3.10 /usr/bin/python && \ rm /usr/bin/python3 && \ ln -s /usr/bin/python3.10 /usr/bin/python3 && \ apt-get clean && rm -rf /var/lib/apt/lists/* && \ echo "en_US.UTF-8 UTF-8" > /etc/locale.gen # Create working directory at EXACT runtime path RUN mkdir -p ${WORKDIR} WORKDIR ${WORKDIR} # Copy source code from frontend stage COPY --from=source-and-frontend 
/build/vibevoice . # Create virtual environment at runtime path (critical for absolute paths in venv) RUN python3.10 -m venv ${WORKDIR}/venv # Upgrade pip and install dependencies RUN ${WORKDIR}/venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \ ${WORKDIR}/venv/bin/pip install --no-cache-dir . RUN rm -rf ${WORKDIR}/frontend ############################################# # Stage 4: Final Image ############################################# FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder ARG WORKDIR=/workspace/zhao-kun/vibevoice ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 RUN apt-get update --allow-releaseinfo-change --yes && \ apt-get upgrade --yes && \ apt install --yes --no-install-recommends \ bash \ libgl1 \ software-properties-common \ ffmpeg \ zip \ unzip \ iputils-ping \ libtcmalloc-minimal4 \ net-tools \ vim \ p7zip-full && \ rm -rf /var/lib/apt/lists/* RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update --allow-releaseinfo-change --yes && \ apt install python3.10-dev python3.10-venv python3-pip \ build-essential git curl -y --no-install-recommends && \ ln -s /usr/bin/python3.10 /usr/bin/python && \ rm /usr/bin/python3 && \ ln -s /usr/bin/python3.10 /usr/bin/python3 && \ apt-get clean && rm -rf /var/lib/apt/lists/* && \ echo "en_US.UTF-8 UTF-8" > /etc/locale.gen # Copy downloaded model from model-downloader stage RUN mkdir -p /tmp/models/ COPY --from=model-downloader /tmp/models/VibeVoice-large /tmp/models # Create working directory at EXACT same path as build stage RUN mkdir -p ${WORKDIR} WORKDIR ${WORKDIR} # Copy virtual environment from python-builder stage (with preserved absolute paths) COPY --from=python-builder ${WORKDIR} . 
RUN mkdir -p ${WORKDIR}/models/vibevoice/ && ln -s /tmp/models/vibevoice7b_float8_e4m3fn.safetensors ${WORKDIR}/models/vibevoice/ RUN mkdir -p ${WORKDIR}/models/vibevoice/ && ln -s /tmp/models/vibevoice7b_bf16.safetensors ${WORKDIR}/models/vibevoice/ # Copy frontend build from source-and-frontend stage RUN mkdir -p ${WORKDIR}/backend/dist COPY --from=source-and-frontend /build/vibevoice/frontend/out ${WORKDIR}/backend/dist # Create workspace directory for runtime data RUN mkdir -p ${WORKDIR}/workspace # Expose port EXPOSE 9527 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD curl -f http://localhost:9527/health || exit 1 # Use venv python explicitly (critical - do not rely on PATH) CMD ["/workspace/zhao-kun/vibevoice/venv/bin/python", "backend/run.py"] ================================================ FILE: README.md ================================================ # VibeVoiceFusion
Create and manage projects from the home page
Upload and manage voice samples for each speaker
Multi-speaker dialog editor with visual and text modes
Generation interface with parameters, live progress, and history
Generation interface with parameters, live progress, and history
从主页创建和管理项目
为每个说话人上传和管理音色样本
支持可视化和文本模式的多说话人对话编辑器
带有参数配置、实时进度和历史记录的生成界面
带有参数配置、实时进度和历史记录的生成界面