Repository: jamiepine/voicebox Branch: main Commit: d70b878b71d4 Files: 453 Total size: 3.3 MB Directory structure: gitextract_61mm52up/ ├── .agents/ │ └── skills/ │ ├── add-tts-engine/ │ │ └── SKILL.md │ ├── draft-release-notes/ │ │ └── SKILL.md │ └── release-bump/ │ └── SKILL.md ├── .biomeignore ├── .bumpversion.cfg ├── .dockerignore ├── .github/ │ └── workflows/ │ ├── build-windows.yml │ └── release.yml ├── .gitignore ├── .npmrc ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── app/ │ ├── components.json │ ├── index.html │ ├── package.json │ ├── plugins/ │ │ └── changelog.ts │ ├── src/ │ │ ├── App.tsx │ │ ├── components/ │ │ │ ├── AppFrame/ │ │ │ │ └── AppFrame.tsx │ │ │ ├── AudioPlayer/ │ │ │ │ └── AudioPlayer.tsx │ │ │ ├── AudioStudio/ │ │ │ │ └── .gitkeep │ │ │ ├── AudioTab/ │ │ │ │ └── AudioTab.tsx │ │ │ ├── Effects/ │ │ │ │ ├── EffectsChainEditor.tsx │ │ │ │ └── GenerationPicker.tsx │ │ │ ├── EffectsTab/ │ │ │ │ ├── EffectsDetail.tsx │ │ │ │ ├── EffectsList.tsx │ │ │ │ └── EffectsTab.tsx │ │ │ ├── Generation/ │ │ │ │ ├── EngineModelSelector.tsx │ │ │ │ ├── FloatingGenerateBox.tsx │ │ │ │ ├── GenerationForm.tsx │ │ │ │ └── ParalinguisticInput.tsx │ │ │ ├── History/ │ │ │ │ └── HistoryTable.tsx │ │ │ ├── MainEditor/ │ │ │ │ └── MainEditor.tsx │ │ │ ├── ModelsTab/ │ │ │ │ └── ModelsTab.tsx │ │ │ ├── ServerSettings/ │ │ │ │ ├── ConnectionForm.tsx │ │ │ │ ├── GenerationSettings.tsx │ │ │ │ ├── GpuAcceleration.tsx │ │ │ │ ├── ModelManagement.tsx │ │ │ │ ├── ModelProgress.tsx │ │ │ │ ├── ServerStatus.tsx │ │ │ │ └── UpdateStatus.tsx │ │ │ ├── ServerTab/ │ │ │ │ ├── AboutPage.tsx │ │ │ │ ├── ChangelogPage.tsx │ │ │ │ ├── GeneralPage.tsx │ │ │ │ ├── GenerationPage.tsx │ │ │ │ ├── GpuPage.tsx │ │ │ │ ├── LogsPage.tsx │ │ │ │ ├── ServerTab.tsx │ │ │ │ └── SettingRow.tsx │ │ │ ├── ShinyText.tsx │ │ │ ├── Sidebar.tsx │ │ │ ├── StoriesTab/ │ │ │ │ ├── StoriesTab.tsx │ │ │ │ ├── StoryChatItem.tsx │ │ │ │ ├── StoryContent.tsx 
│ │ │ │ ├── StoryList.tsx │ │ │ │ └── StoryTrackEditor.tsx │ │ │ ├── TitleBarDragRegion.tsx │ │ │ ├── VoiceProfiles/ │ │ │ │ ├── AudioSampleRecording.tsx │ │ │ │ ├── AudioSampleSystem.tsx │ │ │ │ ├── AudioSampleUpload.tsx │ │ │ │ ├── ProfileCard.tsx │ │ │ │ ├── ProfileForm.tsx │ │ │ │ ├── ProfileList.tsx │ │ │ │ ├── SampleList.tsx │ │ │ │ └── SampleUpload.tsx │ │ │ ├── VoicesTab/ │ │ │ │ ├── VoiceInspector.tsx │ │ │ │ └── VoicesTab.tsx │ │ │ └── ui/ │ │ │ ├── alert-dialog.tsx │ │ │ ├── badge.tsx │ │ │ ├── button.tsx │ │ │ ├── card.tsx │ │ │ ├── checkbox.tsx │ │ │ ├── circle-button.tsx │ │ │ ├── dialog.tsx │ │ │ ├── dropdown-menu.tsx │ │ │ ├── form.tsx │ │ │ ├── input.tsx │ │ │ ├── label.tsx │ │ │ ├── multi-select.tsx │ │ │ ├── popover.tsx │ │ │ ├── progress.tsx │ │ │ ├── select.tsx │ │ │ ├── separator.tsx │ │ │ ├── slider.tsx │ │ │ ├── table.tsx │ │ │ ├── tabs.tsx │ │ │ ├── textarea.tsx │ │ │ ├── toast.tsx │ │ │ ├── toaster.tsx │ │ │ ├── toggle.tsx │ │ │ └── use-toast.ts │ │ ├── global.d.ts │ │ ├── hooks/ │ │ │ ├── useAutoUpdater.ts │ │ │ └── useAutoUpdater.tsx │ │ ├── index.css │ │ ├── lib/ │ │ │ ├── api/ │ │ │ │ ├── .gitkeep │ │ │ │ ├── client.ts │ │ │ │ ├── core/ │ │ │ │ │ ├── ApiError.ts │ │ │ │ │ ├── ApiRequestOptions.ts │ │ │ │ │ ├── ApiResult.ts │ │ │ │ │ ├── CancelablePromise.ts │ │ │ │ │ ├── OpenAPI.ts │ │ │ │ │ └── request.ts │ │ │ │ ├── index.ts │ │ │ │ ├── models/ │ │ │ │ │ ├── Body_add_profile_sample_profiles__profile_id__samples_post.ts │ │ │ │ │ ├── Body_transcribe_audio_transcribe_post.ts │ │ │ │ │ ├── GenerationRequest.ts │ │ │ │ │ ├── GenerationResponse.ts │ │ │ │ │ ├── HTTPValidationError.ts │ │ │ │ │ ├── HealthResponse.ts │ │ │ │ │ ├── HistoryListResponse.ts │ │ │ │ │ ├── HistoryResponse.ts │ │ │ │ │ ├── ModelDownloadRequest.ts │ │ │ │ │ ├── ModelStatus.ts │ │ │ │ │ ├── ModelStatusListResponse.ts │ │ │ │ │ ├── ProfileSampleResponse.ts │ │ │ │ │ ├── TranscriptionResponse.ts │ │ │ │ │ ├── ValidationError.ts │ │ │ │ │ ├── VoiceProfileCreate.ts │ │ 
│ │ │ └── VoiceProfileResponse.ts │ │ │ │ ├── schemas/ │ │ │ │ │ ├── $Body_add_profile_sample_profiles__profile_id__samples_post.ts │ │ │ │ │ ├── $Body_transcribe_audio_transcribe_post.ts │ │ │ │ │ ├── $GenerationRequest.ts │ │ │ │ │ ├── $GenerationResponse.ts │ │ │ │ │ ├── $HTTPValidationError.ts │ │ │ │ │ ├── $HealthResponse.ts │ │ │ │ │ ├── $HistoryListResponse.ts │ │ │ │ │ ├── $HistoryResponse.ts │ │ │ │ │ ├── $ModelDownloadRequest.ts │ │ │ │ │ ├── $ModelStatus.ts │ │ │ │ │ ├── $ModelStatusListResponse.ts │ │ │ │ │ ├── $ProfileSampleResponse.ts │ │ │ │ │ ├── $TranscriptionResponse.ts │ │ │ │ │ ├── $ValidationError.ts │ │ │ │ │ ├── $VoiceProfileCreate.ts │ │ │ │ │ └── $VoiceProfileResponse.ts │ │ │ │ ├── services/ │ │ │ │ │ └── DefaultService.ts │ │ │ │ └── types.ts │ │ │ ├── constants/ │ │ │ │ ├── languages.ts │ │ │ │ └── ui.ts │ │ │ ├── hooks/ │ │ │ │ ├── useAudioPlayer.ts │ │ │ │ ├── useAudioRecording.ts │ │ │ │ ├── useGeneration.ts │ │ │ │ ├── useGenerationForm.ts │ │ │ │ ├── useGenerationProgress.ts │ │ │ │ ├── useHistory.ts │ │ │ │ ├── useModelDownloadToast.tsx │ │ │ │ ├── useProfiles.ts │ │ │ │ ├── useRestoreActiveTasks.tsx │ │ │ │ ├── useServer.ts │ │ │ │ ├── useStories.ts │ │ │ │ ├── useStoryPlayback.ts │ │ │ │ ├── useSystemAudioCapture.ts │ │ │ │ └── useTranscription.ts │ │ │ └── utils/ │ │ │ ├── .gitkeep │ │ │ ├── audio.ts │ │ │ ├── cn.ts │ │ │ ├── debug.ts │ │ │ ├── format.ts │ │ │ └── parseChangelog.ts │ │ ├── main.tsx │ │ ├── platform/ │ │ │ ├── PlatformContext.tsx │ │ │ └── types.ts │ │ ├── router.tsx │ │ ├── stores/ │ │ │ ├── audioChannelStore.ts │ │ │ ├── effectsStore.ts │ │ │ ├── generationStore.ts │ │ │ ├── logStore.ts │ │ │ ├── playerStore.ts │ │ │ ├── serverStore.ts │ │ │ ├── storyStore.ts │ │ │ └── uiStore.ts │ │ └── types/ │ │ └── index.ts │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts ├── backend/ │ ├── README.md │ ├── STYLE_GUIDE.md │ ├── __init__.py │ ├── app.py │ ├── backends/ │ │ ├── __init__.py │ │ ├── base.py │ │ 
├── chatterbox_backend.py │ │ ├── chatterbox_turbo_backend.py │ │ ├── hume_backend.py │ │ ├── kokoro_backend.py │ │ ├── luxtts_backend.py │ │ ├── mlx_backend.py │ │ └── pytorch_backend.py │ ├── build_binary.py │ ├── config.py │ ├── database/ │ │ ├── __init__.py │ │ ├── migrations.py │ │ ├── models.py │ │ ├── seed.py │ │ └── session.py │ ├── main.py │ ├── models.py │ ├── pyproject.toml │ ├── requirements-mlx.txt │ ├── requirements.txt │ ├── routes/ │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── channels.py │ │ ├── cuda.py │ │ ├── effects.py │ │ ├── generations.py │ │ ├── health.py │ │ ├── history.py │ │ ├── models.py │ │ ├── profiles.py │ │ ├── stories.py │ │ ├── tasks.py │ │ └── transcription.py │ ├── server.py │ ├── services/ │ │ ├── __init__.py │ │ ├── channels.py │ │ ├── cuda.py │ │ ├── effects.py │ │ ├── export_import.py │ │ ├── generation.py │ │ ├── history.py │ │ ├── profiles.py │ │ ├── stories.py │ │ ├── task_queue.py │ │ ├── transcribe.py │ │ ├── tts.py │ │ └── versions.py │ ├── tests/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_cors.py │ │ ├── test_generation_download.py │ │ ├── test_profile_duplicate_names.py │ │ ├── test_progress.py │ │ ├── test_qwen_download.py │ │ └── test_whisper_download.py │ └── utils/ │ ├── __init__.py │ ├── audio.py │ ├── cache.py │ ├── chunked_tts.py │ ├── dac_shim.py │ ├── effects.py │ ├── hf_offline_patch.py │ ├── hf_progress.py │ ├── images.py │ ├── platform_detect.py │ ├── progress.py │ └── tasks.py ├── biome.json ├── data/ │ └── .gitkeep ├── docker-compose.yml ├── docs/ │ ├── .gitignore │ ├── README.md │ ├── app/ │ │ ├── [[...slug]]/ │ │ │ ├── layout.tsx │ │ │ └── page.tsx │ │ ├── api/ │ │ │ └── search/ │ │ │ └── route.ts │ │ ├── global.css │ │ ├── layout.tsx │ │ ├── llms-full.txt/ │ │ │ └── route.ts │ │ ├── llms.mdx/ │ │ │ └── docs/ │ │ │ └── [[...slug]]/ │ │ │ └── route.ts │ │ └── og/ │ │ └── docs/ │ │ └── [...slug]/ │ │ └── route.tsx │ ├── cli.json │ ├── components/ │ │ ├── ai/ │ │ │ └── page-actions.tsx │ │ ├── 
api-page.client.tsx │ │ ├── api-page.tsx │ │ └── ui/ │ │ ├── button.tsx │ │ └── popover.tsx │ ├── content/ │ │ └── docs/ │ │ ├── README.md │ │ ├── TROUBLESHOOTING.md │ │ ├── api-reference/ │ │ │ ├── general/ │ │ │ │ ├── health_health_get.mdx │ │ │ │ ├── meta.json │ │ │ │ └── root__get.mdx │ │ │ ├── generation/ │ │ │ │ ├── generate_speech_generate_post.mdx │ │ │ │ ├── get_audio_audio__generation_id__get.mdx │ │ │ │ ├── meta.json │ │ │ │ └── transcribe_audio_transcribe_post.mdx │ │ │ ├── history/ │ │ │ │ ├── delete_generation_history__generation_id__delete.mdx │ │ │ │ ├── get_generation_history__generation_id__get.mdx │ │ │ │ ├── get_stats_history_stats_get.mdx │ │ │ │ ├── list_history_history_get.mdx │ │ │ │ └── meta.json │ │ │ ├── meta.json │ │ │ ├── models/ │ │ │ │ ├── get_model_progress_models_progress__model_name__get.mdx │ │ │ │ ├── get_model_status_models_status_get.mdx │ │ │ │ ├── load_model_models_load_post.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── trigger_model_download_models_download_post.mdx │ │ │ │ └── unload_model_models_unload_post.mdx │ │ │ └── profiles/ │ │ │ ├── add_profile_sample_profiles__profile_id__samples_post.mdx │ │ │ ├── create_profile_profiles_post.mdx │ │ │ ├── delete_profile_profiles__profile_id__delete.mdx │ │ │ ├── delete_profile_sample_profiles_samples__sample_id__delete.mdx │ │ │ ├── get_profile_profiles__profile_id__get.mdx │ │ │ ├── get_profile_samples_profiles__profile_id__samples_get.mdx │ │ │ ├── list_profiles_profiles_get.mdx │ │ │ ├── meta.json │ │ │ └── update_profile_profiles__profile_id__put.mdx │ │ ├── developer/ │ │ │ ├── architecture.mdx │ │ │ ├── audio-channels.mdx │ │ │ ├── autoupdater.mdx │ │ │ ├── building.mdx │ │ │ ├── contributing.mdx │ │ │ ├── effects-pipeline.mdx │ │ │ ├── history.mdx │ │ │ ├── meta.json │ │ │ ├── model-management.mdx │ │ │ ├── setup.mdx │ │ │ ├── stories.mdx │ │ │ ├── transcription.mdx │ │ │ ├── tts-engines.mdx │ │ │ ├── tts-generation.mdx │ │ │ └── voice-profiles.mdx │ │ ├── index.mdx │ │ ├── 
meta.json │ │ └── overview/ │ │ ├── building-stories.mdx │ │ ├── creating-voice-profiles.mdx │ │ ├── docker.mdx │ │ ├── generating-speech.mdx │ │ ├── generation-history.mdx │ │ ├── installation.mdx │ │ ├── introduction.mdx │ │ ├── meta.json │ │ ├── quick-start.mdx │ │ ├── recording-transcription.mdx │ │ ├── remote-mode.mdx │ │ ├── stories-editor.mdx │ │ ├── troubleshooting.mdx │ │ └── voice-cloning.mdx │ ├── lib/ │ │ ├── cn.ts │ │ ├── layout.shared.tsx │ │ ├── openapi.ts │ │ └── source.ts │ ├── mdx-components.tsx │ ├── next.config.mjs │ ├── notes/ │ │ ├── BACKEND_CODE_REVIEW.md │ │ ├── MIGRATION.md │ │ ├── PROJECT_STATUS.md │ │ ├── RELEASE_v0.2.0.md │ │ └── issue-pain-points.md │ ├── openapi.json │ ├── package.json │ ├── plans/ │ │ ├── API_REFACTOR_PLAN.md │ │ ├── CUDA_LIBS_ADDON.md │ │ ├── DOCKER_DEPLOYMENT.md │ │ └── OPENAI_SUPPORT.md │ ├── postcss.config.mjs │ ├── scripts/ │ │ └── generate-openapi.ts │ ├── source.config.ts │ └── tsconfig.json ├── justfile ├── landing/ │ ├── .gitignore │ ├── README.md │ ├── components.json │ ├── next.config.js │ ├── nixpacks.toml │ ├── package.json │ ├── postcss.config.js │ ├── public/ │ │ ├── audio/ │ │ │ ├── fireship.webm │ │ │ ├── jarvis.webm │ │ │ ├── linus.webm │ │ │ ├── morganfreeman.webm │ │ │ ├── samaltman.webm │ │ │ └── samjackson.webm │ │ └── voicebox-demo.webm │ ├── src/ │ │ ├── app/ │ │ │ ├── api/ │ │ │ │ ├── releases/ │ │ │ │ │ └── route.ts │ │ │ │ └── stars/ │ │ │ │ └── route.ts │ │ │ ├── download/ │ │ │ │ └── [platform]/ │ │ │ │ └── route.ts │ │ │ ├── globals.css │ │ │ ├── layout.tsx │ │ │ ├── linux-install/ │ │ │ │ └── page.tsx │ │ │ ├── og/ │ │ │ │ └── page.tsx │ │ │ └── page.tsx │ │ ├── components/ │ │ │ ├── Banner.tsx │ │ │ ├── ControlUI.tsx │ │ │ ├── DownloadSection.tsx │ │ │ ├── Features.tsx │ │ │ ├── Footer.tsx │ │ │ ├── Header.tsx │ │ │ ├── LandingAudioPlayer.tsx │ │ │ ├── Navbar.tsx │ │ │ ├── PlatformIcons.tsx │ │ │ ├── VoiceCreator.tsx │ │ │ └── ui/ │ │ │ ├── button.tsx │ │ │ ├── card.tsx │ │ │ ├── 
feature-card.tsx │ │ │ ├── hero.tsx │ │ │ ├── section.tsx │ │ │ └── separator.tsx │ │ └── lib/ │ │ ├── constants.ts │ │ ├── releases.ts │ │ └── utils.ts │ ├── tailwind.config.js │ └── tsconfig.json ├── package.json ├── requirements.txt ├── scripts/ │ ├── build-server.sh │ ├── convert-assets.sh │ ├── generate-api.sh │ ├── package_cuda.py │ ├── prepare-release.sh │ ├── setup-dev-sidecar.js │ ├── test_download_progress.py │ └── update-icons.sh ├── tauri/ │ ├── assets/ │ │ └── voicebox.icon/ │ │ └── icon.json │ ├── index.html │ ├── package.json │ ├── src/ │ │ ├── main.tsx │ │ └── platform/ │ │ ├── audio.ts │ │ ├── filesystem.ts │ │ ├── index.ts │ │ ├── lifecycle.ts │ │ ├── metadata.ts │ │ └── updater.ts │ ├── src-tauri/ │ │ ├── Cargo.toml │ │ ├── Entitlements.plist │ │ ├── Info.plist │ │ ├── build.rs │ │ ├── capabilities/ │ │ │ └── default.json │ │ ├── gen/ │ │ │ └── schemas/ │ │ │ ├── acl-manifests.json │ │ │ ├── capabilities.json │ │ │ ├── desktop-schema.json │ │ │ ├── macOS-schema.json │ │ │ └── windows-schema.json │ │ ├── icons/ │ │ │ ├── android/ │ │ │ │ ├── mipmap-anydpi-v26/ │ │ │ │ │ └── ic_launcher.xml │ │ │ │ └── values/ │ │ │ │ └── ic_launcher_background.xml │ │ │ └── icon.icns │ │ ├── src/ │ │ │ ├── audio_capture/ │ │ │ │ ├── linux.rs │ │ │ │ ├── macos.rs │ │ │ │ ├── mod.rs │ │ │ │ └── windows.rs │ │ │ ├── audio_output.rs │ │ │ ├── lib.rs │ │ │ └── main.rs │ │ ├── tauri.conf.json │ │ └── tests/ │ │ └── audio_capture_test.rs │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts └── web/ ├── index.html ├── package.json ├── src/ │ ├── main.tsx │ └── platform/ │ ├── audio.ts │ ├── filesystem.ts │ ├── index.ts │ ├── lifecycle.ts │ ├── metadata.ts │ └── updater.ts ├── tsconfig.json ├── tsconfig.node.json └── vite.config.ts ================================================ FILE CONTENTS ================================================ ================================================ FILE: .agents/skills/add-tts-engine/SKILL.md 
================================================ --- name: add-tts-engine description: Use this skill to add a new TTS engine to Voicebox. It walks through dependency research, backend implementation, frontend wiring, PyInstaller bundling, and frozen-build testing. Always start with Phase 0 (dependency audit) before writing any code. --- # Add TTS Engine ## Goal Integrate a new text-to-speech engine into Voicebox end-to-end: dependency research, backend protocol implementation, frontend UI wiring, PyInstaller bundling, and frozen-build verification. The user should only need to test the final build locally. ## Reference Doc The full phased guide lives at `docs/content/docs/developer/tts-engines.mdx`. **Read this file in its entirety before starting.** It contains: - Phase 0: Dependency research (mandatory before writing code) - Phase 1: Backend implementation (`TTSBackend` protocol) - Phase 2: Route and service integration (usually zero changes) - Phase 3: Frontend integration (5 files) - Phase 4: Dependencies (`requirements.txt`, justfile, CI, Docker) - Phase 5: PyInstaller bundling (`build_binary.py` + `server.py`) - Phase 6: Common upstream workarounds - Implementation checklist (gate between phases) ## Workflow ### 1. Read the guide ```bash # Read the full TTS engines doc cat docs/content/docs/developer/tts-engines.mdx ``` Internalize all phases, especially Phase 0 and Phase 5. The v0.2.3 release was three patch releases because Phase 0 was skipped. ### 2. Dependency research (Phase 0) Clone the model library into a temporary directory and audit it. Do NOT skip this. ```bash mkdir /tmp/engine-research && cd /tmp/engine-research git clone <repo-url> ``` Run the grep searches from Phase 0.2 in the guide against the cloned source and its transitive dependencies. Produce a written dependency audit covering: 1. PyPI vs non-PyPI packages 2. PyInstaller directives needed (`--collect-all`, `--copy-metadata`, `--hidden-import`) 3. Runtime data files that must be bundled 4. 
Native library paths that need env var overrides in frozen builds 5. Monkey-patches needed (`torch.load`, float64, MPS, HF token) 6. Sample rate 7. Model download method (`from_pretrained` vs `snapshot_download` + `from_local`) Test model loading and generation on CPU in the throwaway venv before proceeding. ### 3. Implement (Phases 1–4) Follow the guide's phases in order. Key files to modify: **Backend (Phase 1):** - Create `backend/backends/<engine>_backend.py` - Register in `backend/backends/__init__.py` (ModelConfig + TTS_ENGINES + factory) - Update regex in `backend/models.py` **Frontend (Phase 3):** - `app/src/lib/api/types.ts` — engine union type - `app/src/lib/constants/languages.ts` — ENGINE_LANGUAGES - `app/src/components/Generation/EngineModelSelector.tsx` — ENGINE_OPTIONS, ENGINE_DESCRIPTIONS - `app/src/lib/hooks/useGenerationForm.ts` — Zod schema, model-name mapping - `app/src/components/ServerSettings/ModelManagement.tsx` — MODEL_DESCRIPTIONS **Dependencies (Phase 4):** - `backend/requirements.txt` - `justfile` (setup-python, setup-python-release targets) - `.github/workflows/release.yml` - `Dockerfile` (if applicable) ### 4. PyInstaller bundling (Phase 5) Register the engine in `backend/build_binary.py`: - `--hidden-import` for the backend module and model package - `--collect-all` for packages using `inspect.getsource`, shipping data files, or native libraries - `--copy-metadata` for packages using `importlib.metadata` If the engine has native data paths, add `os.environ.setdefault()` in `backend/server.py` inside the `if getattr(sys, 'frozen', False):` block. ### 5. Verify in dev mode ```bash just dev ``` Test the full chain: model download → load → generate → voice cloning. ### 6. Use the checklist Walk through the Implementation Checklist at the bottom of `tts-engines.mdx`. Every item must be checked before handing the build to the user. ## Key Lessons (from v0.2.3) These are the most common failure modes. 
Phase 0 research catches all of them: | Pattern | Symptom in Frozen Build | Fix | |---------|------------------------|-----| | `@typechecked` / `inspect.getsource()` | "could not get source code" | `--collect-all <package>` | | Package ships pretrained model files | `FileNotFoundError` for `.pth.tar`, `.yaml` | `--collect-all <package>` | | C library with hardcoded system paths | `FileNotFoundError` for `/usr/share/...` | `--collect-all` + env var in `server.py` | | `importlib.metadata.version()` | "No package metadata found" | `--copy-metadata <package>` | | `torch.load` without `map_location` | CUDA device not available on CPU build | Monkey-patch `torch.load` | | `torch.from_numpy` on float64 data | dtype mismatch RuntimeError | Cast to `.float()` | | `token=True` in HF download calls | Auth failure without stored HF token | Use `snapshot_download(token=None)` + `from_local()` | ## Notes - The route and service layers have zero per-engine dispatch points. `main.py` requires zero changes. - The model config registry in `backends/__init__.py` handles all dispatch automatically. - Use `get_torch_device()` and `model_load_progress()` from `backends/base.py` — don't reimplement device detection or progress tracking. - Always test with a **clean HuggingFace cache** (no pre-downloaded models from dev). - Do NOT push or create a release. Hand the build to the user for local testing. ================================================ FILE: .agents/skills/draft-release-notes/SKILL.md ================================================ --- name: draft-release-notes description: Use this skill to draft or update the [Unreleased] section of CHANGELOG.md from the actual changes since the last tag. Run this at any point during development to keep a working copy of the release narrative. Does NOT bump versions or create tags. --- # Draft Release Notes ## Goal Update the `[Unreleased]` section at the top of `CHANGELOG.md` with a narrative release story based on the real changes since the last tag. 
This is a **non-destructive working copy** — run it as many times as you want during development. ## Workflow 1. **Identify the last release tag and gather changes.** ```bash LAST_TAG=$(git tag --list "v*" --sort=-v:refname | head -n 1) echo "Last tag: $LAST_TAG" ``` Then collect raw material from three sources: a. **Commit log since last tag:** ```bash git log --oneline "$LAST_TAG"..HEAD ``` b. **GitHub-generated release notes preview** (PR titles, new contributors): ```bash gh api repos/:owner/:repo/releases/generate-notes \ -f tag_name="vNEXT" \ -f target_commitish="$(git rev-parse HEAD)" \ -f previous_tag_name="$LAST_TAG" \ --jq '.body' ``` c. **Diff stat for theme analysis:** ```bash git diff --stat "$LAST_TAG"..HEAD ``` 2. **Draft the release narrative.** Write markdown for the `[Unreleased]` section following the format below. Do not include the `## [Unreleased]` heading itself — just the body content. 3. **Update CHANGELOG.md.** Replace everything between `## [Unreleased]` and the next `## [` heading with the new draft. Preserve the HTML comment header and all existing release sections below. The `[Unreleased]` section must always exist and always be the first section after the header comments. 4. **Do NOT commit, tag, or bump versions.** Just leave the file modified in the working tree. ## Release Story Format Structure the `[Unreleased]` section like this: ```markdown ## [Unreleased] ### <Theme heading> - Bullet points with specifics - Reference PRs where available: ([#123](https://github.com/jamiepine/voicebox/pull/123)) ### <Another theme> - ... ### Bug Fixes - ... ``` ### Style Guidelines - **Factual and specific.** Every claim should trace to a real commit or PR. - **Narrative over list.** Lead with paragraphs that tell the story, then support with bullets. - **Group by theme, not by commit.** Cluster related changes under descriptive headings. - **Reference PRs** where they exist, but don't fabricate them. 
- **Skip trivial chores** (typo fixes, CI tweaks) unless they're the bulk of the release. - **Match the voice of existing releases** — look at the v0.2.1 and v0.2.3 entries in CHANGELOG.md for tone reference. ## When There Are No Changes If `git log "$LAST_TAG"..HEAD` is empty, leave the `[Unreleased]` section empty (just the heading) and tell the user there's nothing to draft. ## Notes - This skill only touches the `[Unreleased]` section. It never modifies stamped release sections. - The agent can be asked to run this skill at any point — mid-feature, before a PR, or right before cutting a release. - The `release-bump` skill depends on this draft being up to date before it finalizes. ================================================ FILE: .agents/skills/release-bump/SKILL.md ================================================ --- name: release-bump description: Use this skill to finalize a release. It stamps the [Unreleased] changelog section with a version and date, runs bumpversion to update all version files, and creates the release commit and tag. Only run this when you're ready to ship. --- # Release Bump ## Goal Finalize the changelog draft, bump the version across all tracked files, and create a tagged release commit. After this skill runs, the repo has a clean release commit and tag ready to push. ## Prerequisites - `gh` CLI installed and authenticated (`gh auth status`). - `bumpversion` installed (`pip install bumpversion` or available in the project venv). - The `[Unreleased]` section of `CHANGELOG.md` should already contain the release narrative. If it's empty or stale, run the `draft-release-notes` skill first. ## Workflow 1. **Verify the working tree is clean** (except `CHANGELOG.md` which may have the draft). ```bash git status --porcelain ``` Only `CHANGELOG.md` (and optionally `.agents/` files) should be modified. If there are other uncommitted changes, stop and ask the user to commit or stash them first. 2. 
**Determine the bump level.** Ask the user if not specified: `patch`, `minor`, or `major`. Check the current version: ```bash grep '^current_version' .bumpversion.cfg ``` 3. **Stamp the changelog.** Read the current `[Unreleased]` content from `CHANGELOG.md`. Compute the new version (based on bump level and current version). Then: a. Replace the `## [Unreleased]` section body with an empty placeholder. b. Insert a new stamped section immediately after `## [Unreleased]`: ```markdown ## [Unreleased] ## [X.Y.Z] - YYYY-MM-DD <release narrative moved from the former [Unreleased] section> ``` c. Update the reference links at the bottom of the file: - Change the `[Unreleased]` link to compare against the new tag - Add a new link for the new version ```markdown [Unreleased]: https://github.com/jamiepine/voicebox/compare/vX.Y.Z...HEAD [X.Y.Z]: https://github.com/jamiepine/voicebox/compare/vPREVIOUS...vX.Y.Z ``` 4. **Stage the changelog.** ```bash git add CHANGELOG.md ``` 5. **Run bumpversion.** ```bash bumpversion --allow-dirty <patch|minor|major> ``` The `--allow-dirty` flag is needed because `CHANGELOG.md` is already staged. bumpversion will: - Update version strings in all tracked files (see `.bumpversion.cfg`) - Create a commit with message `Bump version: X.Y.Z -> A.B.C` - Create a tag `vA.B.C` The staged `CHANGELOG.md` will be included in this commit automatically. 6. **Verify results.** ```bash git show --name-only --stat HEAD git tag --list "v*" --sort=-v:refname | head -n 5 ``` Confirm the commit contains: - `CHANGELOG.md` - `.bumpversion.cfg` - `tauri/src-tauri/tauri.conf.json` - `tauri/src-tauri/Cargo.toml` - `package.json` - `app/package.json` - `tauri/package.json` - `landing/package.json` - `web/package.json` - `backend/__init__.py` Confirm the new tag exists. 7. **Do NOT push** unless the user explicitly asks. Report the tag name and suggest: ``` Ready to push. 
When you're ready: git push origin main --follow-tags ``` ## Version Calculation Reference Given current version `X.Y.Z`: - `patch` -> `X.Y.(Z+1)` - `minor` -> `X.(Y+1).0` - `major` -> `(X+1).0.0` ## Error Recovery - If bumpversion fails, the tag won't exist. Fix the issue and re-run — bumpversion is idempotent as long as the tag doesn't already exist. - If you need to undo a release commit (before pushing): `git tag -d vX.Y.Z && git reset --soft HEAD~1` - Never amend a release commit that has been pushed. ## Notes - When the tag is pushed, the release CI (`.github/workflows/release.yml`) automatically extracts the matching version section from `CHANGELOG.md` and uses it as the GitHub Release body. No manual copy-paste needed. - The release commit message is controlled by `.bumpversion.cfg` (`Bump version: X.Y.Z -> A.B.C`). Do not override it. - If you need to manually update the GitHub Release body after the fact: `gh release edit vX.Y.Z --notes-file <(sed -n '/## \[X.Y.Z\]/,/## \[/p' CHANGELOG.md | head -n -1)` ================================================ FILE: .biomeignore ================================================ # Dependencies node_modules bun.lockb # Build outputs dist target .tauri # Generated files app/src/lib/api # Config files (don't lint/format) *.config.js *.config.ts # Tailwind CSS files (contains @tailwind directives) **/index.css ================================================ FILE: .bumpversion.cfg ================================================ [bumpversion] current_version = 0.3.1 commit = True tag = True tag_name = v{new_version} tag_message = Release v{new_version} message = Bump version: {current_version} → {new_version} [bumpversion:file:tauri/src-tauri/tauri.conf.json] search = "version": "{current_version}" replace = "version": "{new_version}" [bumpversion:file:tauri/src-tauri/Cargo.toml] search = version = "{current_version}" replace = version = "{new_version}" [bumpversion:file:package.json] search = "version": 
"{current_version}" replace = "version": "{new_version}" [bumpversion:file:app/package.json] search = "version": "{current_version}" replace = "version": "{new_version}" [bumpversion:file:tauri/package.json] search = "version": "{current_version}" replace = "version": "{new_version}" [bumpversion:file:landing/package.json] search = "version": "{current_version}" replace = "version": "{new_version}" [bumpversion:file:web/package.json] search = "version": "{current_version}" replace = "version": "{new_version}" [bumpversion:file:backend/__init__.py] search = __version__ = "{current_version}" replace = __version__ = "{new_version}" ================================================ FILE: .dockerignore ================================================ # Version control .git .github .gitignore # Desktop-only (not needed in web container) tauri/ landing/ docs/ mlx-test/ scripts/ # Dependencies & build artifacts (rebuilt in Docker) node_modules/ __pycache__/ *.pyc *.pyo *.egg-info/ dist/ build/ *.spec # Data (will be bind-mounted) data/ backend/data/ # IDE & OS .vscode/ .idea/ *.swp *.swo .DS_Store Thumbs.db # Config files not needed in container biome.json .biomeignore .bumpversion.cfg .npmrc Makefile CHANGELOG.md CONTRIBUTING.md SECURITY.md LICENSE README.md backend/README.md ================================================ FILE: .github/workflows/build-windows.yml ================================================ name: Build Windows on: workflow_dispatch: jobs: build-windows: permissions: contents: write runs-on: windows-latest steps: - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.12" cache: "pip" - name: Install Python dependencies run: | python -m pip install --upgrade pip pip install pyinstaller pip install -r backend/requirements.txt - name: Build Python server shell: bash run: | cd backend python build_binary.py PLATFORM=$(rustc --print host-tuple) mkdir -p ../tauri/src-tauri/binaries cp dist/voicebox-server.exe 
../tauri/src-tauri/binaries/voicebox-server-${PLATFORM}.exe echo "Built voicebox-server-${PLATFORM}.exe" - name: Setup Bun uses: oven-sh/setup-bun@v2 - name: Install Rust stable uses: dtolnay/rust-toolchain@stable - name: Rust cache uses: swatinem/rust-cache@v2 with: workspaces: "./tauri/src-tauri -> target" - name: Install dependencies run: bun install - uses: tauri-apps/tauri-action@v0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: projectPath: tauri tagName: v__VERSION__ releaseName: "voicebox v__VERSION__ (test build)" releaseBody: "Test build for audio export fix" releaseDraft: true prerelease: true args: "" includeUpdaterJson: false ================================================ FILE: .github/workflows/release.yml ================================================ name: Release on: workflow_dispatch: push: tags: - "v*" jobs: release: permissions: contents: write strategy: fail-fast: false matrix: include: - platform: "macos-latest" args: "--target aarch64-apple-darwin" python-version: "3.12" backend: "mlx" - platform: "macos-15-intel" args: "--target x86_64-apple-darwin" python-version: "3.12" backend: "pytorch" - platform: "windows-latest" args: "" python-version: "3.12" backend: "pytorch" runs-on: ${{ matrix.platform }} steps: - uses: actions/checkout@v4 - name: Install dependencies (ubuntu only) if: contains(matrix.platform, 'ubuntu') || contains(matrix.platform, 'namespace') run: | sudo apt-get update sudo apt-get install -y libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf llvm-dev libasound2-dev - name: Install LLVM (macOS) if: matrix.platform == 'macos-latest' || matrix.platform == 'macos-15-intel' run: | brew install llvm@20 echo "$(brew --prefix llvm@20)/bin" >> $GITHUB_PATH echo "LLVM_CONFIG=$(brew --prefix llvm@20)/bin/llvm-config" >> $GITHUB_ENV - name: Setup Python uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" - name: Install CPU-only PyTorch (Linux) if: 
contains(matrix.platform, 'ubuntu') || contains(matrix.platform, 'namespace') run: | pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu - name: Install Python dependencies run: | python -m pip install --upgrade pip pip install pyinstaller pip install -r backend/requirements.txt pip install --no-deps chatterbox-tts pip install --no-deps hume-tada - name: Install MLX dependencies (Apple Silicon only) if: matrix.backend == 'mlx' run: | pip install -r backend/requirements-mlx.txt - name: Build Python server (Linux/macOS) if: matrix.platform != 'windows-latest' run: | chmod +x scripts/build-server.sh ./scripts/build-server.sh - name: Build Python server (Windows) if: matrix.platform == 'windows-latest' shell: bash run: | cd backend python build_binary.py # Get platform tuple PLATFORM=$(rustc --print host-tuple) # Create binaries directory mkdir -p ../tauri/src-tauri/binaries # Copy with platform suffix cp dist/voicebox-server.exe ../tauri/src-tauri/binaries/voicebox-server-${PLATFORM}.exe echo "Built voicebox-server-${PLATFORM}.exe" - name: Setup Bun uses: oven-sh/setup-bun@v2 - name: Install Rust stable uses: dtolnay/rust-toolchain@stable with: targets: ${{ (matrix.platform == 'macos-latest' && 'aarch64-apple-darwin') || (matrix.platform == 'macos-15-intel' && 'x86_64-apple-darwin') || '' }} - name: Rust cache uses: swatinem/rust-cache@v2 with: workspaces: "./tauri/src-tauri -> target" - name: Install dependencies run: bun install - name: Install Apple API key if: matrix.platform == 'macos-latest' || matrix.platform == 'macos-15-intel' run: | mkdir -p ~/.appstoreconnect/private_keys/ cd ~/.appstoreconnect/private_keys/ echo ${{ secrets.APPLE_API_KEY_BASE64 }} >> AuthKey_${{ secrets.APPLE_API_KEY }}.p8.base64 base64 --decode -i AuthKey_${{ secrets.APPLE_API_KEY }}.p8.base64 -o AuthKey_${{ secrets.APPLE_API_KEY }}.p8 rm AuthKey_${{ secrets.APPLE_API_KEY }}.p8.base64 - name: Install Codesigning Certificate if: matrix.platform == 'macos-latest' 
|| matrix.platform == 'macos-15-intel' uses: apple-actions/import-codesign-certs@v3 with: p12-file-base64: ${{ secrets.APPLE_CERTIFICATE }} p12-password: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }} - name: Extract release notes from CHANGELOG.md id: changelog shell: bash run: | # Get the version from the tag (strip leading 'v') VERSION="${GITHUB_REF_NAME#v}" # Extract the section for this version from CHANGELOG.md # Matches from "## [X.Y.Z]" until the next "## [" heading NOTES=$(sed -n "/^## \[${VERSION}\]/,/^## \[/{/^## \[${VERSION}\]/d;/^## \[/d;p;}" CHANGELOG.md) # Fall back to a placeholder if the version isn't in the changelog if [ -z "$(echo "$NOTES" | tr -d '[:space:]')" ]; then NOTES="See the assets below to download and install this version." fi # Use multiline output syntax { echo "notes<<EOF" echo "$NOTES" echo "EOF" } >> "$GITHUB_OUTPUT" - uses: tauri-apps/tauri-action@v0.6 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} TAURI_SIGNING_PRIVATE_KEY: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY }} TAURI_SIGNING_PRIVATE_KEY_PASSWORD: ${{ secrets.TAURI_SIGNING_PRIVATE_KEY_PASSWORD }} ENABLE_CODE_SIGNING: ${{ secrets.APPLE_CERTIFICATE }} APPLE_CERTIFICATE: ${{ secrets.APPLE_CERTIFICATE }} APPLE_CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }} APPLE_SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }} APPLE_PROVIDER_SHORT_NAME: ${{ secrets.APPLE_PROVIDER_SHORT_NAME }} APPLE_API_ISSUER: ${{ secrets.APPLE_API_ISSUER }} APPLE_API_KEY: ${{ secrets.APPLE_API_KEY }} with: projectPath: tauri tagName: v__VERSION__ releaseName: "voicebox v__VERSION__" releaseBody: ${{ steps.changelog.outputs.notes }} releaseDraft: true prerelease: false args: ${{ matrix.args }} includeUpdaterJson: true build-cuda-windows: runs-on: windows-latest permissions: contents: write steps: - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.12" cache: "pip" - name: Install Python dependencies run: | python -m pip install --upgrade pip pip install pyinstaller pip
install -r backend/requirements.txt pip install --no-deps chatterbox-tts pip install --no-deps hume-tada - name: Install PyTorch with CUDA 12.8 run: | pip install torch --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps pip install torchaudio --index-url https://download.pytorch.org/whl/cu128 --force-reinstall --no-deps - name: Verify CUDA support in torch run: | python -c "import torch; print(f'CUDA available in build: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}')" - name: Build CUDA server binary (onedir) shell: bash working-directory: backend run: python build_binary.py --cuda - name: Package into server core + CUDA libs archives shell: bash run: | python scripts/package_cuda.py \ backend/dist/voicebox-server-cuda/ \ --output release-assets/ \ --cuda-libs-version cu128-v1 \ --torch-compat ">=2.7.0,<2.11.0" - name: Upload archives to GitHub Release if: startsWith(github.ref, 'refs/tags/') uses: softprops/action-gh-release@v2 with: files: | release-assets/voicebox-server-cuda.tar.gz release-assets/voicebox-server-cuda.tar.gz.sha256 release-assets/cuda-libs-cu128-v1.tar.gz release-assets/cuda-libs-cu128-v1.tar.gz.sha256 release-assets/cuda-libs.json draft: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload onedir as workflow artifact uses: actions/upload-artifact@v4 with: name: voicebox-server-cuda-windows path: backend/dist/voicebox-server-cuda/ retention-days: 7 ================================================ FILE: .gitignore ================================================ # Dependencies node_modules/ bun.lockb __pycache__/ *.py[cod] *$py.class *.so .Python venv/ env/ ENV/ *.prompt # Build outputs dist/ build/ *.egg-info/ *.egg target/ *.app *.dmg *.exe *.msi *.deb *.AppImage # IDE .vscode/ .idea/ *.swp *.swo *~ # OS .DS_Store Thumbs.db # Data (user-generated) data/ !data/.gitkeep # Logs *.log logs/ # Environment .env .env.local # Generated files app/openapi.json 
tauri/src-tauri/binaries/* tauri/src-tauri/gen/Assets.car tauri/src-tauri/gen/voicebox.icns tauri/src-tauri/gen/partial.plist # PyInstaller *.spec # Windows artifacts nul # Temporary tmp/ temp/ *.tmp ================================================ FILE: .npmrc ================================================ # Force bun usage engine-strict=true ================================================ FILE: CHANGELOG.md ================================================ # Changelog ## [Unreleased] ## [0.3.0] - 2026-03-17 This release rewrites the backend into a modular architecture, overhauls the settings UI into routed sub-pages, fixes audio player freezing, migrates documentation to Fumadocs, and ships a batch of bug fixes targeting the most-reported issues from the tracker. The backend's 3,000-line monolith `main.py` has been decomposed into domain routers, a services layer, and a proper database package. A style guide and ruff configuration now enforce consistency. On the frontend, settings have been split into dedicated routed pages with server logs, a changelog viewer, and an about page. The audio player no longer freezes mid-playback, and model loading status is now visible in the UI. Seven user-reported bugs have been fixed, including server crashes during sample uploads, generation list staleness, cryptic error messages, and CUDA support for RTX 50-series GPUs. 
### Settings Overhaul ([#294](https://github.com/jamiepine/voicebox/pull/294)) - Split settings into routed sub-tabs: General, Generation, GPU, Logs, Changelog, About - Added live server log viewer with auto-scroll - Added in-app changelog page that parses `CHANGELOG.md` at build time - Added About page with version info, license, and generation folder quick-open - Extracted reusable `SettingRow` component for consistent setting layouts ### Audio Player Fix ([#293](https://github.com/jamiepine/voicebox/pull/293)) - Fixed audio player freezing during playback - Improved playback UX with better state management and listener cleanup - Fixed restart race condition during regeneration - Added stable keys for audio element re-rendering - Improved accessibility across player controls ### Backend Refactor ([#285](https://github.com/jamiepine/voicebox/pull/285)) - Extracted all routes from `main.py` into 13 domain routers under `backend/routes/` — `main.py` dropped from ~3,100 lines to ~10 - Moved CRUD and service modules into `backend/services/`, platform detection into `backend/utils/` - Split monolithic `database.py` into a `database/` package with separate `models`, `session`, `migrations`, and `seed` modules - Added `backend/STYLE_GUIDE.md` and `pyproject.toml` with ruff linting config - Removed dead code: unused `_get_cuda_dll_excludes`, stale `studio.py`, `example_usage.py`, old `Makefile` - Deduplicated shared logic across TTS backends into `backends/base.py` - Improved startup logging with version, platform, data directory, and database stats - Fixed startup database session leak — sessions now rollback and close in `finally` block - Isolated shutdown unload calls so one backend failure doesn't block the others - Handled null duration in `story_items` migration - Reject model migration when target is a subdirectory of source cache ### Documentation Rewrite ([#288](https://github.com/jamiepine/voicebox/pull/288)) - Migrated docs site from Mintlify to Fumadocs 
(Next.js-based) - Rewrote introduction and root page with content from README - Added "Edit on GitHub" links and last-updated timestamps on all pages - Generated OpenAPI spec and auto-generated API reference pages - Removed stale planning docs (`CUDA_BACKEND_SWAP`, `EXTERNAL_PROVIDERS`, `MLX_AUDIO`, `TTS_PROVIDER_ARCHITECTURE`, etc.) - Sidebar groups now expand by default; root redirects to `/docs` - Added OG image metadata and `/og` preview page ### UI & Frontend - Added model loading status indicator and effects preset dropdown ([3187344](https://github.com/jamiepine/voicebox/commit/3187344)) - Fixed take-label race condition during regeneration - Added accessible focus styling to select component - Softened select focus indicator opacity - Addressed 4 critical and 12 major issues from CodeRabbit review ### Bug Fixes ([#295](https://github.com/jamiepine/voicebox/pull/295)) - Fixed sample uploads crashing the server — audio decoding now runs in a thread pool instead of blocking the async event loop ([#278](https://github.com/jamiepine/voicebox/issues/278)) - Fixed generation list not updating when a generation completes — switched to `refetchQueries` for reliable cache busting, added SSE error fallback, and page reset on completion ([#231](https://github.com/jamiepine/voicebox/issues/231)) - Fixed error toasts showing `[object Object]` instead of the actual error message ([#290](https://github.com/jamiepine/voicebox/issues/290)) - Added Whisper model selection (`base`, `small`, `medium`, `large`, `turbo`) and expanded language support to the `/transcribe` endpoint ([#233](https://github.com/jamiepine/voicebox/issues/233)) - Upgraded CUDA backend build from cu121 to cu126 for RTX 50-series (Blackwell) GPU support ([#289](https://github.com/jamiepine/voicebox/issues/289)) - Handled client disconnects in SSE and streaming endpoints to suppress `[Errno 32] Broken Pipe` errors ([#248](https://github.com/jamiepine/voicebox/issues/248)) - Fixed Docker build failure from 
pip hash mismatch on Qwen3-TTS dependencies ([#286](https://github.com/jamiepine/voicebox/issues/286)) - Added 50 MB upload size limit with chunked reads to prevent unbounded memory allocation on sample uploads - Eliminated redundant double audio decode in sample processing pipeline ### Platform Fixes - Replaced `netstat` with `TcpStream` + PowerShell for Windows port detection ([#277](https://github.com/jamiepine/voicebox/pull/277)) - Fixed Docker frontend build and cleaned up Docker docs - Fixed macOS download links to use `.dmg` instead of `.app.tar.gz` - Added dynamic download redirect routes to landing site ### Release Tooling - Added `draft-release-notes` and `release-bump` agent skills - Wired CI release workflow to extract notes from `CHANGELOG.md` for GitHub Releases - Backfilled changelog with all historical releases ## [0.2.3] - 2026-03-15 The "it works in dev but not in prod" release. This version fixes a series of PyInstaller bundling issues that prevented model downloading, loading, generation, and progress tracking from working in production builds. ### Model Downloads Now Actually Work The v0.2.1/v0.2.2 builds could not download or load models that weren't already cached from a dev install. This release fixes the entire chain: - **Chatterbox, Chatterbox Turbo, and LuxTTS** all download, load, and generate correctly in bundled builds - **Real-time download progress** — byte-level progress bars now work in production. The root cause: `huggingface_hub` silently disables tqdm progress bars based on logger level, which prevented our progress tracker from receiving byte updates. We now force-enable the internal counter regardless. - **Fixed Python 3.12.0 `code.replace()` bug** — the macOS build was on Python 3.12.0, which has a [known CPython bug](https://github.com/pyinstaller/pyinstaller/issues/7992) that corrupts bytecode when PyInstaller rewrites code objects. This caused `NameError: name 'obj' is not defined` crashes during scipy/torch imports. 
Upgraded to Python 3.12.13. ### PyInstaller Fixes - Collect all `inflect` files — `typeguard`'s `@typechecked` decorator calls `inspect.getsource()` at import time, which needs `.py` source files, not just bytecode. Fixes LuxTTS "could not get source code" error. - Collect all `perth` files — bundles the pretrained watermark model (`hparams.yaml`, `.pth.tar`) needed by Chatterbox at runtime - Collect all `piper_phonemize` files — bundles `espeak-ng-data/` (phoneme tables, language dicts) needed by LuxTTS for text-to-phoneme conversion - Set `ESPEAK_DATA_PATH` in frozen builds so the espeak-ng C library finds the bundled data instead of looking at `/usr/share/espeak-ng-data/` - Collect all `linacodec` files — fixes `inspect.getsource` error in Vocos codec - Collect all `zipvoice` files — fixes source code lookup in LuxTTS voice cloning - Copy metadata for `requests`, `transformers`, `huggingface-hub`, `tokenizers`, `safetensors`, `tqdm` — fixes `importlib.metadata` lookups in frozen binary - Add hidden imports for `chatterbox`, `chatterbox_turbo`, `luxtts`, `zipvoice` backends - Add `multiprocessing.freeze_support()` to fix resource_tracker subprocess crash in frozen binary - `--noconsole` now only applied on Windows — macOS/Linux need stdout/stderr for Tauri sidecar log capture - Hardened `sys.stdout`/`sys.stderr` devnull redirect to test writability, not just `None` check ### Updater - Fixed updater artifact generation with `v1Compatible` for `tauri-action` signature files - Updated `tauri-action` to v0.6 to fix updater JSON and `.sig` generation ### Other Fixes - Full traceback logging on all backend model loading errors (was just `str(e)` before) ## [0.2.2] - 2026-03-15 - Fix Chatterbox model support in bundled builds - Fix LuxTTS/ZipVoice support in bundled builds - Auto-update CUDA binary when app version changes - CUDA download progress bar - Fix server process staying alive on macOS (SIGHUP handling, watchdog grace period) - Hide console window when running 
CUDA binary on Windows ## [0.2.1] - 2026-03-15 Voicebox v0.1.x was a single-engine voice cloning app built around Qwen3-TTS. v0.2.0 is a ground-up rethink: four TTS engines, 23 languages, paralinguistic emotion controls, a post-processing effects pipeline, unlimited generation length, an async generation queue, and support for every major GPU vendor. Plus Docker. ### New TTS Engines #### Multi-Engine Architecture Voicebox now runs **four independent TTS engines** behind a thread-safe per-engine backend registry. Switch engines per-generation from a single dropdown — no restart required. | Engine | Languages | Size | Key Strengths | | --------------------------- | --------- | ------- | --------------------------------------------- | | **Qwen3-TTS 1.7B** | 10 | ~3.5 GB | Highest quality, delivery instructions | | **Qwen3-TTS 0.6B** | 10 | ~1.2 GB | Lighter, faster variant | | **LuxTTS** | English | ~300 MB | CPU-friendly, 48 kHz output, 150x realtime | | **Chatterbox Multilingual** | 23 | ~3.2 GB | Broadest language coverage, zero-shot cloning | | **Chatterbox Turbo** | English | ~1.5 GB | 350M params, low latency, paralinguistic tags | #### Chatterbox Multilingual — 23 Languages ([#257](https://github.com/jamiepine/voicebox/pull/257)) Zero-shot voice cloning in Arabic, Chinese, Danish, Dutch, English, Finnish, French, German, Greek, Hebrew, Hindi, Italian, Japanese, Korean, Malay, Norwegian, Polish, Portuguese, Russian, Spanish, Swahili, Swedish, and Turkish. #### LuxTTS — Lightweight English TTS ([#254](https://github.com/jamiepine/voicebox/pull/254)) A fast, CPU-friendly English engine. ~300 MB download, 48 kHz output, runs at 150x realtime on CPU. #### Chatterbox Turbo — Expressive English ([#258](https://github.com/jamiepine/voicebox/pull/258)) A fast 350M-parameter English model with inline paralinguistic tags. 
#### Paralinguistic Tags Autocomplete ([#265](https://github.com/jamiepine/voicebox/pull/265)) Type `/` in the text input with Chatterbox Turbo selected to open an autocomplete for **9 expressive tags**: `[laugh]` `[chuckle]` `[gasp]` `[cough]` `[sigh]` `[groan]` `[sniff]` `[shush]` `[clear throat]` ### Generation #### Unlimited Generation Length — Auto-Chunking ([#266](https://github.com/jamiepine/voicebox/pull/266)) Long text is now automatically split at sentence boundaries, generated per-chunk, and crossfaded back together. Engine-agnostic. - Auto-chunking limit slider — 100–5,000 chars (default 800) - Crossfade slider — 0–200ms (default 50ms) - Max text length raised to 50,000 characters - Smart splitting respects abbreviations, CJK punctuation, and `[tags]` #### Asynchronous Generation Queue ([#269](https://github.com/jamiepine/voicebox/pull/269)) Generation is now fully non-blocking. Serial execution queue prevents GPU contention. Real-time SSE status streaming. #### Generation Versions Every generation now supports multiple versions with provenance tracking — original, effects versions, takes, source tracking, version pinning in stories, and favorites. ### Post-Processing Effects ([#271](https://github.com/jamiepine/voicebox/pull/271)) A full audio effects system powered by Spotify's `pedalboard` library: Pitch Shift, Reverb, Delay, Chorus/Flanger, Compressor, Gain, High-Pass Filter, Low-Pass Filter. 4 built-in presets, custom presets, per-profile default effects, and live preview. 
### Platform Support - **Windows Support** ([#272](https://github.com/jamiepine/voicebox/pull/272)) — Full Windows support with CUDA GPU detection - **Linux** ([#262](https://github.com/jamiepine/voicebox/pull/262)) — AMD ROCm, NVIDIA GBM fix, WebKitGTK mic access (build from source) - **NVIDIA CUDA Backend Swap** ([#252](https://github.com/jamiepine/voicebox/pull/252)) — Download and swap in CUDA backend from within the app - **Intel Arc (XPU) and DirectML** — PyTorch backend supports Intel Arc and DirectML - **Docker + Web Deployment** ([#161](https://github.com/jamiepine/voicebox/pull/161)) — 3-stage build, non-root runtime, health checks - **Whisper Turbo** — Added `openai/whisper-large-v3-turbo` as a transcription model option ### Model Management ([#268](https://github.com/jamiepine/voicebox/pull/268)) Per-model unload, custom models directory, model folder migration, download cancel/clear UI ([#238](https://github.com/jamiepine/voicebox/pull/238)), restructured settings UI. ### Security & Reliability - CORS hardening ([#88](https://github.com/jamiepine/voicebox/pull/88)) - Network access toggle ([#133](https://github.com/jamiepine/voicebox/pull/133)) - Offline crash fix ([#152](https://github.com/jamiepine/voicebox/pull/152)) - Atomic audio saves ([#263](https://github.com/jamiepine/voicebox/pull/263)) - Filesystem health endpoint - Chatterbox float64 dtype fix ([#264](https://github.com/jamiepine/voicebox/pull/264)) ### Accessibility ([#243](https://github.com/jamiepine/voicebox/pull/243)) Screen reader support, keyboard navigation, state-aware `aria-label` attributes on all interactive controls. 
### UI Polish - Redesigned landing page ([#274](https://github.com/jamiepine/voicebox/pull/274)) - Voices tab overhaul with inline inspector - Responsive layout improvements - Duplicate profile name validation ([#175](https://github.com/jamiepine/voicebox/pull/175)) ### Community Contributors [@haosenwang1018](https://github.com/haosenwang1018), [@Balneario-de-Cofrentes](https://github.com/Balneario-de-Cofrentes), [@ageofalgo](https://github.com/ageofalgo), [@mikeswann](https://github.com/mikeswann), [@rayl15](https://github.com/rayl15), [@mpecanha](https://github.com/mpecanha), [@ways2read](https://github.com/ways2read), [@ieguiguren](https://github.com/ieguiguren), [@Vaibhavee89](https://github.com/Vaibhavee89), [@pandego](https://github.com/pandego), [@luminest-llc](https://github.com/luminest-llc) ## [0.1.13] - 2026-02-23 ### Stability and reliability - [#95](https://github.com/jamiepine/voicebox/pull/95) Fix: selecting 0.6B model still downloads and uses 1.7B - [#93](https://github.com/jamiepine/voicebox/pull/93) fix(mlx): bundle native libs and broaden error handling for Apple Silicon - [#79](https://github.com/jamiepine/voicebox/pull/79) fix: handle non-ASCII filenames in Content-Disposition headers - [#78](https://github.com/jamiepine/voicebox/pull/78) fix: guard getUserMedia call against undefined mediaDevices in non-secure contexts - [#77](https://github.com/jamiepine/voicebox/pull/77) fix: await for confirmation before deleting voices and channels - [#128](https://github.com/jamiepine/voicebox/pull/128) fix: resolve multiple issues (#96, #119, #111, #108, #121, #125, #127) - [#40](https://github.com/jamiepine/voicebox/pull/40) Fix: audio export path resolution ### Build and packaging - [#122](https://github.com/jamiepine/voicebox/pull/122) fix(web): add @tailwindcss/vite plugin to web config - [#126](https://github.com/jamiepine/voicebox/pull/126) Create requirements.txt ### UX and docs - [#44](https://github.com/jamiepine/voicebox/pull/44) Enhances 
floating generate box UX - [#57](https://github.com/jamiepine/voicebox/pull/57) chore: updates repo URL in README - [#146](https://github.com/jamiepine/voicebox/pull/146) Add Spacebot banner to landing page - [#1](https://github.com/jamiepine/voicebox/pull/1) Improvements ## [0.1.12] - 2026-01-31 ### Model Download UX Overhaul - Real-time download progress tracking with accurate percentage and speed info - No more downloading notifications during generation even when it's not downloading - Better error handling and status reporting throughout the download process ### Other Improvements - Enhanced health check endpoint with GPU type information - Improved model caching verification - More reliable SSE progress updates - Actual update notifications — no need to manually check in settings anymore ## [0.1.11] - 2026-01-30 - Fixed transcriptions on MLX - Fixed model download progress (finally) ## [0.1.10] - 2026-01-30 ### Faster generation on Apple Silicon Massive speed gains, from around 20s per generation to 2-3s. Added native MLX backend support for Apple Silicon, providing significantly faster TTS and STT generation on M-series macOS machines.
- **MLX Backend** — New backend implementation optimized for Apple Silicon using MLX framework - **Dynamic Backend Selection** — Automatically detects platform and selects between MLX (macOS) and PyTorch (other platforms) - Refactored TTS and STT logic into modular backend implementations - Updated build process to include MLX-specific dependencies for macOS builds ## [0.1.9] - 2026-01-30 ### Improved voice profile creation flow - Voice create drafts: No longer lose work if you close the modal - Fixed whisper only transcribing English or Chinese, now has support for all languages ### Improved Stories editor - Added spacebar for play/pause - Timeline now auto-scrolls to follow playhead during playback - Fixed misalignment of the items with mouse when picking up - Fixed hitbox for selecting an item - Fixed playhead jumping forward when pressing play ### Generation box improvements - Instruct mode no longer wipes prompt text - Improved UI cleanliness ### Misc - Fixed "Model downloading" toast during generation when model is already downloaded ## [0.1.8] - 2026-01-29 ### Model Download Timeout Issues Fixed critical issue where model downloads would fail with "Failed to fetch" errors on Windows. Refactored download endpoints to return immediately and continue downloads in background. ### Cross-Platform Cache Path Issues Fixed hardcoded `~/.cache/huggingface/hub` paths that don't work on Windows. All cache paths now use `hf_constants.HF_HUB_CACHE` for proper cross-platform support. 
### Windows Process Management - Added `/shutdown` endpoint for graceful server shutdown on Windows - Added `gpu_type` field to health check response ## [0.1.7] - 2026-01-29 - Trim and split audio clips in Story Editor - Auto-activation of stories in Story Editor with visible playhead - Conditional auto-play support in AudioPlayer for better user control - Refactored audio loading across HistoryTable, SampleList, and generation forms - Audio now only auto-plays when explicitly intended, preventing unexpected playback ## [0.1.6] - 2026-01-29 ### Introducing Stories A full voice editor for composing podcasts and generated conversations. - **Stories Editor** — Create multi-voice narratives, podcasts, or conversations with a timeline-based editor - Compose tracks with different voices - Edit and arrange audio segments inline - Build generated conversations with multiple participants - **Improved Voice Generation UI** — Auto-resizing input, default voice selection, better layout - **Track Editor Integration** — Inline track editing within story items ## [0.1.5] - 2026-01-28 Fixed recording length limit at 0:29 to auto stop instead of passing the limit and getting an error, which would cause users to lose their recording. 
## [0.1.4] - 2026-01-28 - Audio channel management system - Native audio playback handling in AudioPlayer component - Refactored ConnectionForm and Checkbox components - Improved layout consistency and responsiveness - Added safe area constants for better responsive design ## [0.1.3] - 2026-01-27 - Improved the generate textbox - Maybe fixed Windows autoupdate restarting entire computer ## [0.1.2] - 2026-01-27 ### Audio Capture & Format Conversion - Added audio format conversion util - Enhanced system audio capture on macOS and Windows - Improved audio recording hooks - Added audio input entitlement for macOS - Added audio capture tests ### Update System - Enhanced auto-updater functionality and update status display ## [0.1.1] - 2026-01-27 ### Platform Support - **macOS Audio Capture** — Native audio capture support for sample creation - **Windows Audio Capture** — WASAPI implementation with improved thread safety - **Linux Support** — Temporarily removed builds due to runner disk space constraints ### Audio Features - Play/pause for audio samples across all components - Three new sample components: Recording, System capture, Upload with drag-and-drop - Audio validation, error handling, and consistent cleanup ### Voice Profile Management - Profile import with file size validation (100MB limit) - Enhanced profile form with new audio sample components - Drag-and-drop support for audio file uploads ### Server Management - Changed default URL from `localhost:8000` to `127.0.0.1:17493` - Server reuse logic, "keep server running" preference, orphaned process handling ### Build & Release - Added `.bumpversion.cfg` for automated version management - Enhanced icon generation script for multi-size Windows icons ### Bug Fixes - Fixed date formatting for timezone-less date strings - Fixed getLatestRelease file filtering - Improved audio duration metadata on Windows ## [0.1.0] - 2026-01-27 The first public release of Voicebox — an open-source voice synthesis studio powered by 
Qwen3-TTS. ### Voice Cloning with Qwen3-TTS - Automatic model download from HuggingFace - Multiple model sizes (1.7B and 0.6B) - Voice prompt caching for instant regeneration - English and Chinese support ### Voice Profile Management - Create profiles from audio files or record directly in the app - Multiple samples per profile for higher quality cloning - Import/Export profiles - Automatic transcription via Whisper ### Speech Generation - Simple text-to-speech with profile selection - Seed control for reproducible generations - Long-form support up to 5,000 characters ### Generation History - Full history with metadata - Search by text content - Inline playback and download ### Flexible Deployment - Local mode with bundled backend - Remote mode for GPU servers on your network - One-click server setup ### Desktop Experience - Built with Tauri v2 (Rust) — native performance, not Electron - Cross-platform: macOS and Windows - No Python installation required ### Tech Stack Tauri v2, React, TypeScript, Tailwind CSS, FastAPI, Qwen3-TTS, Whisper, SQLite [Unreleased]: https://github.com/jamiepine/voicebox/compare/v0.3.0...HEAD [0.3.0]: https://github.com/jamiepine/voicebox/compare/v0.2.3...v0.3.0 [0.2.3]: https://github.com/jamiepine/voicebox/compare/v0.2.2...v0.2.3 [0.2.2]: https://github.com/jamiepine/voicebox/compare/v0.2.1...v0.2.2 [0.2.1]: https://github.com/jamiepine/voicebox/compare/v0.1.13...v0.2.1 [0.1.13]: https://github.com/jamiepine/voicebox/compare/v0.1.12...v0.1.13 [0.1.12]: https://github.com/jamiepine/voicebox/compare/v0.1.11...v0.1.12 [0.1.11]: https://github.com/jamiepine/voicebox/compare/v0.1.10...v0.1.11 [0.1.10]: https://github.com/jamiepine/voicebox/compare/v0.1.9...v0.1.10 [0.1.9]: https://github.com/jamiepine/voicebox/compare/v0.1.8...v0.1.9 [0.1.8]: https://github.com/jamiepine/voicebox/compare/v0.1.7...v0.1.8 [0.1.7]: https://github.com/jamiepine/voicebox/compare/v0.1.6...v0.1.7 [0.1.6]: https://github.com/jamiepine/voicebox/compare/v0.1.5...v0.1.6 [0.1.5]:
https://github.com/jamiepine/voicebox/compare/v0.1.4...v0.1.5 [0.1.4]: https://github.com/jamiepine/voicebox/compare/v0.1.3...v0.1.4 [0.1.3]: https://github.com/jamiepine/voicebox/compare/v0.1.2...v0.1.3 [0.1.2]: https://github.com/jamiepine/voicebox/compare/v0.1.1...v0.1.2 [0.1.1]: https://github.com/jamiepine/voicebox/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/jamiepine/voicebox/releases/tag/v0.1.0 ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Voicebox Thank you for your interest in contributing to Voicebox! This document provides guidelines and instructions for contributing. ## Code of Conduct - Be respectful and inclusive - Welcome newcomers and help them learn - Focus on constructive feedback - Respect different viewpoints and experiences ## Getting Started ### Prerequisites - **[Bun](https://bun.sh)** - Fast JavaScript runtime and package manager ```bash curl -fsSL https://bun.sh/install | bash ``` - **[Python 3.11+](https://python.org)** - For backend development ```bash python --version # Should be 3.11 or higher ``` - **[Rust](https://rustup.rs)** - For Tauri desktop app (installed automatically by Tauri CLI) ```bash rustc --version # Check if installed ``` - **[Tauri Prerequisites](https://v2.tauri.app/start/prerequisites)** - Tauri-specific system dependencies (varies by OS). 
- **Git** - Version control ### Development Setup Install [just](https://github.com/casey/just) (`brew install just`, `cargo install just`, or `winget install Casey.Just`), then: ```bash git clone https://github.com/YOUR_USERNAME/voicebox.git cd voicebox just setup # creates venv, installs Python + JS deps just dev # starts backend + desktop app ``` `just setup` handles everything automatically, including: - Creating a Python virtual environment - Installing Python dependencies (with CUDA PyTorch on Windows if an NVIDIA GPU is detected) - Installing MLX dependencies on Apple Silicon - Installing JavaScript dependencies `just dev` starts the backend and desktop app together. If a backend is already running (e.g. from `just dev-backend` in another terminal), it detects it and only starts the frontend. Other useful commands: ```bash just dev-web # backend + web app (no Tauri/Rust build) just dev-backend # backend only just dev-frontend # Tauri app only (backend must be running) just kill # stop all dev processes just clean-all # nuke everything and start fresh just --list # see all available commands ``` > **Note:** In dev mode, the app connects to a manually-started Python server. > The bundled server binary is only used in production builds. #### Windows Notes The justfile works natively on Windows via PowerShell. No WSL or Git Bash required. On Windows with an NVIDIA GPU, `just setup` automatically installs CUDA-enabled PyTorch for GPU acceleration. ### Model Downloads Models are automatically downloaded from HuggingFace Hub on first use: - **Whisper** (transcription): Auto-downloads on first transcription - **Qwen3-TTS** (voice cloning): Auto-downloads on first generation (~2-4GB) First-time usage will be slower due to model downloads, but subsequent runs will use cached models. 
### Building **Build production app:** ```bash just build # Build CPU server binary + Tauri installer ``` On Windows, to build with CUDA support for local testing: ```bash just build-local # Build CPU + CUDA server binaries + Tauri installer ``` This builds the CPU sidecar (bundled with the app), the CUDA binary (placed in `%APPDATA%/com.voicebox.app/backends/` for runtime GPU switching), and the installable Tauri app. Creates platform-specific installers (`.dmg`, `.msi`, `.AppImage`) in `tauri/src-tauri/target/release/bundle/`. **Individual build targets:** ```bash just build-server # CPU server binary only just build-server-cuda # CUDA server binary only (Windows) just build-tauri # Tauri desktop app only just build-web # Web app only ``` **Building with local Qwen3-TTS development version:** If you're actively developing or modifying the Qwen3-TTS library, set the `QWEN_TTS_PATH` environment variable to point to your local clone: ```bash export QWEN_TTS_PATH=~/path/to/your/Qwen3-TTS just build-server ``` This makes PyInstaller use your local qwen-tts version instead of the pip-installed package. ### Generate OpenAPI Client After starting the backend server: ```bash ./scripts/generate-api.sh ``` This downloads the OpenAPI schema and generates the TypeScript client in `app/src/lib/api/` ### Convert Assets to Web Formats To optimize images and videos for the web, run: ```bash bun run convert:assets ``` This script: - Converts PNG → WebP (better compression, same quality) - Converts MOV → WebM (VP9 codec, smaller file size) - Processes files in `landing/public/` and `docs/public/` - **Deletes original files** after successful conversion **Requirements:** Install `webp` and `ffmpeg`: ```bash brew install webp ffmpeg ``` > **Note:** Run this before committing new images or videos to keep the repository size small. ## Development Workflow ### 1. Create a Branch ```bash git checkout -b feature/your-feature-name # or git checkout -b fix/your-bug-fix ``` ### 2. 
Make Your Changes - Write clean, readable code - Follow existing code style - Add comments for complex logic - Update documentation as needed ### 3. Test Your Changes - Test manually in the app - Ensure backend API endpoints work - Check for TypeScript/Python errors - Verify UI components render correctly ### 4. Commit Your Changes Write clear, descriptive commit messages: ```bash git commit -m "Add feature: voice profile export" git commit -m "Fix: audio playback stops after 30 seconds" ``` ### 5. Push and Create Pull Request ```bash git push origin feature/your-feature-name ``` Then create a pull request on GitHub with: - Clear description of changes - Screenshots (for UI changes) - Reference to related issues ## Code Style ### TypeScript/React - Use TypeScript strict mode - Follow React best practices - Use functional components with hooks - Prefer named exports - Format with Biome (runs automatically) ```typescript // Good export function ProfileCard({ profile }: { profile: Profile }) { return
{profile.name}
; } // Avoid export const ProfileCard = (props) => { ... } ``` ### Python - Follow PEP 8 style guide - Use type hints - Use async/await for I/O operations - Format with Black (if configured) ```python # Good async def create_profile(name: str, language: str) -> Profile: """Create a new voice profile.""" ... # Avoid def create_profile(name, language): ... ``` ### Rust - Follow Rust conventions - Use meaningful variable names - Handle errors explicitly - Format with `rustfmt` ## Project Structure ``` voicebox/ ├── app/ # Shared React frontend │ └── src/ │ ├── components/ # UI components │ ├── lib/ # Utilities and API client │ └── hooks/ # React hooks ├── backend/ # Python FastAPI server │ ├── main.py # API routes │ ├── tts.py # Voice synthesis │ └── ... ├── tauri/ # Desktop app wrapper │ └── src-tauri/ # Rust backend └── scripts/ # Build scripts ``` ## Areas for Contribution ### 🐛 Bug Fixes - Check existing issues for bugs to fix - Test your fix thoroughly - Add tests if possible ### ✨ New Features - Check the roadmap in README.md - Discuss major features in an issue first - Keep features focused and well-scoped ### 📚 Documentation - Improve README clarity - Add code comments - Write API documentation - Create tutorials or guides ### 🎨 UI/UX Improvements - Improve accessibility - Enhance visual design - Optimize performance - Add animations/transitions ### 🔧 Infrastructure - Improve build process - Add CI/CD improvements - Optimize bundle size - Add testing infrastructure ## API Development When adding new API endpoints: 1. **Add route in `backend/main.py`** 2. **Create Pydantic models in `backend/models.py`** 3. **Implement business logic in appropriate module** 4. **Update OpenAPI schema** (automatic with FastAPI) 5. **Regenerate TypeScript client:** ```bash bun run generate:api ``` 6. **Update `backend/README.md`** with endpoint documentation ## Testing Currently, testing is primarily manual. 
When adding tests: - **Backend**: Use pytest for Python tests - **Frontend**: Use Vitest for React component tests - **E2E**: Use Playwright for end-to-end tests (future) ## Pull Request Process 1. **Update documentation** if needed 2. **Ensure code follows style guidelines** 3. **Test your changes thoroughly** 4. **Update CHANGELOG.md** with your changes 5. **Request review** from maintainers ### PR Checklist - [ ] Code follows style guidelines - [ ] Documentation updated - [ ] Changes tested - [ ] No breaking changes (or documented) - [ ] CHANGELOG.md updated ## Release Process Releases are managed by maintainers: 1. **Bump version using bumpversion:** ```bash # Install bumpversion (if not already installed) pip install bumpversion # Bump patch version (0.1.0 -> 0.1.1) bumpversion patch # Or bump minor version (0.1.0 -> 0.2.0) bumpversion minor # Or bump major version (0.1.0 -> 1.0.0) bumpversion major ``` This automatically: - Updates version numbers in all files (`tauri.conf.json`, `Cargo.toml`, all `package.json` files, `backend/main.py`) - Creates a git commit with the version bump - Creates a git tag (e.g., `v0.1.1`, `v0.2.0`) 2. **Update CHANGELOG.md** with release notes 3. **Push commits and tags:** ```bash git push git push --tags ``` 4. **GitHub Actions builds and releases** automatically when tags are pushed ## Troubleshooting See [docs/TROUBLESHOOTING.md](docs/TROUBLESHOOTING.md) for common issues and solutions. **Quick fixes:** - **Backend won't start:** Check Python version (3.11+), ensure venv is activated, install dependencies - **Tauri build fails:** Ensure Rust is installed, clean build with `cd tauri/src-tauri && cargo clean` - **OpenAPI client generation fails:** Ensure backend is running, check `curl http://localhost:17493/openapi.json` ## Questions? 
- Open an issue for bugs or feature requests - Check existing issues and discussions - Review the codebase to understand patterns - See [docs/TROUBLESHOOTING.md](docs/TROUBLESHOOTING.md) for common issues ## Additional Resources - [README.md](README.md) - Project overview - [backend/README.md](backend/README.md) - API documentation - [docs/AUTOUPDATER_QUICKSTART.md](docs/AUTOUPDATER_QUICKSTART.md) - Auto-updater setup - [SECURITY.md](SECURITY.md) - Security policy - [CHANGELOG.md](CHANGELOG.md) - Version history ## License By contributing, you agree that your contributions will be licensed under the MIT License. --- Thank you for contributing to Voicebox! 🎉 ================================================ FILE: Dockerfile ================================================ # ============================================================ # Voicebox — Local TTS Server with Web UI (CPU) # 3-stage build: Frontend → Python deps → Runtime # ============================================================ # === Stage 1: Build frontend === FROM oven/bun:1 AS frontend WORKDIR /build # Copy workspace config and frontend source COPY package.json bun.lock ./ COPY app/ ./app/ COPY web/ ./web/ # Strip workspaces not needed for web build, and fix trailing comma RUN sed -i '/"tauri"/d; /"landing"/d' package.json && \ sed -i -z 's/,\n ]/\n ]/' package.json RUN bun install --no-save # Build frontend (skip tsc — upstream has pre-existing type errors) RUN cd web && bunx --bun vite build # === Stage 2: Build Python dependencies === FROM python:3.11-slim AS backend-builder WORKDIR /build RUN apt-get update && apt-get install -y --no-install-recommends \ git \ build-essential \ && rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir --upgrade pip COPY backend/requirements.txt . 
RUN pip install --no-cache-dir --prefix=/install -r requirements.txt RUN pip install --no-cache-dir --prefix=/install --no-deps chatterbox-tts RUN pip install --no-cache-dir --prefix=/install --no-deps hume-tada RUN pip install --no-cache-dir --prefix=/install \ git+https://github.com/QwenLM/Qwen3-TTS.git # === Stage 3: Runtime === FROM python:3.11-slim # Create non-root user for security RUN groupadd -r voicebox && \ useradd -r -g voicebox -m -s /bin/bash voicebox WORKDIR /app # Install only runtime system dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ ffmpeg \ curl \ && rm -rf /var/lib/apt/lists/* # Copy installed Python packages from builder stage COPY --from=backend-builder /install /usr/local # Copy backend application code COPY --chown=voicebox:voicebox backend/ /app/backend/ # Copy built frontend from frontend stage COPY --from=frontend --chown=voicebox:voicebox /build/web/dist /app/frontend/ # Create data directories owned by non-root user RUN mkdir -p /app/data/generations /app/data/profiles /app/data/cache \ && chown -R voicebox:voicebox /app/data # Switch to non-root user USER voicebox # Expose the API port EXPOSE 17493 # Health check — auto-restart if the server hangs HEALTHCHECK --interval=30s --timeout=10s --retries=3 --start-period=60s \ CMD curl -f http://localhost:17493/health || exit 1 # Start the FastAPI server CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "17493"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2026 Voicebox Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

Voicebox

Voicebox

The open-source voice synthesis studio.
Clone voices. Generate speech. Apply effects. Build voice-powered apps.
All running locally on your machine.

Downloads Release Stars License

voicebox.sh • Docs • Download • Features • API


Voicebox App Screenshot

Click the image above to watch the demo video on voicebox.sh


Voicebox Screenshot 2

Voicebox Screenshot 3


## What is Voicebox? Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio, generate speech in 23 languages across 5 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. - **Complete privacy** — models and voice data stay on your machine - **5 TTS engines** — Qwen3-TTS, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, and HumeAI TADA - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo - **Unlimited length** — auto-chunking with crossfade for scripts, articles, and chapters - **Stories editor** — multi-track timeline for conversations, podcasts, and narratives - **API-first** — REST API for integrating voice synthesis into your own projects - **Native performance** — built with Tauri (Rust), not Electron - **Runs everywhere** — macOS (MLX/Metal), Windows (CUDA), Linux, AMD ROCm, Intel Arc, Docker --- ## Download | Platform | Download | | --------------------- | ------------------------------------------------------ | | macOS (Apple Silicon) | [Download DMG](https://voicebox.sh/download/mac-arm) | | macOS (Intel) | [Download DMG](https://voicebox.sh/download/mac-intel) | | Windows | [Download MSI](https://voicebox.sh/download/windows) | | Docker | `docker compose up` | > **[View all binaries →](https://github.com/jamiepine/voicebox/releases/latest)** > **Linux** — Pre-built binaries are not yet available. See [voicebox.sh/linux-install](https://voicebox.sh/linux-install) for build-from-source instructions. 
--- ## Features ### Multi-Engine Voice Cloning Five TTS engines with different strengths, switchable per-generation: | Engine | Languages | Strengths | | --------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------------------------- | | **Qwen3-TTS** (0.6B / 1.7B) | 10 | High-quality multilingual cloning, delivery instructions ("speak slowly", "whisper") | | **LuxTTS** | English | Lightweight (~1GB VRAM), 48kHz output, 150x realtime on CPU | | **Chatterbox Multilingual** | 23 | Broadest language coverage — Arabic, Danish, Finnish, Greek, Hebrew, Hindi, Malay, Norwegian, Polish, Swahili, Swedish, Turkish and more | | **Chatterbox Turbo** | English | Fast 350M model with paralinguistic emotion/sound tags | | **TADA** (1B / 3B) | 10 | HumeAI speech-language model — 700s+ coherent audio, text-acoustic dual alignment | ### Emotions & Paralinguistic Tags Type `/` in the text input to insert expressive tags that the model synthesizes inline with speech (Chatterbox Turbo): `[laugh]` `[chuckle]` `[gasp]` `[cough]` `[sigh]` `[groan]` `[sniff]` `[shush]` `[clear throat]` ### Post-Processing Effects 8 audio effects powered by Spotify's `pedalboard` library. Apply after generation, preview in real time, build reusable presets. | Effect | Description | | ---------------- | --------------------------------------------- | | Pitch Shift | Up or down by up to 12 semitones | | Reverb | Configurable room size, damping, wet/dry mix | | Delay | Echo with adjustable time, feedback, and mix | | Chorus / Flanger | Modulated delay for metallic or lush textures | | Compressor | Dynamic range compression | | Gain | Volume adjustment (-40 to +40 dB) | | High-Pass Filter | Remove low frequencies | | Low-Pass Filter | Remove high frequencies | Ships with 4 built-in presets (Robotic, Radio, Echo Chamber, Deep Voice) and supports custom presets. Effects can be assigned per-profile as defaults. 
### Unlimited Generation Length Text is automatically split at sentence boundaries and each chunk is generated independently, then crossfaded together. Works with all engines. - Configurable auto-chunking limit (100–5,000 chars) - Crossfade slider (0–200ms) for smooth transitions - Max text length: 50,000 characters - Smart splitting respects abbreviations, CJK punctuation, and `[tags]` ### Generation Versions Every generation supports multiple versions with provenance tracking: - **Original** — clean TTS output, always preserved - **Effects versions** — apply different effects chains from any source version - **Takes** — regenerate with a new seed for variation - **Source tracking** — each version records its lineage - **Favorites** — star generations for quick access ### Async Generation Queue Generation is non-blocking. Submit and immediately start typing the next one. - Serial execution queue prevents GPU contention - Real-time SSE status streaming - Failed generations can be retried - Stale generations from crashes auto-recover on startup ### Voice Profile Management - Create profiles from audio files or record directly in-app - Import/export profiles to share or back up - Multi-sample support for higher quality cloning - Per-profile default effects chains - Organize with descriptions and language tags ### Stories Editor Multi-voice timeline editor for conversations, podcasts, and narratives. 
- Multi-track composition with drag-and-drop - Inline audio trimming and splitting - Auto-playback with synchronized playhead - Version pinning per track clip ### Recording & Transcription - In-app recording with waveform visualization - System audio capture (macOS and Windows) - Automatic transcription powered by Whisper (including Whisper Turbo) - Export recordings in multiple formats ### Model Management - Per-model unload to free GPU memory without deleting downloads - Custom models directory via `VOICEBOX_MODELS_DIR` - Model folder migration with progress tracking - Download cancel/clear UI ### GPU Support | Platform | Backend | Notes | | ------------------------ | -------------- | ---------------------------------------------- | | macOS (Apple Silicon) | MLX (Metal) | 4-5x faster via Neural Engine | | Windows / Linux (NVIDIA) | PyTorch (CUDA) | Auto-downloads CUDA binary from within the app | | Linux (AMD) | PyTorch (ROCm) | Auto-configures HSA_OVERRIDE_GFX_VERSION | | Windows (any GPU) | DirectML | Universal Windows GPU support | | Intel Arc | IPEX/XPU | Intel discrete GPU acceleration | | Any | CPU | Works everywhere, just slower | --- ## API Voicebox exposes a full REST API for integrating voice synthesis into your own apps. ```bash # Generate speech curl -X POST http://localhost:17493/generate \ -H "Content-Type: application/json" \ -d '{"text": "Hello world", "profile_id": "abc123", "language": "en"}' # List voice profiles curl http://localhost:17493/profiles # Create a profile curl -X POST http://localhost:17493/profiles \ -H "Content-Type: application/json" \ -d '{"name": "My Voice", "language": "en"}' ``` **Use cases:** game dialogue, podcast production, accessibility tools, voice assistants, content automation. Full API documentation available at `http://localhost:17493/docs`. 
--- ## Tech Stack | Layer | Technology | | ------------- | ------------------------------------------------- | | Desktop App | Tauri (Rust) | | Frontend | React, TypeScript, Tailwind CSS | | State | Zustand, React Query | | Backend | FastAPI (Python) | | TTS Engines | Qwen3-TTS, LuxTTS, Chatterbox, Chatterbox Turbo, TADA | | Effects | Pedalboard (Spotify) | | Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | | Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | | Database | SQLite | | Audio | WaveSurfer.js, librosa | --- ## Roadmap | Feature | Description | | ----------------------- | ---------------------------------------------- | | **Real-time Streaming** | Stream audio as it generates, word by word | | **Voice Design** | Create new voices from text descriptions | | **More Models** | XTTS, Bark, and other open-source voice models | | **Plugin Architecture** | Extend with custom models and effects | | **Mobile Companion** | Control Voicebox from your phone | --- ## Development See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed setup and contribution guidelines. ### Quick Start ```bash git clone https://github.com/jamiepine/voicebox.git cd voicebox just setup # creates Python venv, installs all deps just dev # starts backend + desktop app ``` Install [just](https://github.com/casey/just): `brew install just` or `cargo install just`. Run `just --list` to see all commands. **Prerequisites:** [Bun](https://bun.sh), [Rust](https://rustup.rs), [Python 3.11+](https://python.org), [Tauri Prerequisites](https://v2.tauri.app/start/prerequisites/), and [Xcode](https://developer.apple.com/xcode/) on macOS. ### Building Locally ```bash just build # Build CPU server binary + Tauri app just build-local # (Windows) Build CPU + CUDA server binaries + Tauri app ``` ### Adding New Voice Models The multi-engine architecture makes adding new TTS engines straightforward. 
A [step-by-step guide](docs/content/docs/developer/tts-engines.mdx) covers the full process: dependency research, backend protocol implementation, frontend wiring, and PyInstaller bundling. The guide is optimized for AI coding agents. An [agent skill](.agents/skills/add-tts-engine/SKILL.md) can pick up a model name and handle the entire integration autonomously — you just test the build locally. ### Project Structure ``` voicebox/ ├── app/ # Shared React frontend ├── tauri/ # Desktop app (Tauri + Rust) ├── web/ # Web deployment ├── backend/ # Python FastAPI server ├── landing/ # Marketing website └── scripts/ # Build & release scripts ``` --- ## Contributing Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. 1. Fork the repo 2. Create a feature branch 3. Make your changes 4. Submit a PR ## Security Found a security vulnerability? Please report it responsibly. See [SECURITY.md](SECURITY.md) for details. --- ## License MIT License — see [LICENSE](LICENSE) for details. ---

voicebox.sh

================================================ FILE: SECURITY.md ================================================ # Security Policy ## Supported Versions We release patches for security vulnerabilities. Which versions are eligible for receiving such patches depends on the CVSS v3.0 Rating: | Version | Supported | | ------- | ------------------ | | 0.1.x | :white_check_mark: | | < 0.1 | :x: | ## Reporting a Vulnerability If you discover a security vulnerability, please report it responsibly: 1. **Do not** open a public GitHub issue 2. Email security details to: [security@voicebox.sh](mailto:security@voicebox.sh) 3. Include: - Description of the vulnerability - Steps to reproduce - Potential impact - Suggested fix (if any) We will: - Acknowledge receipt within 48 hours - Provide a timeline for addressing the issue - Keep you informed of progress - Credit you in the security advisory (if desired) ## Security Best Practices ### For Users - **Keep Voicebox updated** - Updates include security patches - **Verify downloads** - Only download from official releases - **Local processing** - Voice data stays on your machine - **Network security** - Use HTTPS when connecting to remote servers ### For Developers - **Dependencies** - Keep all dependencies up to date - **Code review** - All PRs require review before merging - **Secrets** - Never commit API keys or signing keys - **Signing** - All releases are cryptographically signed ## Known Security Considerations ### Local Processing Voicebox processes all audio locally by default. Your voice data never leaves your machine unless you explicitly enable remote server mode. 
### Remote Server Mode When connecting to a remote server: - Ensure the server is on a trusted network - Use HTTPS for remote connections - Verify server identity before connecting ### Auto-Updates - Updates are cryptographically signed - Signature verification happens before installation - Only HTTPS endpoints are allowed ### Python Server The embedded Python server: - Runs locally by default (localhost only) - Can be configured for remote access - Uses standard FastAPI security practices ## Disclosure Timeline - **Day 0**: Vulnerability reported - **Day 1-2**: Initial assessment and acknowledgment - **Day 3-7**: Investigation and fix development - **Day 8-14**: Testing and release preparation - **Day 15+**: Public disclosure (if applicable) Timeline may vary based on severity and complexity. ## Security Updates Security updates will be: - Released as patch versions (e.g., 0.1.1) - Documented in CHANGELOG.md - Announced via GitHub releases - Automatically delivered via auto-updater --- Thank you for helping keep Voicebox secure! 🔒 ================================================ FILE: app/components.json ================================================ { "$schema": "https://ui.shadcn.com/schema.json", "style": "new-york", "rsc": false, "tsx": true, "tailwind": { "config": "tailwind.config.js", "css": "src/index.css", "baseColor": "slate", "cssVariables": true, "prefix": "" }, "aliases": { "components": "@/components", "utils": "@/lib/utils", "ui": "@/components/ui", "lib": "@/lib", "hooks": "@/lib/hooks" } } ================================================ FILE: app/index.html ================================================ voicebox
================================================ FILE: app/package.json ================================================ { "name": "@voicebox/app", "version": "0.3.1", "private": true, "type": "module", "scripts": { "dev": "vite", "build": "vite build", "preview": "vite preview", "lint": "biome lint src", "lint:fix": "biome lint --write src", "format": "biome format --write src", "check": "biome check --write src" }, "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", "@dnd-kit/utilities": "^3.2.2", "@hookform/resolvers": "^3.9.0", "@radix-ui/react-alert-dialog": "^1.1.1", "@radix-ui/react-avatar": "^1.1.0", "@radix-ui/react-dialog": "^1.1.1", "@radix-ui/react-dropdown-menu": "^2.1.1", "@radix-ui/react-label": "^2.1.0", "@radix-ui/react-popover": "^1.1.1", "@radix-ui/react-progress": "^1.1.0", "@radix-ui/react-scroll-area": "^1.1.0", "@radix-ui/react-select": "^2.1.1", "@radix-ui/react-separator": "^1.1.0", "@radix-ui/react-slider": "^1.3.6", "@radix-ui/react-slot": "^1.1.0", "@radix-ui/react-tabs": "^1.1.0", "@radix-ui/react-toast": "^1.2.1", "@tanstack/react-query": "^5.0.0", "@tanstack/react-query-devtools": "^5.0.0", "@tanstack/react-router": "^1.157.16", "@tauri-apps/api": "^2.0.0", "@tauri-apps/plugin-dialog": "^2.0.0", "@tauri-apps/plugin-fs": "^2.0.0", "@tauri-apps/plugin-process": "^2.3.1", "@tauri-apps/plugin-updater": "^2.9.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.1", "date-fns": "^3.6.0", "framer-motion": "^12.29.0", "lucide-react": "^0.454.0", "motion": "^12.29.0", "react": "^18.3.0", "react-dom": "^18.3.0", "react-hook-form": "^7.53.0", "react-sound-visualizer": "^1.4.0", "tailwind-merge": "^2.5.4", "wavesurfer.js": "^7.0.0", "zod": "^3.23.8", "zustand": "^4.5.0" }, "devDependencies": { "@tailwindcss/vite": "^4.1.18", "@types/react": "^18.3.0", "@types/react-dom": "^18.3.0", "@vitejs/plugin-react": "^4.3.0", "tailwindcss": "^4.1.0", "typescript": "^5.6.0", "vite": "^5.4.0" } } 
================================================ FILE: app/plugins/changelog.ts ================================================
import { readFileSync } from 'node:fs';
import path from 'node:path';
import type { Plugin } from 'vite';

/**
 * Vite plugin that serves the repository's CHANGELOG.md as the virtual
 * module `virtual:changelog`.
 *
 * The module's default export is the raw file text (read as UTF-8 each time
 * the virtual module is loaded).
 *
 * @param repoRoot - Directory that contains `CHANGELOG.md`.
 * @returns A Vite plugin object named `changelog`.
 */
export function changelogPlugin(repoRoot: string): Plugin {
  const publicId = 'virtual:changelog';
  // The resolved id is the public id prefixed with a NUL character.
  const internalId = `\0${publicId}`;
  const sourceFile = path.resolve(repoRoot, 'CHANGELOG.md');

  return {
    name: 'changelog',

    // Claim only the virtual id; every other id falls through (undefined).
    resolveId(id) {
      return id === publicId ? internalId : undefined;
    },

    // Emit the changelog text as a JS module for the resolved virtual id.
    load(id) {
      if (id !== internalId) {
        return undefined;
      }
      const markdown = readFileSync(sourceFile, 'utf-8');
      return `export default ${JSON.stringify(markdown)};`;
    },
  };
}
================================================ FILE: app/src/App.tsx ================================================ import { RouterProvider } from '@tanstack/react-router'; import { useEffect, useRef, useState } from 'react'; import voiceboxLogo from '@/assets/voicebox-logo.png'; import ShinyText from '@/components/ShinyText'; import { TitleBarDragRegion } from '@/components/TitleBarDragRegion'; import { useAutoUpdater } from '@/hooks/useAutoUpdater'; import { TOP_SAFE_AREA_PADDING } from '@/lib/constants/ui'; import { cn } from '@/lib/utils/cn'; import { usePlatform } from '@/platform/PlatformContext'; import { router } from '@/router'; import { useLogStore } from '@/stores/logStore'; import { useServerStore } from '@/stores/serverStore'; const LOADING_MESSAGES = [ 'Warming up tensors...', 'Calibrating synthesizer engine...', 'Initializing voice models...', 'Loading neural networks...', 'Preparing audio pipelines...', 'Optimizing waveform generators...', 'Tuning frequency analyzers...', 'Building voice embeddings...', 'Configuring text-to-speech cores...', 'Syncing audio buffers...', 'Establishing model connections...', 'Preprocessing training data...', 'Validating voice samples...', 'Compiling inference engines...', 'Mapping phoneme
sequences...', 'Aligning prosody parameters...', 'Activating speech synthesis...', 'Fine-tuning acoustic models...', 'Preparing voice cloning matrices...', 'Initializing Qwen TTS framework...', ]; /** Root application component. Under Tauri it returns a loading screen until `serverReady` becomes true (cycling `loadingMessageIndex` every 3 seconds meanwhile); on web it marks the server ready immediately. */ function App() { const platform = usePlatform(); const [serverReady, setServerReady] = useState(false); const [loadingMessageIndex, setLoadingMessageIndex] = useState(0); const serverStartingRef = useRef(false); // Automatically check for app updates on startup and show toast notifications useAutoUpdater({ checkOnMount: true, showToast: true }); // Sync stored setting to Rust on startup useEffect(() => { if (platform.metadata.isTauri) { const keepRunning = useServerStore.getState().keepServerRunningOnClose; platform.lifecycle.setKeepServerRunning(keepRunning).catch((error) => { console.error('Failed to sync initial setting to Rust:', error); }); } // Deps are platform values from context (assumed stable), so this effectively runs once // eslint-disable-next-line react-hooks/exhaustive-deps }, [platform.metadata.isTauri, platform.lifecycle]); // Setup lifecycle callbacks useEffect(() => { platform.lifecycle.onServerReady = () => { setServerReady(true); }; // Deps are platform values from context (assumed stable), so this effectively runs once // eslint-disable-next-line react-hooks/exhaustive-deps }, [platform.lifecycle]); // Subscribe to server logs useEffect(() => { const unsubscribe = platform.lifecycle.subscribeToServerLogs((entry) => { useLogStore.getState().addEntry(entry); }); return unsubscribe; }, [platform.lifecycle]); // Setup window close handler and auto-start server when running in Tauri (production only) useEffect(() => { if (!platform.metadata.isTauri) { setServerReady(true); // Web assumes server is running return; } // Setup window close handler to check setting and stop server if needed // This works in both dev and prod, but will only stop server if it was started by the app platform.lifecycle.setupWindowCloseHandler().catch((error) => { console.error('Failed to setup window close 
handler:', error); }); // Only auto-start server in production mode // In dev mode, user runs server separately if (!import.meta.env?.PROD) { console.log('Dev mode: Skipping auto-start of server (run it separately)'); setServerReady(true); // Mark as ready so UI doesn't show loading screen // Mark that server was not started by app (so we don't try to stop it on close) // @ts-expect-error - adding property to window window.__voiceboxServerStartedByApp = false; return; } // Auto-start server in production if (serverStartingRef.current) { return; } serverStartingRef.current = true; const isRemote = useServerStore.getState().mode === 'remote'; const customModelsDir = useServerStore.getState().customModelsDir; console.log(`Production mode: Starting bundled server... (remote: ${isRemote})`); platform.lifecycle .startServer(isRemote, customModelsDir) .then((serverUrl) => { console.log('Server is ready at:', serverUrl); // Update the server URL in the store with the dynamically assigned port useServerStore.getState().setServerUrl(serverUrl); setServerReady(true); // Mark that we started the server (so we know to stop it on close) // @ts-expect-error - adding property to window window.__voiceboxServerStartedByApp = true; }) .catch((error) => { console.error('Failed to auto-start server:', error); /* NOTE(review): serverReady is not set on this path, so the loading screen stays up unless onServerReady fires later — confirm this is intended. */ serverStartingRef.current = false; // @ts-expect-error - adding property to window window.__voiceboxServerStartedByApp = false; }); // Cleanup: only clears the in-progress start guard; the server itself is not stopped here // Note: Window close is handled separately in Tauri Rust code return () => { // Window close event handles server shutdown based on setting serverStartingRef.current = false; }; // Deps are platform values from context (assumed stable), so this effectively runs once // eslint-disable-next-line react-hooks/exhaustive-deps }, [platform.metadata.isTauri, platform.lifecycle]); // Cycle through loading messages every 3 seconds useEffect(() => { if (!platform.metadata.isTauri || serverReady) { return; } 
const interval = setInterval(() => { setLoadingMessageIndex((prev) => (prev + 1) % LOADING_MESSAGES.length); }, 3000); return () => clearInterval(interval); }, [serverReady, platform.metadata.isTauri]); // Show loading screen while server is starting in Tauri if (platform.metadata.isTauri && !serverReady) { return (
Voicebox
); } return ; } export default App; ================================================ FILE: app/src/components/AppFrame/AppFrame.tsx ================================================ import { useRouterState } from '@tanstack/react-router'; import { TitleBarDragRegion } from '@/components/TitleBarDragRegion'; import { AudioPlayer } from '@/components/AudioPlayer/AudioPlayer'; import { StoryTrackEditor } from '@/components/StoriesTab/StoryTrackEditor'; import { TOP_SAFE_AREA_PADDING } from '@/lib/constants/ui'; import { cn } from '@/lib/utils/cn'; import { useStoryStore } from '@/stores/storyStore'; import { useStory } from '@/lib/hooks/useStories'; interface AppFrameProps { children: React.ReactNode; } /** Layout frame around `children`. Computes `showTrackEditor` (stories route, a selected story, and at least one story item) and branches its output on it — presumably StoryTrackEditor vs AudioPlayer, but the JSX is not visible here; verify. */ export function AppFrame({ children }: AppFrameProps) { const routerState = useRouterState(); const isStoriesRoute = routerState.location.pathname === '/stories'; const selectedStoryId = useStoryStore((state) => state.selectedStoryId); const { data: story } = useStory(selectedStoryId); // Show track editor when on stories route with a selected story that has items const showTrackEditor = isStoriesRoute && selectedStoryId && story && story.items.length > 0; return (
{children} {showTrackEditor ? ( ) : ( )}
); } ================================================ FILE: app/src/components/AudioPlayer/AudioPlayer.tsx ================================================ import { useQuery } from '@tanstack/react-query'; import { Pause, Play, Repeat, Volume2, VolumeX, X } from 'lucide-react'; import { useEffect, useId, useMemo, useRef, useState } from 'react'; import WaveSurfer from 'wavesurfer.js'; import { Button } from '@/components/ui/button'; import { Slider } from '@/components/ui/slider'; import { apiClient } from '@/lib/api/client'; import { formatAudioDuration } from '@/lib/utils/audio'; import { debug } from '@/lib/utils/debug'; import { usePlatform } from '@/platform/PlatformContext'; import { usePlayerStore } from '@/stores/playerStore'; export function AudioPlayer() { const platform = usePlatform(); const volumeLabelId = useId(); const { audioUrl, audioId, profileId, isPlaying, currentTime, duration, volume, isLooping, shouldRestart, setIsPlaying, setCurrentTime, setDuration, setVolume, toggleLoop, clearRestartFlag, reset, } = usePlayerStore(); // Check if profile has assigned channels (for native audio routing) const { data: profileChannels } = useQuery({ queryKey: ['profile-channels', profileId], queryFn: () => { if (!profileId) return { channel_ids: [] }; return apiClient.getProfileChannels(profileId); }, enabled: !!profileId && platform.metadata.isTauri, }); const { data: channels } = useQuery({ queryKey: ['channels'], queryFn: () => apiClient.listChannels(), enabled: !!profileChannels && profileChannels.channel_ids.length > 0, }); // Determine if we should use native playback const useNativePlayback = useMemo(() => { if (!platform.metadata.isTauri || !profileChannels || !channels) { return false; } const assignedChannels = channels.filter((ch) => profileChannels.channel_ids.includes(ch.id)); // Use native playback if any assigned channel has non-default devices const shouldUseNative = assignedChannels.some( (ch) => ch.device_ids.length > 0 && !ch.is_default, ); 
return shouldUseNative;
  }, [profileChannels, channels, platform.metadata.isTauri]);

  // Container element handed to WaveSurfer, and the single WaveSurfer instance mounted in it.
  const waveformRef = useRef(null);
  const wavesurferRef = useRef(null);
  // True while a wavesurfer.load() call is in flight.
  const loadingRef = useRef(false);
  // Bookkeeping for detecting a change of audioId between renders.
  const previousAudioIdRef = useRef(null);
  const hasInitializedRef = useRef(false);
  // Set once audio has been handed to platform.audio.playToDevices (WaveSurfer is then muted).
  const isUsingNativePlaybackRef = useRef(false);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState(null);
  // Flips to true once the WaveSurfer instance exists, which unblocks the load effect below.
  const [wsReady, setWsReady] = useState(false);

  // Create WaveSurfer once when the player becomes visible (audioUrl is set).
  // This instance is reused for all subsequent audio loads - never destroyed until unmount.
  //
  // Creation is deferred to the next animation frame and then retried every 50ms until the
  // container is mounted and has a non-zero, visible box. Both the frame and the retry timer
  // are cancelled on cleanup; otherwise a stale retry loop keeps polling after unmount (or
  // after audioUrl changes) and can create a second WaveSurfer instance that nothing destroys.
  useEffect(() => {
    if (!audioUrl) return;
    if (wavesurferRef.current) return; // already created

    let cancelled = false;
    let retryTimer: ReturnType<typeof setTimeout> | undefined;

    const initWaveSurfer = () => {
      // This effect run was cleaned up, or another run already created the instance.
      if (cancelled || wavesurferRef.current) return;

      const container = waveformRef.current;
      if (!container) {
        retryTimer = setTimeout(initWaveSurfer, 50);
        return;
      }

      const rect = container.getBoundingClientRect();
      const style = window.getComputedStyle(container);
      const isVisible =
        rect.width > 0 &&
        rect.height > 0 &&
        style.display !== 'none' &&
        style.visibility !== 'hidden';

      if (!isVisible) {
        retryTimer = setTimeout(initWaveSurfer, 50);
        return;
      }

      debug.log('Creating WaveSurfer instance', {
        width: rect.width,
        height: rect.height,
      });

      try {
        const root = document.documentElement;
        // Theme colours are stored as bare HSL components in CSS variables; wrap them in hsl().
        const getCSSVar = (varName: string) => {
          const value = getComputedStyle(root).getPropertyValue(varName).trim();
          return value ? `hsl(${value})` : '';
        };

        const wavesurfer = WaveSurfer.create({
          container,
          waveColor: getCSSVar('--muted'),
          progressColor: getCSSVar('--accent'),
          cursorColor: getCSSVar('--accent'),
          cursorWidth: 3,
          barWidth: 2,
          barRadius: 2,
          height: 80,
          normalize: true,
          interact: true,
          dragToSeek: { debounceTime: 0 },
          mediaControls: false,
          backend: 'WebAudio',
        });

        // Wire up event handlers (these persist for the lifetime of the instance)
        wavesurfer.on('timeupdate', (time) => {
          // Read duration/loop from the store at event time; this closure outlives renders.
          const dur = usePlayerStore.getState().duration;
          if (dur > 0 && time >= dur) {
            setCurrentTime(dur);
            const loop = usePlayerStore.getState().isLooping;
            if (loop) {
              wavesurfer.seekTo(0);
              wavesurfer.play().catch((err) => debug.error('Loop play failed:', err));
            } else {
              wavesurfer.pause();
              setIsPlaying(false);
            }
            return;
          }
          setCurrentTime(time);
        });

        wavesurfer.on('ready', () => {
          const dur = wavesurfer.getDuration();
          setDuration(dur);
          loadingRef.current = false;
          setIsLoading(false);
          setError(null);
          debug.log('Audio ready, duration:', dur);

          wavesurfer.setVolume(usePlayerStore.getState().volume);
          wavesurfer.setMuted(false);

          // Auto-play if the flag is set (story mode advance or explicit play)
          const shouldAutoPlayNow = usePlayerStore.getState().shouldAutoPlay;
          if (shouldAutoPlayNow) {
            usePlayerStore.getState().clearAutoPlayFlag();
            wavesurfer.play().catch((err) => {
              debug.error('Failed to autoplay:', err);
            });
          } else {
            debug.log('Skipping auto-play - shouldAutoPlay is false');
          }
        });

        wavesurfer.on('play', () => setIsPlaying(true));
        wavesurfer.on('pause', () => {
          setIsPlaying(false);
          setCurrentTime(wavesurfer.getCurrentTime());
        });
        wavesurfer.on('seeking', (time) => setCurrentTime(time));

        // Mute audio during drag-to-seek to prevent popping from the WebAudio
        // backend's hard stop/start cycle on each seek. Unmute with a short
        // fade-in when the drag ends.
        const seekMedia = wavesurfer.getMediaElement() as any;
        const seekGain: GainNode | null = seekMedia?.getGainNode?.() ?? null;
        if (seekGain) {
          const ctx = seekGain.context as AudioContext;
          wavesurfer.on('dragstart', () => {
            seekGain.gain.cancelScheduledValues(ctx.currentTime);
            seekGain.gain.setTargetAtTime(0, ctx.currentTime, 0.002);
          });
          wavesurfer.on('dragend', () => {
            seekGain.gain.cancelScheduledValues(ctx.currentTime);
            seekGain.gain.setTargetAtTime(1, ctx.currentTime, 0.01);
          });
        }

        wavesurfer.on('finish', () => {
          const loop = usePlayerStore.getState().isLooping;
          if (loop) {
            wavesurfer.seekTo(0);
            wavesurfer.play().catch((err) => debug.error('Loop play failed:', err));
          } else {
            setIsPlaying(false);
            const onFinish = usePlayerStore.getState().onFinish;
            if (onFinish) onFinish();
          }
        });

        wavesurfer.on('error', (err) => {
          debug.error('WaveSurfer error:', err);
          setIsLoading(false);
          setError(`Audio error: ${err instanceof Error ? err.message : String(err)}`);
        });

        wavesurfer.on('loading', (percent) => {
          setIsLoading(true);
          if (percent === 100) setIsLoading(false);
        });

        wavesurferRef.current = wavesurfer;
        setWsReady(true);
        debug.log('WaveSurfer created successfully');
      } catch (err) {
        debug.error('Failed to create WaveSurfer:', err);
        setError(
          `Failed to initialize waveform: ${err instanceof Error ? err.message : String(err)}`,
        );
      }
    };

    const rafId = requestAnimationFrame(() => {
      initWaveSurfer();
    });

    return () => {
      cancelled = true;
      cancelAnimationFrame(rafId);
      if (retryTimer !== undefined) clearTimeout(retryTimer);
    };
    // Only run on mount-like conditions. audioUrl is here so we create the instance
    // when the player first appears, but we guard against re-creation above.
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [audioUrl, setIsPlaying, setDuration, setCurrentTime]);

  // Destroy WaveSurfer only on unmount
  useEffect(() => {
    return () => {
      if (wavesurferRef.current) {
        debug.log('Destroying WaveSurfer instance (unmount)');
        try {
          wavesurferRef.current.destroy();
        } catch (err) {
          debug.error('Error destroying WaveSurfer:', err);
        }
        wavesurferRef.current = null;
        setWsReady(false);
      }
    };
  }, []);

  // Load audio when URL changes (reuses the existing WaveSurfer instance)
  useEffect(() => {
    const wavesurfer = wavesurferRef.current;
    if (!wavesurfer || !wsReady) return;

    if (!audioUrl) {
      // No audio - pause and reset
      wavesurfer.pause();
      wavesurfer.seekTo(0);
      loadingRef.current = false;
      setIsLoading(false);
      setDuration(0);
      setCurrentTime(0);
      setError(null);
      isUsingNativePlaybackRef.current = false;
      return;
    }

    // Reset native playback state
    isUsingNativePlaybackRef.current = false;
    wavesurfer.setMuted(false);
    wavesurfer.setVolume(usePlayerStore.getState().volume);

    // Stop current playback and reset position before loading new audio.
    // With the WebAudio backend, pause() accumulates playedDuration internally.
    // seekTo(0) resets it so the new track starts from the beginning.
    debug.log('Loading new audio URL:', audioUrl);
    try {
      if (wavesurfer.isPlaying()) {
        wavesurfer.pause();
      }
      wavesurfer.seekTo(0);
    } catch (err) {
      debug.error('Error resetting before load:', err);
    }

    loadingRef.current = true;
    setIsLoading(true);
    setError(null);
    setCurrentTime(0);
    setDuration(0);

    wavesurfer
      .load(audioUrl)
      .then(() => {
        debug.log('Audio loaded into WaveSurfer');
        loadingRef.current = false;
      })
      .catch((err) => {
        debug.error('Failed to load audio:', err);
        loadingRef.current = false;
        setIsLoading(false);
        setError(`Failed to load audio: ${err instanceof Error ? err.message : String(err)}`);
      });
  }, [audioUrl, wsReady, setCurrentTime, setDuration]);

  // Sync play/pause state (only when user clicks play/pause button, not auto-sync)
  // This effect is kept for external state changes but should be minimal
  useEffect(() => {
    if (!wavesurferRef.current || duration === 0) return;

    if (isPlaying && wavesurferRef.current.isPlaying() === false) {
      wavesurferRef.current.play().catch((error) => {
        debug.error('Failed to play:', error);
        setIsPlaying(false);
        setError(`Playback error: ${error instanceof Error ? error.message : String(error)}`);
      });
    } else if (!isPlaying && wavesurferRef.current.isPlaying()) {
      wavesurferRef.current.pause();
    }
  }, [isPlaying, setIsPlaying, duration]);

  // Sync volume
  useEffect(() => {
    if (wavesurferRef.current) {
      wavesurferRef.current.setVolume(volume);
    }
  }, [volume]);

  // Mark as initialized when audio is ready, reset when audioId changes
  useEffect(() => {
    if (duration > 0 && audioId) {
      hasInitializedRef.current = true;
    }

    // Reset initialization flag when audioId changes to a new audio
    if (audioId !== previousAudioIdRef.current && previousAudioIdRef.current !== null) {
      hasInitializedRef.current = false;
    }

    if (audioId !== null) {
      previousAudioIdRef.current = audioId;
    }
  }, [duration, audioId]);

  // Handle restart flag - when history item is clicked again, restart from beginning
  useEffect(() => {
    const wavesurfer = wavesurferRef.current;
    if (!wavesurfer || !shouldRestart || duration === 0) {
      return;
    }

    debug.log('Restarting current audio from beginning');
    wavesurfer.seekTo(0);
    wavesurfer.play().catch((error) => {
      debug.error('Failed to play after restart:', error);
      setIsPlaying(false);
      setError(`Playback error: ${error instanceof Error ? error.message : String(error)}`);
    });
    clearRestartFlag();
  }, [shouldRestart, duration, setIsPlaying, clearRestartFlag]);

  // Auto-play is handled exclusively in the WaveSurfer 'ready' event handler.
// A separate effect here would race with the ready event since the WebAudio
  // backend needs to fully decode the audio before play() works correctly.

  // Spacebar to play/pause (capture phase so it fires before focused elements)
  useEffect(() => {
    const onKeyDown = (e: KeyboardEvent) => {
      if (e.code !== 'Space') return;
      // Ignore if user is typing in an input/textarea
      const tag = (e.target as HTMLElement)?.tagName;
      if (tag === 'INPUT' || tag === 'TEXTAREA' || (e.target as HTMLElement)?.isContentEditable) {
        return;
      }
      if (audioUrl && duration > 0 && wavesurferRef.current) {
        e.preventDefault();
        e.stopPropagation();
        if (wavesurferRef.current.isPlaying()) {
          wavesurferRef.current.pause();
        } else {
          wavesurferRef.current.play().catch((err) => debug.error('Spacebar play failed:', err));
        }
      }
    };
    document.addEventListener('keydown', onKeyDown, true);
    return () => document.removeEventListener('keydown', onKeyDown, true);
  }, [audioUrl, duration]);

  /**
   * Toggle playback from the play/pause button.
   *
   * When the profile routes to non-default devices (useNativePlayback), the audio bytes are
   * fetched and handed to platform.audio.playToDevices while WaveSurfer runs muted purely to
   * drive the waveform. Any failure on that path (including a non-2xx fetch response) falls
   * back to ordinary WaveSurfer playback.
   */
  const handlePlayPause = async () => {
    // Standard WaveSurfer playback (works for both normal and native playback modes)
    // When using native playback, WaveSurfer is muted but still controls visualization
    if (!wavesurferRef.current) {
      debug.error('WaveSurfer not initialized');
      return;
    }

    // Check if audio is loaded
    if (duration === 0 && !isLoading) {
      debug.error('Audio not loaded yet');
      setError('Audio not loaded. Please wait...');
      return;
    }

    // If using native playback
    if (useNativePlayback && audioUrl && profileChannels && channels) {
      if (isPlaying) {
        // Pause: stop native playback and pause WaveSurfer visualization
        try {
          platform.audio.stopPlayback();
          debug.log('Stopped native audio playback');
        } catch (error) {
          debug.error('Failed to stop native playback:', error);
        }
        wavesurferRef.current.pause();
        return;
      }

      // Play: trigger native playback
      try {
        // Stop any existing native playback first
        try {
          platform.audio.stopPlayback();
        } catch (_error) {
          // Ignore errors when stopping (might not be playing)
          debug.log('No existing playback to stop');
        }

        // Collect all device IDs from assigned channels
        const assignedChannels = channels.filter((ch) =>
          profileChannels.channel_ids.includes(ch.id),
        );
        const deviceIds = assignedChannels.flatMap((ch) => ch.device_ids);

        if (deviceIds.length > 0) {
          // Fetch audio data
          const response = await fetch(audioUrl);
          // fetch() only rejects on network failure; an HTTP error still resolves. Without
          // this check the error body would be sent to the audio device as if it were audio.
          if (!response.ok) {
            throw new Error(`Failed to fetch audio: ${response.status} ${response.statusText}`);
          }
          const audioData = new Uint8Array(await response.arrayBuffer());

          // Play via native audio
          await platform.audio.playToDevices(audioData, deviceIds);

          // Mark that we're using native playback
          isUsingNativePlaybackRef.current = true;

          // Mute WaveSurfer and start it for visualization
          wavesurferRef.current.setVolume(0);
          wavesurferRef.current.setMuted(true);

          // Start WaveSurfer for visualization (muted)
          wavesurferRef.current.play().catch((error) => {
            debug.error('Failed to start WaveSurfer visualization:', error);
            setIsPlaying(false);
            setError(`Playback error: ${error instanceof Error ? error.message : String(error)}`);
          });
          return;
        }
      } catch (error) {
        debug.error('Native playback failed, falling back to WaveSurfer:', error);
        // Fall through to WaveSurfer playback
        isUsingNativePlaybackRef.current = false;
      }
    }

    // Standard WaveSurfer playback (or fallback from native playback failure)
    if (wavesurferRef.current.isPlaying()) {
      wavesurferRef.current.pause();
    } else {
      // Ensure WaveSurfer is not muted if not using native playback
      if (!isUsingNativePlaybackRef.current) {
        wavesurferRef.current.setMuted(false);
        wavesurferRef.current.setVolume(volume);
      }
      wavesurferRef.current.play().catch((error) => {
        debug.error('Failed to play:', error);
        setIsPlaying(false);
        setError(`Playback error: ${error instanceof Error ? error.message : String(error)}`);
      });
    }
  };

  // Slider reports a 0-100 position; WaveSurfer.seekTo takes a 0-1 fraction.
  const handleSeek = (value: number[]) => {
    if (!wavesurferRef.current || duration === 0) return;
    const progress = value[0] / 100;
    wavesurferRef.current.seekTo(progress);
  };

  // Slider reports 0-100; the store keeps volume as 0-1.
  const handleVolumeChange = (value: number[]) => {
    setVolume(value[0] / 100);
  };

  // Close button: stop every playback path, rewind, then clear the player store.
  const handleClose = () => {
    // Stop any native playback
    if (isUsingNativePlaybackRef.current && platform.metadata.isTauri) {
      try {
        platform.audio.stopPlayback();
      } catch (error) {
        debug.error('Failed to stop native playback:', error);
      }
    }
    // Stop WaveSurfer
    if (wavesurferRef.current) {
      wavesurferRef.current.pause();
      wavesurferRef.current.seekTo(0);
    }
    // Reset player state
    reset();
  };

  // Don't render if no audio
  if (!audioUrl) {
    return null;
  }

  return (
{/* Play/Pause Button */} {/* Waveform */}
0 ? [(currentTime / duration) * 100] : [0]} onValueChange={handleSeek} max={100} step={0.1} className="w-full" aria-label="Playback position" aria-valuetext={`${formatAudioDuration(currentTime)} of ${formatAudioDuration(duration)}`} /> {error &&
{error}
}
{/* Time Display */}
{formatAudioDuration(currentTime)} / {formatAudioDuration(duration)}
{/* Loop Button */} {/* Volume Control */}
Volume level, {Math.round(volume * 100)}%
{/* Close Button */}
); } ================================================ FILE: app/src/components/AudioStudio/.gitkeep ================================================ # Audio studio timeline editing components ================================================ FILE: app/src/components/AudioTab/AudioTab.tsx ================================================ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; import { Check, CheckCircle2, Edit, Plus, Speaker, Trash2 } from 'lucide-react'; import { useState } from 'react'; import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle, } from '@/components/ui/dialog'; import { Input } from '@/components/ui/input'; import { Label } from '@/components/ui/label'; import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue, } from '@/components/ui/select'; import { apiClient } from '@/lib/api/client'; import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui'; import { cn } from '@/lib/utils/cn'; import { usePlatform } from '@/platform/PlatformContext'; import { usePlayerStore } from '@/stores/playerStore'; interface AudioDevice { id: string; name: string; is_default: boolean; } export function AudioTab() { const platform = usePlatform(); const [createDialogOpen, setCreateDialogOpen] = useState(false); const [editingChannel, setEditingChannel] = useState(null); const [selectedChannelId, setSelectedChannelId] = useState(null); const queryClient = useQueryClient(); const audioUrl = usePlayerStore((state) => state.audioUrl); const isPlayerVisible = !!audioUrl; const { data: channels, isLoading: channelsLoading } = useQuery({ queryKey: ['channels'], queryFn: () => apiClient.listChannels(), }); const { data: devices, isLoading: devicesLoading } = useQuery({ queryKey: ['audio-devices'], queryFn: async () => { if (!platform.metadata.isTauri) { return []; } try { return await 
platform.audio.listOutputDevices(); } catch (error) { console.error('Failed to list audio devices:', error); return []; } }, enabled: platform.metadata.isTauri, }); const { data: profiles } = useQuery({ queryKey: ['profiles'], queryFn: () => apiClient.listProfiles(), }); const createChannel = useMutation({ mutationFn: (data: { name: string; device_ids: string[] }) => apiClient.createChannel(data), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['channels'] }); setCreateDialogOpen(false); }, }); const updateChannel = useMutation({ mutationFn: ({ channelId, data, }: { channelId: string; data: { name?: string; device_ids?: string[] }; }) => apiClient.updateChannel(channelId, data), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['channels'] }); queryClient.invalidateQueries({ queryKey: ['profile-channels'] }); setEditingChannel(null); }, }); const deleteChannel = useMutation({ mutationFn: (channelId: string) => apiClient.deleteChannel(channelId), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['channels'] }); queryClient.invalidateQueries({ queryKey: ['profile-channels'] }); }, }); const { data: channelVoices } = useQuery({ queryKey: ['channel-voices', editingChannel], queryFn: async () => { if (!editingChannel) return { profile_ids: [] }; return apiClient.getChannelVoices(editingChannel); }, enabled: !!editingChannel, }); const setChannelVoices = useMutation({ mutationFn: ({ channelId, profileIds }: { channelId: string; profileIds: string[] }) => apiClient.setChannelVoices(channelId, profileIds), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['channel-voices'] }); queryClient.invalidateQueries({ queryKey: ['profile-channels'] }); }, }); if (channelsLoading || devicesLoading) { return (
Loading...
);
  }

  // Fall back to empty lists so the render below can always iterate.
  const allChannels = channels || [];
  const allDevices = devices || [];

  // Channel object matching the current selection; null when nothing is selected.
  const selectedChannel = !selectedChannelId
    ? null
    : allChannels.find((c) => c.id === selectedChannelId);

  // Confirm with the user, then delete the channel. The event is stopped first so it
  // does not reach ancestor handlers.
  const handleChannelDelete = async (e, channelId) => {
    e.stopPropagation();
    const confirmed = await confirm('Delete this channel?');
    if (!confirmed) {
      return;
    }
    deleteChannel.mutate(channelId);
  };

  return (

Audio Channels

{/* Left Column - Channels */}
{allChannels.length === 0 ? (

No audio channels yet. Create your first channel to route voices to specific devices.

) : (
{allChannels.map((channel) => { const isSelected = selectedChannelId === channel.id; return (
)}
); })}
)}
{/* Right Column - Available Devices */}

Available Devices

{selectedChannelId ? selectedChannel?.is_default ? 'Default channel uses system default device' : 'Click devices to add or remove them from the selected channel' : 'Select a channel to assign devices'}

{allDevices.length > 0 ? (
{allDevices.map((device) => { const isConnected = selectedChannelId && selectedChannel && (selectedChannel.device_ids.length === 0 ? device.is_default : selectedChannel.device_ids.includes(device.id)); const canToggle = selectedChannelId && selectedChannel && !selectedChannel.is_default; const handleDeviceClick = () => { if (!canToggle || !selectedChannel) return; const currentDeviceIds = selectedChannel.device_ids; const newDeviceIds = isConnected ? currentDeviceIds.filter((id) => id !== device.id) : [...currentDeviceIds, device.id]; updateChannel.mutate({ channelId: selectedChannelId, data: { device_ids: newDeviceIds }, }); }; return ( ); })}
) : (

{platform.metadata.isTauri ? 'No audio devices found' : 'Audio device selection requires Tauri'}

)}
{/* Create Channel Dialog */} { createChannel.mutate({ name, device_ids: deviceIds }); }} /> {/* Edit Channel Dialog */} {editingChannel && (() => { const channel = channels?.find((c) => c.id === editingChannel); return channel ? ( !open && setEditingChannel(null)} channel={channel} devices={devices || []} profiles={profiles || []} channelVoices={channelVoices?.profile_ids || []} onUpdate={(name, deviceIds) => { updateChannel.mutate({ channelId: editingChannel, data: { name, device_ids: deviceIds }, }); }} onSetVoices={(profileIds) => { setChannelVoices.mutate({ channelId: editingChannel, profileIds, }); }} /> ) : null; })()}
); } function ChannelVoicesList({ channelId }: { channelId: string }) { const { data: voices } = useQuery({ queryKey: ['channel-voices', channelId], queryFn: () => apiClient.getChannelVoices(channelId), }); const { data: profiles } = useQuery({ queryKey: ['profiles'], queryFn: () => apiClient.listProfiles(), }); const voiceNames = voices?.profile_ids.map((id) => profiles?.find((p) => p.id === id)?.name).filter(Boolean) || []; return (
{voiceNames.length > 0 ? ( voiceNames.map((name) => ( {name} )) ) : ( No voices assigned )}
); } interface CreateChannelDialogProps { open: boolean; onOpenChange: (open: boolean) => void; devices: AudioDevice[]; onCreate: (name: string, deviceIds: string[]) => void; } function CreateChannelDialog({ open, onOpenChange, devices, onCreate }: CreateChannelDialogProps) { const [name, setName] = useState(''); const [selectedDevices, setSelectedDevices] = useState([]); const handleSubmit = () => { if (name.trim()) { onCreate(name.trim(), selectedDevices); setName(''); setSelectedDevices([]); } }; return ( Create Audio Channel Create a new audio channel (bus) to route voices to specific output devices.
setName(e.target.value)} placeholder="e.g., Virtual Cable, Broadcast" />
{selectedDevices.length > 0 && (
{selectedDevices.map((deviceId) => { const device = devices.find((d) => d.id === deviceId); return (
{device?.name || deviceId}
); })}
)}
); } interface EditChannelDialogProps { open: boolean; onOpenChange: (open: boolean) => void; channel: { id: string; name: string; device_ids: string[]; }; devices: AudioDevice[]; profiles: Array<{ id: string; name: string }>; channelVoices: string[]; onUpdate: (name: string, deviceIds: string[]) => void; onSetVoices: (profileIds: string[]) => void; } function EditChannelDialog({ open, onOpenChange, channel, devices, profiles, channelVoices, onUpdate, onSetVoices, }: EditChannelDialogProps) { const [name, setName] = useState(channel.name); const [selectedDevices, setSelectedDevices] = useState(channel.device_ids); const [selectedVoices, setSelectedVoices] = useState(channelVoices); const handleSubmit = () => { if (name.trim()) { onUpdate(name.trim(), selectedDevices); onSetVoices(selectedVoices); } }; return ( Edit Channel Update channel settings and voice assignments.
setName(e.target.value)} />
{selectedDevices.length > 0 && (
{selectedDevices.map((deviceId) => { const device = devices.find((d) => d.id === deviceId); return (
{device?.name || deviceId}
); })}
)}
{selectedVoices.length > 0 && (
{selectedVoices.map((profileId) => { const profile = profiles.find((p) => p.id === profileId); return (
{profile?.name || profileId}
); })}
)}
); } ================================================ FILE: app/src/components/Effects/EffectsChainEditor.tsx ================================================ import { closestCenter, DndContext, type DragEndEvent, KeyboardSensor, PointerSensor, useSensor, useSensors, } from '@dnd-kit/core'; import { arrayMove, SortableContext, sortableKeyboardCoordinates, useSortable, verticalListSortingStrategy, } from '@dnd-kit/sortable'; import { CSS } from '@dnd-kit/utilities'; import { useQuery } from '@tanstack/react-query'; import { ChevronDown, ChevronRight, GripVertical, Plus, Power, Trash2 } from 'lucide-react'; import { useCallback, useMemo, useRef, useState } from 'react'; import { Button } from '@/components/ui/button'; import { Label } from '@/components/ui/label'; import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue, } from '@/components/ui/select'; import { Slider } from '@/components/ui/slider'; import { apiClient } from '@/lib/api/client'; import type { AvailableEffect, EffectConfig, EffectPresetResponse } from '@/lib/api/types'; import { cn } from '@/lib/utils/cn'; // Each effect in the chain gets a stable ID for dnd-kit interface EffectWithId extends EffectConfig { _id: string; } let nextId = 0; function makeId() { return `fx-${++nextId}`; } interface EffectsChainEditorProps { value: EffectConfig[]; onChange: (chain: EffectConfig[]) => void; compact?: boolean; showPresets?: boolean; } export function EffectsChainEditor({ value, onChange, compact = false, showPresets = true, }: EffectsChainEditorProps) { const [expandedId, setExpandedId] = useState(null); // Maintain stable IDs for each effect across renders. // We use a ref to map value items to IDs, rebuilding when length changes. 
const idsRef = useRef([]);
  // Pair every effect in `value` with the ID stored at the same index in idsRef.
  // The ref is resized here (during render) so it always has exactly value.length entries.
  const items: EffectWithId[] = useMemo(() => {
    // Grow ID array if effects were added
    while (idsRef.current.length < value.length) {
      idsRef.current.push(makeId());
    }
    // Shrink if effects were removed
    if (idsRef.current.length > value.length) {
      idsRef.current = idsRef.current.slice(0, value.length);
    }
    return value.map((e, i) => ({ ...e, _id: idsRef.current[i] }));
  }, [value]);

  // Drag sensors: pointer drags start only after 5px of movement; keyboard sorting is also enabled.
  const sensors = useSensors(
    useSensor(PointerSensor, { activationConstraint: { distance: 5 } }),
    useSensor(KeyboardSensor, { coordinateGetter: sortableKeyboardCoordinates }),
  );

  // Effect definitions are never considered stale (staleTime: Infinity).
  const { data: availableEffects } = useQuery({
    queryKey: ['available-effects'],
    queryFn: () => apiClient.getAvailableEffects(),
    staleTime: Infinity,
  });

  // Presets are considered fresh for 30 seconds.
  const { data: presets } = useQuery({
    queryKey: ['effect-presets'],
    queryFn: () => apiClient.listEffectPresets(),
    staleTime: 30_000,
  });

  // Lookup from effect type to its definition; empty until availableEffects has loaded.
  const effectsMap = useMemo(() => {
    const m = new Map();
    if (availableEffects) {
      for (const e of availableEffects.effects) {
        m.set(e.type, e);
      }
    }
    return m;
  }, [availableEffects]);

  // Append a new enabled effect of the given type, seeded with each param's default,
  // and expand it. Unknown types are ignored.
  function addEffect(type: string) {
    const def = effectsMap.get(type);
    if (!def) return;
    const params: Record = {};
    for (const [key, p] of Object.entries(def.params)) {
      params[key] = p.default;
    }
    const newEffect: EffectConfig = { type, enabled: true, params };
    const newId = makeId();
    // Register the ID before onChange so the items memo finds it at the new index.
    idsRef.current = [...idsRef.current, newId];
    onChange([...value, newEffect]);
    setExpandedId(newId);
  }

  // Remove the effect at `index` together with its ID; collapse it if it was expanded.
  const removeEffect = useCallback(
    (index: number) => {
      const removedId = idsRef.current[index];
      idsRef.current = idsRef.current.filter((_, i) => i !== index);
      onChange(value.filter((_, i) => i !== index));
      if (expandedId === removedId) setExpandedId(null);
    },
    [value, onChange, expandedId],
  );

  // Flip the `enabled` flag of the effect at `index`.
  const toggleEnabled = useCallback(
    (index: number) => {
      onChange(value.map((e, i) => (i === index ? { ...e, enabled: !e.enabled } : e)));
    },
    [value, onChange],
  );

  // Set one parameter of the effect at `index`, copying the effect and its params object.
  const updateParam = useCallback(
    (index: number, paramName: string, paramValue: number) => {
      onChange(
        value.map((e, i) =>
          i === index ? { ...e, params: { ...e.params, [paramName]: paramValue } } : e,
        ),
      );
    },
    [value, onChange],
  );

  // Replace the whole chain with a preset's chain, issuing fresh IDs for every entry.
  function loadPreset(preset: EffectPresetResponse) {
    idsRef.current = preset.effects_chain.map(() => makeId());
    onChange(preset.effects_chain);
    setExpandedId(null);
  }

  // Empty the chain and drop all IDs.
  function clearAll() {
    idsRef.current = [];
    onChange([]);
    setExpandedId(null);
  }

  // Reorder after a drag: move the ID and the effect by the same indices so they stay paired.
  function handleDragEnd(event: DragEndEvent) {
    const { active, over } = event;
    if (!over || active.id === over.id) return;

    const oldIndex = idsRef.current.indexOf(active.id as string);
    const newIndex = idsRef.current.indexOf(over.id as string);
    if (oldIndex === -1 || newIndex === -1) return;

    idsRef.current = arrayMove(idsRef.current, oldIndex, newIndex);
    onChange(arrayMove([...value], oldIndex, newIndex));
  }

  return (
{/* Preset selector row */} {showPresets && (
{value.length > 0 && ( )}
)} {/* Sortable effects chain */} i._id)} strategy={verticalListSortingStrategy}> {items.map((effect, index) => ( setExpandedId(expandedId === effect._id ? null : effect._id)} onRemove={() => removeEffect(index)} onToggleEnabled={() => toggleEnabled(index)} onUpdateParam={(paramName, paramValue) => updateParam(index, paramName, paramValue)} /> ))} {/* Add effect */} {availableEffects && ( )}
); } // --------------------------------------------------------------------------- // Sortable effect item // --------------------------------------------------------------------------- interface SortableEffectItemProps { id: string; effect: EffectConfig; index: number; effectDef?: AvailableEffect; isExpanded: boolean; onToggleExpand: () => void; onRemove: () => void; onToggleEnabled: () => void; onUpdateParam: (paramName: string, paramValue: number) => void; } function SortableEffectItem({ id, effect, effectDef, isExpanded, onToggleExpand, onRemove, onToggleEnabled, onUpdateParam, }: SortableEffectItemProps) { const { attributes, listeners, setNodeRef, transform, transition, isDragging } = useSortable({ id, }); const style = { transform: CSS.Transform.toString(transform), transition, zIndex: isDragging ? 10 : undefined, }; const label = effectDef?.label ?? effect.type; return (
{/* Header */}
{label}
{/* Params */} {isExpanded && effectDef && (
{Object.entries(effectDef.params).map(([paramName, paramDef]) => { const currentValue = effect.params[paramName] ?? paramDef.default; return (
{currentValue.toFixed( paramDef.step < 1 ? Math.max(1, -Math.floor(Math.log10(paramDef.step))) : 0, )}
onUpdateParam(paramName, v)} />
); })}
)}
);
}

================================================
FILE: app/src/components/Effects/GenerationPicker.tsx
================================================
import { ChevronDown, Search } from 'lucide-react';
import { useMemo, useState } from 'react';
import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input';
import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover';
import type { HistoryResponse } from '@/lib/api/types';
import { useHistory } from '@/lib/hooks/useHistory';
import { cn } from '@/lib/utils/cn';

interface GenerationPickerProps {
  selectedId: string | null;
  onSelect: (generation: HistoryResponse) => void;
  className?: string;
}

// Searchable popover for picking one completed generation out of the 50 requested from history.
export function GenerationPicker({ selectedId, onSelect, className }: GenerationPickerProps) {
  const [open, setOpen] = useState(false);
  const [searchQuery, setSearchQuery] = useState('');

  const { data: historyData } = useHistory({ limit: 50 });

  // Only generations whose status is 'completed' are offered.
  const completedGenerations = useMemo(() => {
    const items = historyData?.items;
    if (!items) return [];
    return items.filter(({ status }) => status === 'completed');
  }, [historyData]);

  // Case-insensitive substring match against the generation text or its profile name.
  const filtered = useMemo(() => {
    if (!searchQuery) return completedGenerations;
    const needle = searchQuery.toLowerCase();
    const matches = (field: string) => field.toLowerCase().includes(needle);
    return completedGenerations.filter((gen) => matches(gen.text) || matches(gen.profile_name));
  }, [completedGenerations, searchQuery]);

  // Undefined when selectedId is null or is not among the completed generations.
  const selectedGeneration = completedGenerations.find(({ id }) => id === selectedId);

  return (
setSearchQuery(e.target.value)} className="h-8 pl-7 text-xs" />
{filtered.length === 0 ? (
No generations found
) : ( filtered.map((gen) => ( )) )}
); } ================================================ FILE: app/src/components/EffectsTab/EffectsDetail.tsx ================================================ import { useQuery, useQueryClient } from '@tanstack/react-query'; import { Loader2, Play, Save, Trash2, Wand2 } from 'lucide-react'; import { useEffect, useRef, useState } from 'react'; import { EffectsChainEditor } from '@/components/Effects/EffectsChainEditor'; import { GenerationPicker } from '@/components/Effects/GenerationPicker'; import { Button } from '@/components/ui/button'; import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle, } from '@/components/ui/dialog'; import { Input } from '@/components/ui/input'; import { Label } from '@/components/ui/label'; import { Separator } from '@/components/ui/separator'; import { Textarea } from '@/components/ui/textarea'; import { useToast } from '@/components/ui/use-toast'; import { apiClient } from '@/lib/api/client'; import type { HistoryResponse } from '@/lib/api/types'; import { useHistory } from '@/lib/hooks/useHistory'; import { useEffectsStore } from '@/stores/effectsStore'; import { usePlayerStore } from '@/stores/playerStore'; export function EffectsDetail() { const selectedPresetId = useEffectsStore((s) => s.selectedPresetId); const isCreatingNew = useEffectsStore((s) => s.isCreatingNew); const workingChain = useEffectsStore((s) => s.workingChain); const setWorkingChain = useEffectsStore((s) => s.setWorkingChain); const setSelectedPresetId = useEffectsStore((s) => s.setSelectedPresetId); const setIsCreatingNew = useEffectsStore((s) => s.setIsCreatingNew); const [name, setName] = useState(''); const [description, setDescription] = useState(''); const [saving, setSaving] = useState(false); const [deleting, setDeleting] = useState(false); // "Save as Custom" dialog state const [saveAsDialogOpen, setSaveAsDialogOpen] = useState(false); const [saveAsName, setSaveAsName] = useState(''); const [saveAsDescription, 
setSaveAsDescription] = useState(''); // Preview state const [previewGenId, setPreviewGenId] = useState(null); const [previewLoading, setPreviewLoading] = useState(false); const blobUrlRef = useRef(null); const setAudioWithAutoPlay = usePlayerStore((s) => s.setAudioWithAutoPlay); const { toast } = useToast(); const queryClient = useQueryClient(); // Auto-select the most recent generation as preview source const { data: historyData } = useHistory({ limit: 1 }); useEffect(() => { if (!previewGenId && historyData?.items?.length) { const first = historyData.items.find((g) => g.status === 'completed'); if (first) setPreviewGenId(first.id); } }, [historyData, previewGenId]); const { data: preset } = useQuery({ queryKey: ['effect-preset', selectedPresetId], queryFn: () => selectedPresetId ? apiClient .listEffectPresets() .then((all) => all.find((p) => p.id === selectedPresetId) ?? null) : null, enabled: !!selectedPresetId, staleTime: 30_000, }); // Sync name/description when selecting a preset useEffect(() => { if (preset) { setName(preset.name); setDescription(preset.description ?? ''); } else if (isCreatingNew) { setName(''); setDescription(''); } }, [preset, isCreatingNew]); // Cleanup blob URL on unmount useEffect(() => { return () => { if (blobUrlRef.current) { URL.revokeObjectURL(blobUrlRef.current); blobUrlRef.current = null; } }; }, []); const isEditing = !!selectedPresetId || isCreatingNew; const isBuiltIn = preset?.is_builtin ?? 
false;

  /** Turns a caught value into the description text used by the failure toasts. */
  function describeError(error: unknown): string {
    return error instanceof Error ? error.message : 'Unknown error';
  }

  /**
   * Requests a preview of the working chain for the selected generation and
   * plays the returned blob through the shared player store.
   *
   * Does nothing when no generation is selected or the chain is empty.
   * Failures are surfaced as a destructive toast; the loading flag is always
   * cleared.
   */
  async function handlePreview() {
    if (!previewGenId || workingChain.length === 0) return;

    setPreviewLoading(true);
    try {
      const blob = await apiClient.previewEffects(previewGenId, workingChain);

      // Revoke old blob URL
      if (blobUrlRef.current) {
        URL.revokeObjectURL(blobUrlRef.current);
      }
      const url = URL.createObjectURL(blob);
      blobUrlRef.current = url;

      // Play through the main audio player
      setAudioWithAutoPlay(url, `preview-${Date.now()}`, null, 'Effects Preview');
    } catch (error) {
      toast({
        title: 'Preview failed',
        description: describeError(error),
        variant: 'destructive',
      });
    } finally {
      setPreviewLoading(false);
    }
  }

  /** Records the picked generation as the preview source. */
  function handleSelectGeneration(gen: HistoryResponse) {
    setPreviewGenId(gen.id);
  }

  /**
   * Creates a new preset from the current name, description and working chain,
   * then leaves "creating" mode and selects the created preset.
   *
   * A blank name is rejected with a "Name required" toast.
   */
  async function handleSaveNew() {
    if (!name.trim()) {
      toast({ title: 'Name required', variant: 'destructive' });
      return;
    }

    setSaving(true);
    try {
      const created = await apiClient.createEffectPreset({
        name: name.trim(),
        description: description.trim() || undefined,
        effects_chain: workingChain,
      });
      queryClient.invalidateQueries({ queryKey: ['effect-presets'] });
      setIsCreatingNew(false);
      setSelectedPresetId(created.id);
      toast({ title: 'Preset saved', description: `"${created.name}" has been created.` });
    } catch (error) {
      toast({
        title: 'Failed to save',
        description: describeError(error),
        variant: 'destructive',
      });
    } finally {
      setSaving(false);
    }
  }

  /**
   * Writes the current name, description and working chain back to the
   * selected preset and invalidates both the list query and the single-preset
   * query.
   *
   * Does nothing when no preset is selected. A blank name is rejected with the
   * same "Name required" toast the other save handlers use, instead of being
   * ignored silently.
   */
  async function handleSaveExisting() {
    if (!selectedPresetId) return;
    if (!name.trim()) {
      toast({ title: 'Name required', variant: 'destructive' });
      return;
    }

    setSaving(true);
    try {
      await apiClient.updateEffectPreset(selectedPresetId, {
        name: name.trim(),
        description: description.trim() || undefined,
        effects_chain: workingChain,
      });
      queryClient.invalidateQueries({ queryKey: ['effect-presets'] });
      queryClient.invalidateQueries({ queryKey: ['effect-preset', selectedPresetId] });
      toast({ title: 'Preset updated' });
    } catch (error) {
      toast({
        title: 'Failed to save',
        description: describeError(error),
        variant: 'destructive',
      });
    } finally {
      setSaving(false);
    }
  }

  /** Opens the "save as" dialog, pre-filled from the current name and description. */
  function handleSaveAsNew() {
    // Open the dialog with a suggested name based on the current preset
    setSaveAsName(`${name} (Copy)`);
    setSaveAsDescription(description);
    setSaveAsDialogOpen(true);
  }

  /**
   * Creates a preset from the dialog's name/description and the working chain,
   * closes the dialog and selects the created preset.
   *
   * A blank dialog name is rejected with a "Name required" toast.
   */
  async function handleSaveAsConfirm() {
    if (!saveAsName.trim()) {
      toast({ title: 'Name required', variant: 'destructive' });
      return;
    }

    setSaving(true);
    try {
      const created = await apiClient.createEffectPreset({
        name: saveAsName.trim(),
        description: saveAsDescription.trim() || undefined,
        effects_chain: workingChain,
      });
      queryClient.invalidateQueries({ queryKey: ['effect-presets'] });
      setSaveAsDialogOpen(false);
      setSelectedPresetId(created.id);
      toast({ title: 'Preset saved', description: `"${created.name}" has been created.` });
    } catch (error) {
      toast({
        title: 'Failed to save',
        description: describeError(error),
        variant: 'destructive',
      });
    } finally {
      setSaving(false);
    }
  }

  /**
   * Deletes the selected preset, then clears the selection and empties the
   * working chain. Does nothing when no preset is selected.
   */
  async function handleDelete() {
    if (!selectedPresetId) return;

    setDeleting(true);
    try {
      await apiClient.deleteEffectPreset(selectedPresetId);
      queryClient.invalidateQueries({ queryKey: ['effect-presets'] });
      setSelectedPresetId(null);
      setWorkingChain([]);
      toast({ title: 'Preset deleted' });
    } catch (error) {
      toast({
        title: 'Failed to delete',
        description: describeError(error),
        variant: 'destructive',
      });
    } finally {
      setDeleting(false);
    }
  }

  if (!isEditing) {
    return (

Select a preset or create a new one

); } return (
{/* Header */}

{isCreatingNew ? 'New Preset' : isBuiltIn ? preset?.name : 'Edit Preset'}

{!isBuiltIn && !isCreatingNew && ( <> )} {isCreatingNew && ( )} {isBuiltIn && ( )}
{/* Scrollable content */}
{/* Name & description */} {(isCreatingNew || !isBuiltIn) && (
setName(e.target.value)} placeholder="My preset..." className="h-9" />