Repository: 0x0funky/audioghost-ai Branch: main Commit: 2f309f19f9d2 Files: 37 Total size: 209.8 KB Directory structure: gitextract_y5kuu04t/ ├── .gitignore ├── LICENSE ├── QUICKSTART.md ├── README.md ├── backend/ │ ├── api/ │ │ ├── __init__.py │ │ ├── auth.py │ │ ├── separate.py │ │ └── tasks.py │ ├── main.py │ ├── requirements.txt │ └── workers/ │ ├── __init__.py │ ├── celery_app.py │ └── tasks.py ├── docker-compose.yml ├── frontend/ │ ├── .gitignore │ ├── README.md │ ├── eslint.config.mjs │ ├── next.config.ts │ ├── package.json │ ├── postcss.config.mjs │ ├── src/ │ │ ├── app/ │ │ │ ├── globals.css │ │ │ ├── layout.tsx │ │ │ └── page.tsx │ │ └── components/ │ │ ├── AudioUploader.tsx │ │ ├── AuthModal.tsx │ │ ├── Header.tsx │ │ ├── ProgressTracker.tsx │ │ ├── SeparationPanel.tsx │ │ ├── StemMixer.tsx │ │ ├── VideoStemMixer.tsx │ │ └── WaveformEditor.tsx │ └── tsconfig.json ├── install.bat ├── sam_audio_lite.py ├── start.bat ├── stop.bat └── test_video_only.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Python __pycache__/ *.py[cod] *$py.class *.so *.egg *.egg-info/ dist/ build/ eggs/ *.manifest *.spec pip-log.txt pip-delete-this-directory.txt # Virtual environments venv/ env/ ENV/ .venv/ # IDEs .idea/ .vscode/ *.swp *.swo *~ .project .pydevproject .settings/ # SAM Audio original repo (install from pip instead) sam_audio/ eval/ examples/ assets/ .github/ .checkpoints/ checkpoints/ sam_audio.egg-info/ # Test files and outputs *.mp3 *.wav *.mp4 output_*.wav output_*.mp4 test.mp3 test_audio.wav office.mp4 # Test scripts (keep sam_audio_lite.py for reference) test_small.py test_video.py # Secrets .hf_token .env *.env # Jupyter .ipynb_checkpoints/ *.ipynb # OS .DS_Store Thumbs.db # Node.js (frontend) frontend/node_modules/ frontend/.next/ frontend/out/ # Redis/Celery redis/ *.rdb 
celerybeat-schedule celerybeat.pid # Logs *.log logs/ # Uploads/Outputs (runtime generated) backend/uploads/ backend/outputs/ # Original repo files (not needed for our fork) CODE_OF_CONDUCT.md CONTRIBUTING.md .pre-commit-config.yaml pyproject.toml ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 AudioGhost AI Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- ## Third-Party Licenses This project uses the following third-party components: ### SAM-Audio SAM-Audio is developed by Meta AI Research and is subject to Meta's research license. See https://github.com/facebookresearch/sam-audio for more information. ================================================ FILE: QUICKSTART.md ================================================ # AudioGhost AI 啟動指南 ## 快速啟動 ### 1. 啟動 Redis (使用 Docker) ```powershell cd d:\sam_audio docker-compose up -d ``` ### 2. 
建立 Anaconda 環境 ```powershell # 建立新環境 (Python 3.11+ 必要) conda create -n audioghost python=3.11 -y # 啟動環境 conda activate audioghost ``` ### 3. 安裝 PyTorch (CUDA 12.6) ```powershell pip install torch==2.9.0+cu126 torchvision==0.24.0+cu126 torchaudio==2.9.0+cu126 --index-url https://download.pytorch.org/whl/cu126 --extra-index-url https://pypi.org/simple ``` ### 4. 安裝 FFmpeg (TorchCodec 需要) ```powershell conda install -c conda-forge ffmpeg -y ``` ### 5. 安裝 SAM Audio ```powershell cd d:\sam_audio pip install . ``` ### 6. 安裝 Backend 依賴 ```powershell cd d:\sam_audio\backend pip install -r requirements.txt ``` ### 7. 啟動 Backend API ```powershell cd d:\sam_audio\backend uvicorn main:app --reload --port 8000 ``` ### 8. 啟動 Celery Worker (新終端機) ```powershell conda activate audioghost cd d:\sam_audio\backend celery -A workers.celery_app worker --loglevel=info --pool=solo ``` ### 9. 啟動 Frontend (新終端機) ```powershell cd d:\sam_audio\frontend npm run dev ``` ### 10. 開啟瀏覽器 訪問 http://localhost:3000 ## 首次使用 1. 點擊右上角 "Connect HuggingFace" 按鈕 2. 前往 https://huggingface.co/facebook/sam-audio-large 申請存取權限 3. 建立 Access Token: https://huggingface.co/settings/tokens 4. 將 Token 貼入並連接 ## 功能使用 - **上傳音訊**:拖放或點擊上傳區域 - **語意分離**:選擇快捷按鈕或輸入自訂描述 - **時間鎖定**:在波形圖上選取區域 - **三軌輸出**:Original / Ghost / Clean ================================================ FILE: README.md ================================================ # AudioGhost AI 🎵👻 ![AudioGhost Banner](banner.png) **AI-Powered Object-Oriented Audio Separation** Describe the sound you want to extract or remove using natural language. Powered by Meta's [SAM-Audio](https://github.com/facebookresearch/sam-audio) model.
![Demo](https://img.shields.io/badge/status-MVP%20v1.0-green) ![Python](https://img.shields.io/badge/python-3.11+-blue) ![License](https://img.shields.io/badge/license-MIT-lightgrey) ## 🎬 Demo ### Audio Separation https://github.com/user-attachments/assets/49248e25-0c56-46ab-a821-2de7f7016bb6 ### Video Upload https://github.com/user-attachments/assets/6b8c08a8-c84f-4fc3-83ad-5703f474fc1b ## Features - 🎯 **Text-Guided Separation** - Describe what you want to extract: "vocals", "drums", "a dog barking" - 🎬 **Video Upload Support** - Upload videos and extract/remove audio sources (audio extraction only, not vision-based) - 🚀 **Memory Optimized** - Lite mode reduces VRAM from ~11GB to ~4GB - 🎨 **Modern UI** - Glassmorphism design with waveform visualization - ⚡ **Real-time Progress** - Track separation progress in real-time - 🎛️ **Stem Mixer** - Preview and compare original, extracted, and residual audio ## 🗺️ Roadmap - 🖱️ **Visual Prompting** - Click on video to select sound sources visually (Integration with [SAM 2](https://github.com/facebookresearch/sam2)) ## Architecture ``` ┌─────────────────────────────────────────────────┐ │ Frontend │ │ (Next.js + Tailwind v4) │ └──────────────────────┬──────────────────────────┘ │ ┌──────────────────────▼──────────────────────────┐ │ Backend API │ │ (FastAPI + Python) │ └──────────────────────┬──────────────────────────┘ │ ┌──────────────────────▼──────────────────────────┐ │ Task Queue │ │ (Celery + Redis) │ └──────────────────────┬──────────────────────────┘ │ ┌──────────────────────▼──────────────────────────┐ │ SAM Audio Lite │ │ (Memory-optimized Meta SAM-Audio) │ └─────────────────────────────────────────────────┘ ``` ## Requirements - **Python 3.11+** - **CUDA-compatible GPU** (4GB+ VRAM for lite mode, 12GB+ for full mode) - **CUDA 12.6** (recommended) - **Node.js 18+** (for frontend) > 💡 FFmpeg and Redis are automatically installed by the installer. 
## 🚀 One-Click Installation (Recommended) ### First Time Setup ```bash # Run installer (creates Conda env, downloads Redis, installs all dependencies) install.bat ``` ### Daily Usage ```bash # Start all services with one click start.bat # Stop all services stop.bat ``` --- ## Manual Setup (Advanced) ### 1. Start Redis Redis is automatically downloaded to `redis/` folder by `install.bat`. If you prefer Docker: ```bash docker-compose up -d ``` ### 2. Create Anaconda Environment ```bash # Create new environment (Python 3.11+ required) conda create -n audioghost python=3.11 -y # Activate environment conda activate audioghost ``` ### 3. Install PyTorch (CUDA 12.6) ```bash pip install torch==2.9.0+cu126 torchvision==0.24.0+cu126 torchaudio==2.9.0+cu126 --index-url https://download.pytorch.org/whl/cu126 --extra-index-url https://pypi.org/simple ``` ### 4. Install FFmpeg (required by TorchCodec) ```bash conda install -c conda-forge ffmpeg -y ``` ### 5. Install SAM Audio ```bash pip install git+https://github.com/facebookresearch/sam-audio.git ``` ### 6. Install Backend Dependencies ```bash cd backend pip install -r requirements.txt ``` ### 7. Install Frontend Dependencies ```bash cd frontend npm install ``` ### 8. Start Services **Terminal 1 - Backend API:** ```bash cd backend uvicorn main:app --reload --port 8000 ``` **Terminal 2 - Celery Worker:** ```bash conda activate audioghost cd backend celery -A workers.celery_app worker --loglevel=info --pool=solo ``` **Terminal 3 - Frontend:** ```bash cd frontend npm run dev ``` ### 9. Open the App Navigate to `http://localhost:3000` ### 10. Connect HuggingFace 1. Click "Connect HuggingFace" button 2. Request access at https://huggingface.co/facebook/sam-audio-large 3. Create Access Token: https://huggingface.co/settings/tokens 4. Paste the token and connect ## Usage 1. **Upload** an audio file (MP3, WAV, FLAC) 2. 
**Describe** what you want to extract or remove: - "vocals" / "singing voice" - "drums" / "percussion" - "background music" - "a dog barking" - "crowd noise" 3. Click **Extract** or **Remove** 4. Wait for processing 5. **Preview** and **download** the results ## Performance Benchmarks > Tested on RTX 4090 with 4:26 audio (11 chunks @ 25s each) ### VRAM Usage (Lite Mode) | Model | bfloat16 (Default) | float32 (High Quality) | Recommended GPU | |-------|-------------------|------------------------|-----------------| | Small | **~6 GB** | **~10 GB** | RTX 3060 6GB / RTX 3070 8GB | | Base | **~7 GB** | **~13 GB** | RTX 3070/4060 8GB / RTX 4070 12GB | | Large | **~10 GB** | **~20 GB** | RTX 3080/4070 12GB / RTX 4080 16GB | > 💡 **High Quality Mode (float32)**: Better separation quality but uses +2-3GB more VRAM. Enable via the "High Quality Mode" toggle in the UI. ### Processing Time | Model | First Run (incl. model load) | Subsequent Runs | Speed | |-------|------------------------------|-----------------|-------| | Small | ~78s | **~25s** | ~10x realtime | | Base | ~100s | **~29s** | ~9x realtime | | Large | ~130s | **~41s** | ~6.5x realtime | > 💡 First run includes model download and loading. Subsequent runs use cached models. 
### Memory Optimization Details AudioGhost uses a "Lite Mode" that removes unused model components: | Component Removed | VRAM Saved | |-------------------|------------| | Vision Encoder | ~2GB | | Visual Ranker | ~2GB | | Text Ranker | ~2GB | | Span Predictor | ~1-2GB | **Total Reduction**: Up to **40% less VRAM** compared to original SAM-Audio This is achieved by: - Disabling video-related features (not needed for audio-only) - Using `predict_spans=False` and `reranking_candidates=1` - Using `bfloat16` precision by default (optional float32 for quality) - 25-second chunking for long audio files ## Project Structure ``` audioghost-ai/ ├── backend/ │ ├── main.py # FastAPI app │ ├── api/ # API routes │ │ ├── auth.py # HuggingFace auth │ │ ├── separate.py # Separation endpoints │ │ └── tasks.py # Task status & download endpoints │ └── workers/ │ ├── celery_app.py # Celery config │ └── tasks.py # SAM Audio Lite worker ├── frontend/ │ ├── src/ │ │ ├── app/ # Next.js app │ │ └── components/ # React components │ └── package.json ├── sam_audio_lite.py # Standalone lite version ├── QUICKSTART.md # Quick setup guide └── README.md ``` ## API Reference ### POST /api/separate/ Create a separation task. **Form Data:** - `file` - Audio or video file (for video, the audio track is extracted) - `description` - Text prompt (e.g., "vocals") - `mode` - "extract" or "remove" - `model_size` - "small", "base", or "large" (default: "base") - `start_time` / `end_time` - Optional time range in seconds for temporal prompting (both must be given) - `chunk_duration` - Chunk length in seconds, clamped to 5-60 (default: 25) - `use_float32` - "true" to use float32 precision (default: "false") **Response:** ```json { "task_id": "uuid", "status": "pending", "message": "Task submitted successfully" } ``` ### GET /api/tasks/{task_id} Get task status and progress. ### GET /api/tasks/{task_id}/download/{file_type} Download a result file; `file_type` is `original`, `ghost`, `clean`, or `video`.
## Troubleshooting ### CUDA Out of Memory - Use `model_size: "small"` instead of "base" or "large" - Ensure lite mode is enabled (check for "Optimizing model for low VRAM" in logs) - Close other GPU applications ### TorchCodec DLL Error - Downgrade to FFmpeg 7.x - Ensure FFmpeg `bin` directory is in PATH ### HuggingFace 401 Error - Re-authenticate via the UI - Check that `.hf_token` exists in `backend/` ## License This project is licensed under the MIT License. SAM-Audio is licensed by Meta under a research license. ## Credits - [SAM-Audio](https://github.com/facebookresearch/sam-audio) by Meta AI Research - **Core Optimization Logic**: Special thanks to [NilanEkanayake](https://github.com/NilanEkanayake) for providing the initial code modifications in [Issue #24](https://github.com/facebookresearch/sam-audio/issues/24) that made VRAM inference reduction possible. - Built with ❤️ using Next.js, FastAPI, and Celery ================================================ FILE: backend/api/__init__.py ================================================ """API Package""" ================================================ FILE: backend/api/auth.py ================================================ """ Authentication API - HuggingFace Token Management """ import os from pathlib import Path from typing import Optional from fastapi import APIRouter, HTTPException from pydantic import BaseModel from huggingface_hub import HfApi, hf_hub_download from huggingface_hub.utils import HfHubHTTPError router = APIRouter() # Token storage path (use absolute path based on this file's location) BACKEND_DIR = Path(__file__).parent.parent TOKEN_FILE = BACKEND_DIR / ".hf_token" CHECKPOINTS_DIR = BACKEND_DIR / "checkpoints" class TokenRequest(BaseModel): token: str class AuthStatus(BaseModel): authenticated: bool model_downloaded: bool model_name: Optional[str] = None def get_saved_token() -> Optional[str]: """Get saved HuggingFace token""" if TOKEN_FILE.exists(): return TOKEN_FILE.read_text().strip() 
return os.environ.get("HF_TOKEN") def save_token(token: str): """Save HuggingFace token""" TOKEN_FILE.write_text(token) def check_model_downloaded() -> bool: """Check if SAM Audio model is downloaded""" # Check for common model files model_files = list(CHECKPOINTS_DIR.glob("*.safetensors")) + \ list(CHECKPOINTS_DIR.glob("*.bin")) return len(model_files) > 0 @router.get("/status", response_model=AuthStatus) async def get_auth_status(): """Check authentication and model status""" token = get_saved_token() authenticated = False if token: try: api = HfApi(token=token) api.whoami() authenticated = True except Exception: authenticated = False return AuthStatus( authenticated=authenticated, model_downloaded=check_model_downloaded(), model_name="facebook/sam-audio-large" if check_model_downloaded() else None ) @router.post("/login") async def login(request: TokenRequest): """Validate and save HuggingFace token""" try: # Validate token api = HfApi(token=request.token) user_info = api.whoami() # Check if user has access to SAM Audio try: api.model_info("facebook/sam-audio-large", token=request.token) except HfHubHTTPError as e: if "403" in str(e) or "401" in str(e): raise HTTPException( status_code=403, detail="You need to request access to facebook/sam-audio-large on HuggingFace first" ) raise # Save token save_token(request.token) return { "success": True, "username": user_info.get("name", "Unknown"), "message": "Successfully authenticated" } except HTTPException: raise except Exception as e: raise HTTPException(status_code=401, detail=f"Invalid token: {str(e)}") @router.post("/download-model") async def download_model(): """Download SAM Audio model""" token = get_saved_token() if not token: raise HTTPException(status_code=401, detail="Not authenticated") try: # Note: In production, this should be a background task # For MVP, we'll use the HuggingFace auto-download feature # which downloads on first use return { "success": True, "message": "Model will be downloaded 
automatically on first use" } except Exception as e: raise HTTPException(status_code=500, detail=f"Download failed: {str(e)}") @router.post("/logout") async def logout(): """Clear saved token""" if TOKEN_FILE.exists(): TOKEN_FILE.unlink() return {"success": True, "message": "Logged out"} ================================================ FILE: backend/api/separate.py ================================================ """ Separation API - Audio/Video Separation Endpoints """ import uuid from pathlib import Path from typing import Optional, List from fastapi import APIRouter, UploadFile, File, Form, HTTPException from pydantic import BaseModel from workers.celery_app import celery_app from workers.tasks import separate_audio_task router = APIRouter() UPLOAD_DIR = Path("uploads") # Supported MIME types AUDIO_TYPES = ["audio/mpeg", "audio/wav", "audio/mp3", "audio/x-wav", "audio/flac", "audio/m4a", "audio/aac"] VIDEO_TYPES = ["video/mp4", "video/webm", "video/quicktime", "video/x-msvideo", "video/mpeg", "video/x-matroska"] VIDEO_EXTENSIONS = [".mp4", ".webm", ".mov", ".avi", ".mkv", ".mpeg"] class SeparationRequest(BaseModel): description: str mode: str = "extract" # "extract" or "remove" start_time: Optional[float] = None end_time: Optional[float] = None model_size: str = "base" # "small", "base", "large" class SeparationResponse(BaseModel): task_id: str status: str message: str @router.post("/", response_model=SeparationResponse) async def create_separation_task( file: UploadFile = File(...), description: str = Form(...), mode: str = Form("extract"), start_time: Optional[float] = Form(None), end_time: Optional[float] = Form(None), model_size: str = Form("base"), chunk_duration: float = Form(25.0), use_float32: str = Form("false") ): """ Create a new audio/video separation task - **file**: Audio or video file to process (video audio will be extracted) - **description**: Text prompt describing the sound to separate - **mode**: "extract" to isolate the sound, "remove" to 
remove it - **start_time**: Optional start time for temporal prompting - **end_time**: Optional end time for temporal prompting - **model_size**: SAM Audio model size (small/base/large) - **chunk_duration**: Audio chunk duration in seconds (5-60, default 25) - **use_float32**: Use float32 precision for better quality (default: false) """ # Validate chunk_duration chunk_duration = max(5.0, min(60.0, chunk_duration)) # Parse use_float32 from string to bool use_float32_bool = use_float32.lower() == "true" # Detect if file is video file_extension = Path(file.filename).suffix.lower() if file.filename else "" is_video = ( (file.content_type and file.content_type in VIDEO_TYPES) or file_extension in VIDEO_EXTENSIONS ) # Generate task ID task_id = str(uuid.uuid4()) # Save uploaded file file_extension = Path(file.filename).suffix or ".mp3" upload_path = UPLOAD_DIR / f"{task_id}{file_extension}" with open(upload_path, "wb") as f: content = await file.read() f.write(content) # Build anchors for temporal prompting anchors = None if start_time is not None and end_time is not None: anchors = [[["+", start_time, end_time]]] # Submit Celery task celery_task = separate_audio_task.apply_async( args=[ str(upload_path), description, mode, anchors, model_size, chunk_duration, use_float32_bool, is_video # New: flag for video processing ], task_id=task_id ) return SeparationResponse( task_id=task_id, status="pending", message="Task submitted successfully" ) @router.post("/batch", response_model=List[SeparationResponse]) async def create_batch_separation( file: UploadFile = File(...), descriptions: str = Form(...), # JSON array of descriptions mode: str = Form("extract") ): """ Create multiple separation tasks for the same audio file Useful for separating multiple stems at once """ import json try: desc_list = json.loads(descriptions) except json.JSONDecodeError: raise HTTPException(status_code=400, detail="Invalid descriptions format") # Save file once base_task_id = str(uuid.uuid4()) 
file_extension = Path(file.filename).suffix or ".mp3" upload_path = UPLOAD_DIR / f"{base_task_id}{file_extension}" with open(upload_path, "wb") as f: content = await file.read() f.write(content) responses = [] for i, desc in enumerate(desc_list): task_id = f"{base_task_id}-{i}" separate_audio_task.apply_async( args=[str(upload_path), desc, mode, None, "small"], task_id=task_id ) responses.append(SeparationResponse( task_id=task_id, status="pending", message=f"Task for '{desc}' submitted" )) return responses ================================================ FILE: backend/api/tasks.py ================================================ """ Tasks API - Task Status and Results """ from pathlib import Path from typing import Optional, List from fastapi import APIRouter, HTTPException from fastapi.responses import FileResponse from pydantic import BaseModel from celery.result import AsyncResult from workers.celery_app import celery_app router = APIRouter() OUTPUT_DIR = Path("outputs") class TaskStatus(BaseModel): task_id: str status: str # pending, processing, completed, failed progress: int # 0-100 message: Optional[str] = None result: Optional[dict] = None class TaskResult(BaseModel): original_url: str ghost_url: str # Separated target clean_url: str # Residual @router.get("/{task_id}", response_model=TaskStatus) async def get_task_status(task_id: str): """Get the status of a separation task""" result = AsyncResult(task_id, app=celery_app) if result.state == "PENDING": return TaskStatus( task_id=task_id, status="pending", progress=0, message="Task is waiting to be processed" ) elif result.state == "PROGRESS": info = result.info or {} return TaskStatus( task_id=task_id, status="processing", progress=info.get("progress", 0), message=info.get("message", "Processing...") ) elif result.state == "SUCCESS": return TaskStatus( task_id=task_id, status="completed", progress=100, message="Task completed successfully", result=result.result ) elif result.state == "FAILURE": return 
TaskStatus( task_id=task_id, status="failed", progress=0, message=str(result.info) ) else: return TaskStatus( task_id=task_id, status=result.state.lower(), progress=0, message=f"Task state: {result.state}" ) @router.get("/{task_id}/download/{file_type}") async def download_result(task_id: str, file_type: str): """ Download processed audio or video file - **file_type**: "original", "ghost", "clean", or "video" """ if file_type not in ["original", "ghost", "clean", "video"]: raise HTTPException(status_code=400, detail="Invalid file type") result = AsyncResult(task_id, app=celery_app) if result.state != "SUCCESS": raise HTTPException(status_code=404, detail="Task not completed") # Handle video file separately if file_type == "video": video_path = result.result.get("video_path") if not video_path: raise HTTPException(status_code=404, detail="No video file for this task") file_path = Path(video_path) if not file_path.exists(): raise HTTPException(status_code=404, detail="Video file not found") # Determine media type based on extension extension = file_path.suffix.lower() media_types = { ".mp4": "video/mp4", ".webm": "video/webm", ".mov": "video/quicktime", ".avi": "video/x-msvideo", ".mkv": "video/x-matroska" } media_type = media_types.get(extension, "video/mp4") return FileResponse( path=file_path, filename=f"{task_id}_video{extension}", media_type=media_type ) # Handle audio files file_path = Path(result.result.get(f"{file_type}_path", "")) if not file_path.exists(): raise HTTPException(status_code=404, detail="File not found") return FileResponse( path=file_path, filename=f"{task_id}_{file_type}.wav", media_type="audio/wav" ) @router.get("/{task_id}/download-video-with-audio/{audio_type}") async def download_video_with_audio(task_id: str, audio_type: str): """ Download video with merged audio track - **audio_type**: "original", "ghost", or "clean" """ import subprocess import tempfile import os if audio_type not in ["original", "ghost", "clean"]: raise 
HTTPException(status_code=400, detail="Invalid audio type. Use 'original', 'ghost', or 'clean'") result = AsyncResult(task_id, app=celery_app) if result.state != "SUCCESS": raise HTTPException(status_code=404, detail="Task not completed") # Get video path video_path = result.result.get("video_path") if not video_path: raise HTTPException(status_code=404, detail="No video file for this task") video_file = Path(video_path) if not video_file.exists(): raise HTTPException(status_code=404, detail="Video file not found") # Get audio path audio_path = Path(result.result.get(f"{audio_type}_path", "")) if not audio_path.exists(): raise HTTPException(status_code=404, detail=f"Audio file '{audio_type}' not found") # Create output file in the same directory as video output_dir = video_file.parent extension = video_file.suffix.lower() output_filename = f"{task_id}_{audio_type}_merged{extension}" output_path = output_dir / output_filename # Use FFmpeg to merge video and audio try: cmd = [ "ffmpeg", "-y", "-i", str(video_file), "-i", str(audio_path), "-c:v", "copy", # Copy video stream without re-encoding "-c:a", "aac", # Encode audio to AAC "-b:a", "192k", # Audio bitrate "-map", "0:v:0", # Use video from first input "-map", "1:a:0", # Use audio from second input "-shortest", # Match shortest stream str(output_path) ] subprocess.run(cmd, check=True, capture_output=True) except subprocess.CalledProcessError as e: raise HTTPException(status_code=500, detail=f"FFmpeg error: {e.stderr.decode()}") except FileNotFoundError: raise HTTPException(status_code=500, detail="FFmpeg not found. 
Please install FFmpeg.") if not output_path.exists(): raise HTTPException(status_code=500, detail="Failed to create merged video") # Determine media type media_types = { ".mp4": "video/mp4", ".webm": "video/webm", ".mov": "video/quicktime", ".avi": "video/x-msvideo", ".mkv": "video/x-matroska" } media_type = media_types.get(extension, "video/mp4") # Map audio type to display name audio_labels = { "original": "original", "ghost": "isolated", "clean": "without_isolated" } return FileResponse( path=output_path, filename=f"{task_id}_{audio_labels[audio_type]}_video{extension}", media_type=media_type ) @router.delete("/{task_id}") async def cancel_task(task_id: str): """Cancel a pending or running task""" result = AsyncResult(task_id, app=celery_app) result.revoke(terminate=True) return {"success": True, "message": "Task cancelled"} @router.get("/", response_model=List[TaskStatus]) async def list_recent_tasks(limit: int = 10): """List recent tasks (simplified - in production would use database)""" # Note: This is a simplified implementation # In production, you would store task metadata in a database return [] ================================================ FILE: backend/main.py ================================================ """ AudioGhost AI - FastAPI Backend """ import os from pathlib import Path from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles from api import auth, separate, tasks # Create necessary directories UPLOAD_DIR = Path("uploads") OUTPUT_DIR = Path("outputs") CHECKPOINTS_DIR = Path("../checkpoints") UPLOAD_DIR.mkdir(exist_ok=True) OUTPUT_DIR.mkdir(exist_ok=True) CHECKPOINTS_DIR.mkdir(exist_ok=True) app = FastAPI( title="AudioGhost AI", description="AI-Powered Audio Separation Tool", version="1.0.0" ) # CORS Configuration app.add_middleware( CORSMiddleware, allow_origins=["http://localhost:3000"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Mount static 
files for downloads app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") # Include routers app.include_router(auth.router, prefix="/api/auth", tags=["Authentication"]) app.include_router(separate.router, prefix="/api/separate", tags=["Separation"]) app.include_router(tasks.router, prefix="/api/tasks", tags=["Tasks"]) @app.get("/") async def root(): return { "name": "AudioGhost AI", "version": "1.0.0", "status": "running" } @app.get("/health") async def health(): return {"status": "healthy"} ================================================ FILE: backend/requirements.txt ================================================ # AudioGhost AI - Backend Dependencies # FastAPI Framework fastapi==0.115.6 uvicorn[standard]==0.34.0 python-multipart==0.0.19 # Task Queue celery==5.4.0 redis==5.2.1 # Media Processing pydub==0.25.1 # AI Dependencies (SAM Audio already installed from parent) # huggingface_hub is included in SAM Audio deps # Utilities python-dotenv==1.0.1 aiofiles==24.1.0 ================================================ FILE: backend/workers/__init__.py ================================================ """Workers Package""" ================================================ FILE: backend/workers/celery_app.py ================================================ """ Celery Application Configuration """ from celery import Celery celery_app = Celery( "audioghost", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0", include=["workers.tasks"] ) # Celery Configuration celery_app.conf.update( task_serializer="json", accept_content=["json"], result_serializer="json", timezone="UTC", enable_utc=True, task_track_started=True, task_time_limit=3600, # 1 hour max per task worker_prefetch_multiplier=1, # Process one task at a time (GPU memory) result_expires=86400, # Results expire after 24 hours ) ================================================ FILE: backend/workers/tasks.py ================================================ """ Celery Tasks - Audio 
Separation Workers With SAM Audio Lite optimization for low VRAM usage """ import os import sys import gc from pathlib import Path from typing import Optional, List from celery import current_task from workers.celery_app import celery_app # Add parent directory to path for SAM Audio imports sys.path.insert(0, str(Path(__file__).parent.parent.parent)) OUTPUT_DIR = Path("outputs") OUTPUT_DIR.mkdir(exist_ok=True) # Global model cache to avoid reloading _model_cache = {} _processor_cache = {} def update_progress(progress: int, message: str): """Update task progress""" current_task.update_state( state="PROGRESS", meta={"progress": progress, "message": message} ) def create_lite_model(model_name: str, hf_token: str = None): """ Create a memory-optimized SAM Audio model by removing unused components. Reduces VRAM usage from ~11GB to ~4-5GB by: - Replacing vision_encoder with a dummy - Disabling visual_ranker - Disabling text_ranker - Disabling span_predictor """ import torch from sam_audio import SAMAudio, SAMAudioProcessor print(f"Loading {model_name} (lite mode)...") # Load model if hf_token: model = SAMAudio.from_pretrained(model_name, token=hf_token) else: model = SAMAudio.from_pretrained(model_name) processor = SAMAudioProcessor.from_pretrained(model_name) print("Optimizing model for low VRAM...") # Get vision encoder dim before deleting vision_dim = model.vision_encoder.dim if hasattr(model.vision_encoder, 'dim') else 1024 # Delete heavy components del model.vision_encoder gc.collect() # Store the dim for _get_video_features model._vision_encoder_dim = vision_dim # Replace _get_video_features to not use vision_encoder def _get_video_features_lite(self, video, audio_features): B, T, _ = audio_features.shape return audio_features.new_zeros(B, self._vision_encoder_dim, T) import types model._get_video_features = types.MethodType(_get_video_features_lite, model) # Delete rankers if hasattr(model, 'visual_ranker') and model.visual_ranker is not None: del 
model.visual_ranker model.visual_ranker = None gc.collect() if hasattr(model, 'text_ranker') and model.text_ranker is not None: del model.text_ranker model.text_ranker = None gc.collect() # Delete span predictor if hasattr(model, 'span_predictor') and model.span_predictor is not None: del model.span_predictor model.span_predictor = None gc.collect() if hasattr(model, 'span_predictor_transform') and model.span_predictor_transform is not None: del model.span_predictor_transform model.span_predictor_transform = None gc.collect() # Force garbage collection gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() print("Model optimization complete!") return model, processor def get_or_load_lite_model(model_name: str, hf_token: str, device: str, dtype): """Get cached lite model or create it - only keeps ONE model in memory""" import torch # Include dtype in cache key to ensure correct model is loaded dtype_str = "bf16" if dtype == torch.bfloat16 else "fp32" cache_key = f"{model_name}_lite_{device}_{dtype_str}" print(f"[DEBUG] Looking for cached model with key: {cache_key}") print(f"[DEBUG] Current cache keys: {list(_model_cache.keys())}") if cache_key not in _model_cache: print(f"[DEBUG] Cache miss - creating new lite model") # IMPORTANT: Clear any existing models first to free memory if len(_model_cache) > 0: print(f"[DEBUG] Clearing {len(_model_cache)} existing model(s) from cache...") for old_key in list(_model_cache.keys()): del _model_cache[old_key] for old_key in list(_processor_cache.keys()): del _processor_cache[old_key] gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() print(f"[DEBUG] GPU Memory after clearing old models: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") model, processor = create_lite_model(model_name, hf_token) print(f"[DEBUG] Converting model to {device} with dtype {dtype}") model = model.eval().to(device, dtype) _model_cache[cache_key] = model _processor_cache[model_name] = processor if torch.cuda.is_available(): 
print(f"[DEBUG] GPU Memory after loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") else: print(f"[DEBUG] Cache hit - using existing model") return _model_cache[cache_key], _processor_cache[model_name] def cleanup_gpu_memory(): """Clean up GPU memory after task""" import torch if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() @celery_app.task(bind=True) def separate_audio_task( self, audio_path: str, description: str, mode: str = "extract", anchors: Optional[List] = None, model_size: str = "base", chunk_duration: float = 25.0, use_float32: bool = False, is_video: bool = False ): """ Separate audio using SAM Audio Lite (memory optimized) Args: audio_path: Path to input audio or video file description: Text prompt for separation mode: "extract" or "remove" anchors: Optional temporal anchors [["+", start, end], ...] model_size: Model size (small/base/large) chunk_duration: Audio chunk duration in seconds (5-60) use_float32: Use float32 precision for better quality is_video: If True, extract audio from video file first Returns: Dictionary with paths to output files """ import torch import torchaudio import time import subprocess import shutil from huggingface_hub import login task_id = self.request.id device = "cuda" if torch.cuda.is_available() else "cpu" video_path = None # Will be set if input is video # Debug: Show received parameter print(f"[DEBUG] use_float32 parameter received: {use_float32} (type: {type(use_float32).__name__})") print(f"[DEBUG] is_video parameter received: {is_video}") # Handle video files - extract audio using FFmpeg if is_video: update_progress(2, "Extracting audio from video...") video_path = Path(audio_path) # Copy video to output directory for later playback output_video_path = OUTPUT_DIR / f"{task_id}.video{video_path.suffix}" shutil.copy2(video_path, output_video_path) print(f"[DEBUG] Copied video to: {output_video_path}") # Extract audio from video using FFmpeg extracted_audio_path = OUTPUT_DIR / 
f"{task_id}.extracted.wav" ffmpeg_cmd = [ "ffmpeg", "-y", "-i", str(video_path), "-vn", # No video "-acodec", "pcm_s16le", # PCM 16-bit "-ar", "44100", # 44.1kHz sample rate "-ac", "1", # Mono str(extracted_audio_path) ] try: result = subprocess.run( ffmpeg_cmd, capture_output=True, text=True, check=True ) print(f"[DEBUG] FFmpeg audio extraction successful") except subprocess.CalledProcessError as e: raise Exception(f"FFmpeg audio extraction failed: {e.stderr}") # Use extracted audio for processing audio_path = str(extracted_audio_path) # Set precision based on use_float32 parameter if use_float32 or device == "cpu": dtype = torch.float32 print(f"[DEBUG] Using float32 precision (High Quality Mode)") else: dtype = torch.bfloat16 print(f"[DEBUG] Using bfloat16 precision (Memory Optimized)") # Start timing start_time = time.time() try: update_progress(5, "Initializing...") # Load HuggingFace token backend_dir = Path(__file__).parent.parent token_file = backend_dir / ".hf_token" if token_file.exists(): with open(token_file, "r") as f: hf_token = f.read().strip() login(token=hf_token) else: raise Exception("HuggingFace token not found. 
Please authenticate first.") # Select model based on size model_name = f"facebook/sam-audio-{model_size}" update_progress(10, f"Loading {model_name} (lite mode)...") # Clean up before loading cleanup_gpu_memory() # Load lite model (with caching) model, processor = get_or_load_lite_model(model_name, hf_token, device, dtype) update_progress(30, "Loading audio...") # Get sample rate sample_rate = processor.audio_sampling_rate # Load and preprocess audio audio, orig_sr = torchaudio.load(audio_path) if orig_sr != sample_rate: resampler = torchaudio.transforms.Resample(orig_sr, sample_rate) audio = resampler(audio) # Convert to mono if stereo if audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True) # Calculate audio duration audio_duration = audio.shape[1] / sample_rate print(f"[DEBUG] Audio duration: {audio_duration:.2f}s") # Chunking settings (from parameter, clamped to 5-60) CHUNK_DURATION = max(5.0, min(60.0, chunk_duration)) MAX_CHUNK_SAMPLES = int(sample_rate * CHUNK_DURATION) # Check if chunking is needed if audio.shape[1] > MAX_CHUNK_SAMPLES: print(f"[DEBUG] Audio is {audio_duration:.1f}s, using chunking ({CHUNK_DURATION}s chunks)") # Split audio into chunks audio_tensor = audio.squeeze(0).to(device, dtype) chunks = torch.split(audio_tensor, MAX_CHUNK_SAMPLES, dim=-1) total_chunks = len(chunks) out_target = [] out_residual = [] for i, chunk in enumerate(chunks): # Update progress chunk_progress = 30 + int((i / total_chunks) * 50) update_progress(chunk_progress, f"Processing chunk {i+1}/{total_chunks}...") # Skip very short chunks if chunk.shape[-1] < sample_rate: # Less than 1 second print(f"[DEBUG] Skipping chunk {i+1} (too short)") continue # Prepare batch for this chunk batch = processor( audios=[chunk.unsqueeze(0)], descriptions=[description] ).to(device) # Run separation with torch.inference_mode(): with torch.cuda.amp.autocast(enabled=(device == "cuda")): result = model.separate( batch, predict_spans=False, reranking_candidates=1 ) 
out_target.append(result.target[0].cpu()) out_residual.append(result.residual[0].cpu()) # Clean up chunk results del batch, result if torch.cuda.is_available(): torch.cuda.empty_cache() # Concatenate all chunks target_audio = torch.cat(out_target, dim=-1).clamp(-1, 1).float().unsqueeze(0) residual_audio = torch.cat(out_residual, dim=-1).clamp(-1, 1).float().unsqueeze(0) del out_target, out_residual, chunks, audio_tensor else: print(f"[DEBUG] Audio is {audio_duration:.1f}s, processing as single batch") update_progress(50, "Running separation...") # Process entire audio at once batch = processor( audios=[audio_path], descriptions=[description] ).to(device) # Run separation with torch.inference_mode(): with torch.cuda.amp.autocast(enabled=(device == "cuda")): result = model.separate( batch, predict_spans=False, reranking_candidates=1 ) target_audio = result.target[0].float().unsqueeze(0).cpu() residual_audio = result.residual[0].float().unsqueeze(0).cpu() del batch, result update_progress(80, "Saving results...") # Output paths output_base = OUTPUT_DIR / task_id original_path = output_base.with_suffix(".original.wav") ghost_path = output_base.with_suffix(".ghost.wav") clean_path = output_base.with_suffix(".clean.wav") # Save original audio torchaudio.save(str(original_path), audio.cpu(), sample_rate) # Save separated audio if mode == "extract": torchaudio.save(str(ghost_path), target_audio, sample_rate) torchaudio.save(str(clean_path), residual_audio, sample_rate) else: torchaudio.save(str(ghost_path), target_audio, sample_rate) torchaudio.save(str(clean_path), residual_audio, sample_rate) update_progress(100, "Complete!") # Aggressive cleanup print(f"[DEBUG] Cleaning up GPU memory...") del target_audio, residual_audio, audio gc.collect() cleanup_gpu_memory() if torch.cuda.is_available(): print(f"[DEBUG] GPU Memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") # Calculate processing time processing_time = time.time() - start_time print(f"[DEBUG] 
Processing completed in {processing_time:.2f}s for {audio_duration:.2f}s audio") result = { "original_path": str(original_path), "ghost_path": str(ghost_path), "clean_path": str(clean_path), "description": description, "mode": mode, "audio_duration": round(audio_duration, 2), "processing_time": round(processing_time, 2), "model_size": model_size } # Add video path if this was a video file if video_path is not None: output_video_path = OUTPUT_DIR / f"{task_id}.video{video_path.suffix}" result["video_path"] = str(output_video_path) result["is_video"] = True return result except Exception as e: gc.collect() cleanup_gpu_memory() raise Exception(f"Separation failed: {str(e)}") @celery_app.task(bind=True) def match_pattern_task( self, audio_path: str, sample_path: str, threshold: float = 0.85, model_size: str = "base" ): """ Find and remove sounds similar to a sample Args: audio_path: Path to input audio file sample_path: Path to sample audio file threshold: Similarity threshold (0-1) model_size: Model size (small/base/large) Returns: Dictionary with paths to output files and matched segments """ # TODO: Implement pattern matching with CLAP embeddings # This is a placeholder for MVP v1.0 update_progress(50, "Pattern matching not yet implemented in MVP") return { "status": "not_implemented", "message": "Pattern matching will be available in v1.1" } ================================================ FILE: docker-compose.yml ================================================ version: '3.8' services: redis: image: redis:alpine container_name: audioghost-redis ports: - "6379:6379" volumes: - redis_data:/data restart: unless-stopped volumes: redis_data: ================================================ FILE: frontend/.gitignore ================================================ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
# dependencies /node_modules /.pnp .pnp.* .yarn/* !.yarn/patches !.yarn/plugins !.yarn/releases !.yarn/versions # testing /coverage # next.js /.next/ /out/ # production /build # misc .DS_Store *.pem # debug npm-debug.log* yarn-debug.log* yarn-error.log* .pnpm-debug.log* # env files (can opt-in for committing if needed) .env* # vercel .vercel # typescript *.tsbuildinfo next-env.d.ts ================================================ FILE: frontend/README.md ================================================ This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app). ## Getting Started First, run the development server: ```bash npm run dev # or yarn dev # or pnpm dev # or bun dev ``` Open [http://localhost:3000](http://localhost:3000) with your browser to see the result. You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file. This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel. ## Learn More To learn more about Next.js, take a look at the following resources: - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API. - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial. You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome! ## Deploy on Vercel The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js. Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details. 
================================================ FILE: frontend/eslint.config.mjs ================================================ import { defineConfig, globalIgnores } from "eslint/config"; import nextVitals from "eslint-config-next/core-web-vitals"; import nextTs from "eslint-config-next/typescript"; const eslintConfig = defineConfig([ ...nextVitals, ...nextTs, // Override default ignores of eslint-config-next. globalIgnores([ // Default ignores of eslint-config-next: ".next/**", "out/**", "build/**", "next-env.d.ts", ]), ]); export default eslintConfig; ================================================ FILE: frontend/next.config.ts ================================================ import type { NextConfig } from "next"; const nextConfig: NextConfig = { /* config options here */ }; export default nextConfig; ================================================ FILE: frontend/package.json ================================================ { "name": "frontend", "version": "0.1.0", "private": true, "scripts": { "dev": "next dev", "build": "next build", "start": "next start", "lint": "eslint" }, "dependencies": { "axios": "^1.13.2", "lucide-react": "^0.562.0", "next": "16.1.0", "react": "19.2.3", "react-dom": "19.2.3", "wavesurfer.js": "^7.12.1" }, "devDependencies": { "@tailwindcss/postcss": "^4", "@types/node": "^20", "@types/react": "^19", "@types/react-dom": "^19", "eslint": "^9", "eslint-config-next": "16.1.0", "tailwindcss": "^4", "typescript": "^5" } } ================================================ FILE: frontend/postcss.config.mjs ================================================ const config = { plugins: { "@tailwindcss/postcss": {}, }, }; export default config; ================================================ FILE: frontend/src/app/globals.css ================================================ @import "tailwindcss"; :root { /* AudioGhost Brand Colors */ --ghost-primary: #8B5CF6; --ghost-secondary: #06B6D4; --ghost-accent: #F472B6; --ghost-success: #10B981; 
--ghost-warning: #F59E0B; --ghost-error: #EF4444; /* Dark Theme */ --bg-primary: #0A0A0F; --bg-secondary: #12121A; --bg-tertiary: #1A1A25; --bg-card: #16161F; --bg-hover: #1E1E2A; /* Text Colors */ --text-primary: #FFFFFF; --text-secondary: #A1A1AA; --text-muted: #71717A; /* Glassmorphism */ --glass-bg: rgba(22, 22, 31, 0.8); --glass-border: rgba(139, 92, 246, 0.2); } * { box-sizing: border-box; padding: 0; margin: 0; } html { scroll-behavior: smooth; } body { min-height: 100vh; background: var(--bg-primary); color: var(--text-primary); font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; } body.light-mode { --bg-primary: #FAFAFA; --bg-secondary: #F5F5F5; --bg-tertiary: #EBEBEB; --bg-card: #FFFFFF; --bg-hover: #F0F0F0; --text-primary: #18181B; --text-secondary: #52525B; --text-muted: #A1A1AA; --glass-bg: rgba(255, 255, 255, 0.9); --glass-border: rgba(139, 92, 246, 0.3); } /* Custom Scrollbar */ ::-webkit-scrollbar { width: 8px; height: 8px; } ::-webkit-scrollbar-track { background: var(--bg-secondary); } ::-webkit-scrollbar-thumb { background: var(--ghost-primary); border-radius: 4px; } ::-webkit-scrollbar-thumb:hover { background: #7C3AED; } /* Glassmorphism Card */ .glass-card { background: var(--glass-bg); backdrop-filter: blur(12px); -webkit-backdrop-filter: blur(12px); border: 1px solid var(--glass-border); border-radius: 16px; } /* Gradient Text */ .gradient-text { background: linear-gradient(135deg, var(--ghost-primary) 0%, var(--ghost-secondary) 50%, var(--ghost-accent) 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; } /* Glow Effects */ .glow-primary { box-shadow: 0 0 20px rgba(139, 92, 246, 0.3); } .glow-secondary { box-shadow: 0 0 20px rgba(6, 182, 212, 0.3); } /* Button Styles */ .btn-primary { background: linear-gradient(135deg, var(--ghost-primary), #7C3AED); color: white; padding: 12px 24px; border-radius: 12px; font-weight: 600; transition: all 0.3s ease; border: 
none; cursor: pointer; display: inline-flex; align-items: center; justify-content: center; } .btn-primary:hover { transform: translateY(-2px); box-shadow: 0 8px 25px rgba(139, 92, 246, 0.4); } .btn-primary:active { transform: translateY(0); } .btn-secondary { background: var(--bg-tertiary); color: var(--text-primary); padding: 12px 24px; border-radius: 12px; font-weight: 600; transition: all 0.3s ease; border: 1px solid var(--glass-border); cursor: pointer; display: inline-flex; align-items: center; justify-content: center; } .btn-secondary:hover { background: var(--bg-hover); border-color: var(--ghost-primary); } /* Input Styles */ .input-ghost { background: var(--bg-tertiary); border: 1px solid var(--glass-border); border-radius: 12px; padding: 12px 16px; color: var(--text-primary); transition: all 0.3s ease; outline: none; width: 100%; } .input-ghost:focus { border-color: var(--ghost-primary); box-shadow: 0 0 0 3px rgba(139, 92, 246, 0.2); } .input-ghost::placeholder { color: var(--text-muted); } /* Waveform Container */ .waveform-container { background: linear-gradient(180deg, var(--bg-secondary) 0%, var(--bg-tertiary) 100%); border-radius: 16px; padding: 24px; border: 1px solid var(--glass-border); } /* Progress Bar */ .progress-bar { height: 6px; background: var(--bg-tertiary); border-radius: 3px; overflow: hidden; } .progress-bar-fill { height: 100%; background: linear-gradient(90deg, var(--ghost-primary), var(--ghost-secondary)); border-radius: 3px; transition: width 0.3s ease; } /* Animations */ @keyframes pulse-glow { 0%, 100% { box-shadow: 0 0 20px rgba(139, 92, 246, 0.3); } 50% { box-shadow: 0 0 40px rgba(139, 92, 246, 0.5); } } @keyframes float { 0%, 100% { transform: translateY(0); } 50% { transform: translateY(-10px); } } @keyframes shimmer { 0% { background-position: -200% 0; } 100% { background-position: 200% 0; } } .animate-pulse-glow { animation: pulse-glow 2s ease-in-out infinite; } .animate-float { animation: float 3s ease-in-out infinite; } 
.shimmer { background: linear-gradient(90deg, var(--bg-tertiary) 25%, var(--bg-hover) 50%, var(--bg-tertiary) 75%); background-size: 200% 100%; animation: shimmer 1.5s infinite; } /* Stem Mixer Track */ .stem-track { background: var(--bg-secondary); border-radius: 12px; padding: 16px; border: 1px solid var(--glass-border); transition: all 0.3s ease; } .stem-track:hover { border-color: var(--ghost-primary); } .stem-track.original { border-left: 3px solid var(--ghost-primary); } .stem-track.ghost { border-left: 3px solid var(--ghost-accent); } .stem-track.clean { border-left: 3px solid var(--ghost-success); } /* Quick Action Tags */ .quick-tag { display: inline-flex; align-items: center; gap: 6px; padding: 8px 16px; background: var(--bg-tertiary); border: 1px solid var(--glass-border); border-radius: 20px; font-size: 14px; color: var(--text-secondary); cursor: pointer; transition: all 0.3s ease; } .quick-tag:hover { background: var(--bg-hover); color: var(--text-primary); border-color: var(--ghost-primary); } .quick-tag.selected { background: linear-gradient(135deg, var(--ghost-primary), #7C3AED); color: white; border-color: transparent; } /* Upload Zone */ .upload-zone { border: 2px dashed var(--glass-border); border-radius: 16px; padding: 64px 48px; text-align: center; transition: all 0.3s ease; cursor: pointer; background: var(--bg-secondary); } .upload-zone:hover { border-color: var(--ghost-primary); background: rgba(139, 92, 246, 0.05); } .upload-zone.dragover { border-color: var(--ghost-primary); background: rgba(139, 92, 246, 0.1); box-shadow: 0 0 30px rgba(139, 92, 246, 0.2); } /* Responsive */ @media (max-width: 768px) { .upload-zone { padding: 48px 24px; } } ================================================ FILE: frontend/src/app/layout.tsx ================================================ import type { Metadata } from "next"; import { Inter } from "next/font/google"; import "./globals.css"; const inter = Inter({ subsets: ["latin"], variable: "--font-inter", 
}); export const metadata: Metadata = { title: "AudioGhost AI - AI-Powered Audio Separation", description: "Separate any sound from audio using natural language. Remove vocals, extract instruments, eliminate background noise with state-of-the-art AI.", keywords: ["audio separation", "AI", "vocal removal", "stem separation", "audio editing"], }; export default function RootLayout({ children, }: Readonly<{ children: React.ReactNode; }>) { return ( {children} ); } ================================================ FILE: frontend/src/app/page.tsx ================================================ "use client"; import { useState, useEffect } from "react"; import Header from "@/components/Header"; import AuthModal from "@/components/AuthModal"; import AudioUploader from "@/components/AudioUploader"; import WaveformEditor from "@/components/WaveformEditor"; import SeparationPanel from "@/components/SeparationPanel"; import ProgressTracker from "@/components/ProgressTracker"; import StemMixer from "@/components/StemMixer"; import VideoStemMixer from "@/components/VideoStemMixer"; interface TaskResult { original_path: string; ghost_path: string; clean_path: string; description: string; mode: string; audio_duration?: number; processing_time?: number; model_size?: string; video_path?: string; is_video?: boolean; } interface TaskState { taskId: string | null; status: "idle" | "pending" | "processing" | "completed" | "failed"; progress: number; message: string; result: TaskResult | null; } export default function Home() { const [isAuthenticated, setIsAuthenticated] = useState(false); const [showAuthModal, setShowAuthModal] = useState(false); const [audioFile, setAudioFile] = useState(null); const [audioUrl, setAudioUrl] = useState(null); const [isVideo, setIsVideo] = useState(false); const [isDarkMode, setIsDarkMode] = useState(true); const [selectedRegion, setSelectedRegion] = useState<{ start: number; end: number } | null>(null); // Persistent separation settings (won't reset on 
"New") const [separationSettings, setSeparationSettings] = useState({ modelSize: "base" as "small" | "base" | "large", chunkDuration: 25, useFloat32: false, }); const [task, setTask] = useState({ taskId: null, status: "idle", progress: 0, message: "", result: null, }); // Check auth status on mount useEffect(() => { checkAuthStatus(); }, []); // Toggle theme useEffect(() => { document.body.classList.toggle("light-mode", !isDarkMode); }, [isDarkMode]); const checkAuthStatus = async () => { try { const res = await fetch("http://localhost:8000/api/auth/status"); const data = await res.json(); setIsAuthenticated(data.authenticated); } catch (error) { console.error("Failed to check auth status:", error); } }; const handleFileUpload = (file: File) => { setAudioFile(file); const url = URL.createObjectURL(file); setAudioUrl(url); // Detect if file is video const isVideoFile = file.type.startsWith("video/") || /\.(mp4|webm|mov|avi|mkv)$/i.test(file.name); setIsVideo(isVideoFile); // Reset task state setTask({ taskId: null, status: "idle", progress: 0, message: "", result: null, }); }; const handleReset = () => { // Clean up the object URL to free memory if (audioUrl) { URL.revokeObjectURL(audioUrl); } setAudioFile(null); setAudioUrl(null); setIsVideo(false); setSelectedRegion(null); setTask({ taskId: null, status: "idle", progress: 0, message: "", result: null, }); }; const handleSeparation = async ( description: string, mode: "extract" | "remove", modelSize: string = "base", chunkDuration: number = 25, useFloat32: boolean = false ) => { if (!audioFile) return; const formData = new FormData(); formData.append("file", audioFile); formData.append("description", description); formData.append("mode", mode); formData.append("model_size", modelSize); formData.append("chunk_duration", chunkDuration.toString()); formData.append("use_float32", useFloat32.toString()); if (selectedRegion) { formData.append("start_time", selectedRegion.start.toString()); formData.append("end_time", 
selectedRegion.end.toString()); } try { const res = await fetch("http://localhost:8000/api/separate/", { method: "POST", body: formData, }); const data = await res.json(); setTask({ taskId: data.task_id, status: "pending", progress: 0, message: "Task submitted...", result: null, }); // Start polling for status pollTaskStatus(data.task_id); } catch (error) { console.error("Failed to submit separation task:", error); setTask(prev => ({ ...prev, status: "failed", message: "Failed to submit task", })); } }; const pollTaskStatus = async (taskId: string) => { const poll = async () => { try { const res = await fetch(`http://localhost:8000/api/tasks/${taskId}`); const data = await res.json(); setTask({ taskId, status: data.status, progress: data.progress, message: data.message || "", result: data.result || null, }); if (data.status !== "completed" && data.status !== "failed") { setTimeout(poll, 1000); } } catch (error) { console.error("Failed to poll task status:", error); } }; poll(); }; return (
setShowAuthModal(true)} isDarkMode={isDarkMode} onThemeToggle={() => setIsDarkMode(!isDarkMode)} onLogoClick={handleReset} />
{/* Hero Section */} {!audioUrl && (

AudioGhost{" "} AI

AI-Powered Object-Oriented Audio Separation

Describe the sound you want to extract or remove using natural language

)} {/* Main Content */}
{/* Upload Zone */} {!audioUrl && ( )} {/* Waveform Editor (Audio) or Video Preview - Hide when results are shown */} {audioUrl && task.status !== "completed" && ( <> {/* Section Header with Upload Button */}

{isVideo ? "Video Preview" : "Audio Editor"}

{/* Show Video Player or Waveform based on file type */} {isVideo ? (
) : ( )} )} {/* Separation Controls */} {audioUrl && task.status === "idle" && ( setShowAuthModal(true)} hasRegion={!!selectedRegion} settings={separationSettings} onSettingsChange={setSeparationSettings} /> )} {/* Progress Tracker */} {(task.status === "pending" || task.status === "processing") && ( )} {/* Results - Stem Mixer (Audio) or Video Stem Mixer */} {task.status === "completed" && task.result && task.taskId && ( task.result.is_video ? ( { setTask({ taskId: null, status: "idle", progress: 0, message: "", result: null, }); }} /> ) : ( { setTask({ taskId: null, status: "idle", progress: 0, message: "", result: null, }); }} /> ) )} {/* Error State */} {task.status === "failed" && (
❌ Separation Failed

{task.message}

)}
{/* Auth Modal */} {showAuthModal && ( setShowAuthModal(false)} onSuccess={() => { setIsAuthenticated(true); setShowAuthModal(false); }} /> )}
); } ================================================ FILE: frontend/src/components/AudioUploader.tsx ================================================ "use client"; import { useState, useRef, useCallback } from "react"; import { Upload, Music, Video } from "lucide-react"; interface AudioUploaderProps { onFileUpload: (file: File) => void; } export default function AudioUploader({ onFileUpload }: AudioUploaderProps) { const [isDragOver, setIsDragOver] = useState(false); const fileInputRef = useRef(null); const handleDragOver = useCallback((e: React.DragEvent) => { e.preventDefault(); setIsDragOver(true); }, []); const handleDragLeave = useCallback((e: React.DragEvent) => { e.preventDefault(); setIsDragOver(false); }, []); const handleDrop = useCallback((e: React.DragEvent) => { e.preventDefault(); setIsDragOver(false); const file = e.dataTransfer.files[0]; if (file && isMediaFile(file)) { onFileUpload(file); } }, [onFileUpload]); const handleFileSelect = (e: React.ChangeEvent) => { const file = e.target.files?.[0]; if (file && isMediaFile(file)) { onFileUpload(file); } }; const isMediaFile = (file: File) => { // Accept audio files if (file.type.startsWith("audio/") || /\.(mp3|wav|flac|ogg|m4a|aac)$/i.test(file.name)) { return true; } // Accept video files if (file.type.startsWith("video/") || /\.(mp4|webm|mov|avi|mkv)$/i.test(file.name)) { return true; } return false; }; return (
fileInputRef.current?.click()} >
{/* Icon */}
{isDragOver ? ( ) : ( )}
{/* Text */}

{isDragOver ? "Drop your file here" : "Upload Audio or Video"}

Drag & drop or click to browse

{/* Supported formats */}
{["MP3", "WAV", "FLAC", "MP4", "WebM", "MOV"].map((format) => ( {format} ))}
); } ================================================ FILE: frontend/src/components/AuthModal.tsx ================================================ "use client"; import { useState } from "react"; import { X, Key, ExternalLink, Loader2, CheckCircle } from "lucide-react"; interface AuthModalProps { onClose: () => void; onSuccess: () => void; } export default function AuthModal({ onClose, onSuccess }: AuthModalProps) { const [token, setToken] = useState(""); const [isLoading, setIsLoading] = useState(false); const [error, setError] = useState(""); const [step, setStep] = useState<"input" | "success">("input"); const handleSubmit = async (e: React.FormEvent) => { e.preventDefault(); setIsLoading(true); setError(""); try { const res = await fetch("http://localhost:8000/api/auth/login", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ token }), }); const data = await res.json(); if (!res.ok) { throw new Error(data.detail || "Authentication failed"); } setStep("success"); setTimeout(() => { onSuccess(); }, 1500); } catch (err) { setError(err instanceof Error ? err.message : "Authentication failed"); } finally { setIsLoading(false); } }; return (
e.stopPropagation()} > {/* Close Button */} {step === "input" ? ( <> {/* Header */}

Connect HuggingFace

Enter your HuggingFace token to access SAM Audio models

{/* Instructions */}

How to get your token:

  1. 1. Request access to{" "} SAM Audio on HuggingFace
  2. 2. Create an{" "} access token
  3. 3. Paste the token below
{/* Form */}
setToken(e.target.value)} placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxx" className="input-ghost font-mono text-sm" disabled={isLoading} />
{error && (
{error}
)}
) : ( /* Success State */

Connected!

You can now use SAM Audio models

)}
); } ================================================ FILE: frontend/src/components/Header.tsx ================================================ "use client"; import { Sun, Moon, Ghost, User, LogOut } from "lucide-react"; interface HeaderProps { isAuthenticated: boolean; onAuthClick: () => void; isDarkMode: boolean; onThemeToggle: () => void; onLogoClick?: () => void; } export default function Header({ isAuthenticated, onAuthClick, isDarkMode, onThemeToggle, onLogoClick }: HeaderProps) { return (
{/* Logo - Clickable */}
AudioGhost Logo

AudioGhost

v1.0 MVP

{/* Actions */}
{/* Theme Toggle */} {/* Auth Button */} {isAuthenticated ? (
Connected
) : ( )}
); } ================================================ FILE: frontend/src/components/ProgressTracker.tsx ================================================ "use client"; import { Ghost, Loader2, CheckCircle2 } from "lucide-react"; interface ProgressTrackerProps { status: "pending" | "processing"; progress: number; message: string; } export default function ProgressTracker({ status, progress, message }: ProgressTrackerProps) { const steps = [ { step: "Loading model", threshold: 10 }, { step: "Processing audio", threshold: 30 }, { step: "Running separation", threshold: 50 }, { step: "Saving results", threshold: 80 }, ]; return (
{/* Icon */}
{status === "pending" ? ( ) : ( )}
{/* Title */}

{status === "pending" ? "Waiting in Queue..." : "Processing Audio..."}

{/* Message */}

{message}

{/* Progress Bar */}
Progress {progress}%
{/* Steps */}
{steps.map(({ step, threshold }, index) => { const isComplete = progress >= threshold; const isActive = !isComplete && progress >= threshold - 20; return (
{/* Status Icon */}
{isComplete ? ( ) : isActive ? ( ) : null}
{/* Step Label */} {step}
); })}
); } ================================================ FILE: frontend/src/components/SeparationPanel.tsx ================================================ "use client"; import { useState } from "react"; import { Mic, Music, Volume2, Sparkles, Clock, AlertCircle, Cpu, Search, Sliders, Zap } from "lucide-react"; interface SeparationSettings { modelSize: "small" | "base" | "large"; chunkDuration: number; useFloat32: boolean; } interface SeparationPanelProps { onSeparate: (description: string, mode: "extract" | "remove", modelSize: string, chunkDuration: number, useFloat32: boolean) => void; isAuthenticated: boolean; onAuthRequired: () => void; hasRegion: boolean; // Persistent settings from parent settings: SeparationSettings; onSettingsChange: (settings: SeparationSettings) => void; } const QUICK_PROMPTS = [ { icon: Mic, label: "Voice", prompt: "singing voice", color: "#8B5CF6" }, { icon: Music, label: "Music", prompt: "background music", color: "#06B6D4" }, { icon: Volume2, label: "Drums", prompt: "drums and percussion", color: "#F472B6" }, { icon: Sparkles, label: "Guitar", prompt: "acoustic guitar", color: "#10B981" }, { icon: Volume2, label: "Bass", prompt: "bass", color: "#F59E0B" }, { icon: Volume2, label: "Piano", prompt: "piano", color: "#EF4444" }, ]; const MODEL_OPTIONS = [ { value: "small", label: "Small", vram: "~6GB", vramFp32: "~9GB", speed: "Fast" }, { value: "base", label: "Base", vram: "~7GB", vramFp32: "~10GB", speed: "Balanced" }, { value: "large", label: "Large", vram: "~10GB", vramFp32: "~13GB", speed: "Best" }, ] as const; export default function SeparationPanel({ onSeparate, isAuthenticated, onAuthRequired, hasRegion, settings, onSettingsChange }: SeparationPanelProps) { // Only prompt and mode are local (reset each time) const [customPrompt, setCustomPrompt] = useState(""); const [selectedPrompt, setSelectedPrompt] = useState(null); const [mode, setMode] = useState<"extract" | "remove">("extract"); // Destructure settings for easier access const 
{ modelSize, chunkDuration, useFloat32 } = settings; // Helper to update a single setting const updateSetting = (key: K, value: SeparationSettings[K]) => { onSettingsChange({ ...settings, [key]: value }); }; const handleQuickSelect = (prompt: string) => { setSelectedPrompt(prompt); setCustomPrompt(prompt); }; const handleSeparate = () => { if (!isAuthenticated) { onAuthRequired(); return; } const prompt = customPrompt || selectedPrompt; if (!prompt) return; onSeparate(prompt, mode, modelSize, chunkDuration, useFloat32); }; const activePrompt = customPrompt || selectedPrompt; return (
{/* Header */}

Separation Settings

{hasRegion && (
Temporal Lock
)}
{/* ============================================ */} {/* MAIN INPUT - Describe the sound (PROMINENT) */} {/* ============================================ */}
{ setCustomPrompt(e.target.value); setSelectedPrompt(null); }} placeholder="e.g., singing voice, drums, police siren, crowd noise, a dog barking..." style={{ width: "100%", padding: "16px 18px", borderRadius: "12px", border: "2px solid var(--ghost-primary)", background: "var(--bg-primary)", color: "var(--text-primary)", fontSize: "1rem", fontWeight: 500, outline: "none", boxShadow: "0 4px 15px rgba(168, 85, 247, 0.15)" }} /> {/* Quick Select Tags */}
{QUICK_PROMPTS.map(({ icon: Icon, label, prompt, color }) => ( ))}
{/* Settings Grid */}
{/* Mode Toggle */}
{/* Model Selector */}
{MODEL_OPTIONS.map(({ value, label, vram, vramFp32 }) => ( ))}
{/* Float32 Precision Toggle */}
High Quality Mode (float32)
Better separation quality, +2-3GB VRAM
{/* Chunk Duration Slider */}
{chunkDuration}s
updateSetting("chunkDuration", Number(e.target.value))} style={{ width: "100%", height: "6px", borderRadius: "3px", background: `linear-gradient(to right, var(--ghost-primary) ${((chunkDuration - 5) / 55) * 100}%, var(--bg-secondary) ${((chunkDuration - 5) / 55) * 100}%)`, cursor: "pointer", appearance: "none", outline: "none" }} />
5s (Low VRAM) 60s (Fast)

⚡ Smaller chunks = Less VRAM usage, but slower processing & may affect quality at boundaries

{/* Auth Warning */} {!isAuthenticated && (

Connect HuggingFace to continue

)} {/* Action Button */}
); } ================================================ FILE: frontend/src/components/StemMixer.tsx ================================================ "use client"; import { useState, useRef, useEffect, useCallback } from "react"; import { Play, Pause, Download, Volume2, VolumeX, RefreshCw, Ghost, Leaf, SkipBack, Music, type LucideIcon } from "lucide-react"; import WaveSurfer from "wavesurfer.js"; interface StemMixerProps { taskId: string; description: string; onNewSeparation: () => void; onUploadNew?: () => void; audioDuration?: number; processingTime?: number; modelSize?: string; } interface Track { id: "ghost" | "clean"; label: string; icon: LucideIcon; color: string; waveColor: string; } const TRACKS: Track[] = [ { id: "ghost", label: "Isolated Sound", icon: Ghost, color: "#F472B6", waveColor: "#F472B6" }, { id: "clean", label: "Without Isolated Sound", icon: Leaf, color: "#60A5FA", waveColor: "#60A5FA" }, ]; export default function StemMixer({ taskId, description, onNewSeparation, onUploadNew, audioDuration, processingTime, modelSize }: StemMixerProps) { const [isPlaying, setIsPlaying] = useState(false); const [currentTime, setCurrentTime] = useState(0); const [duration, setDuration] = useState(0); const [muted, setMuted] = useState>({ ghost: false, clean: false, }); const [isReady, setIsReady] = useState>({ ghost: false, clean: false, }); const wavesurferRefs = useRef>({}); const containerRefs = useRef>({}); const isSeeking = useRef(false); const getAudioUrl = (trackId: string) => { return `http://localhost:8000/api/tasks/${taskId}/download/${trackId}`; }; useEffect(() => { const initWaveSurfers = async () => { for (const track of TRACKS) { const container = containerRefs.current[track.id]; if (!container) continue; if (wavesurferRefs.current[track.id]) { wavesurferRefs.current[track.id]?.destroy(); } const ws = WaveSurfer.create({ container, waveColor: `${track.waveColor}40`, progressColor: track.waveColor, cursorColor: "#ffffff", cursorWidth: 1, barWidth: 2, 
barGap: 2, barRadius: 2, height: 48, normalize: true, // All tracks are seekable - clicking any will sync all others interact: true, hideScrollbar: true, }); ws.load(getAudioUrl(track.id)); ws.on("ready", () => { setIsReady(prev => ({ ...prev, [track.id]: true })); if (track.id === "ghost") { setDuration(ws.getDuration()); } ws.setMuted(muted[track.id]); }); ws.on("audioprocess", () => { if (!isSeeking.current && track.id === "ghost") { setCurrentTime(ws.getCurrentTime()); } }); ws.on("finish", () => { // Only trigger finish logic from the "ghost" track (master) // to prevent multiple finish events causing desync if (track.id === "ghost") { setIsPlaying(false); setCurrentTime(0); // Seek all tracks back to start synchronized Object.values(wavesurferRefs.current).forEach(w => { if (w) { w.pause(); w.seekTo(0); } }); } }); // Sync seeking across all tracks - when ANY track is seeked ws.on("seeking", () => { if (isSeeking.current) return; // Prevent recursive seeking isSeeking.current = true; const progress = ws.getCurrentTime() / ws.getDuration(); // Sync all OTHER tracks to the same position Object.entries(wavesurferRefs.current).forEach(([id, w]) => { if (w && id !== track.id) { w.seekTo(progress); } }); setCurrentTime(ws.getCurrentTime()); // Reset seeking flag after a short delay setTimeout(() => { isSeeking.current = false; }, 50); }); wavesurferRefs.current[track.id] = ws; } }; initWaveSurfers(); return () => { // Use setTimeout to avoid AbortError when component unmounts during loading const refs = { ...wavesurferRefs.current }; setTimeout(() => { Object.values(refs).forEach(ws => { if (ws) { try { ws.destroy(); } catch { /* ignore AbortError */ } } }); }, 0); }; }, [taskId]); // Continuous sync effect - keep all tracks aligned to original during playback useEffect(() => { let animationFrameId: number; const syncInterval = 100; // Sync every 100ms let lastSync = 0; const syncTracks = (timestamp: number) => { if (isPlaying && timestamp - lastSync > 
syncInterval) { lastSync = timestamp; const originalWs = wavesurferRefs.current["original"]; if (originalWs) { const masterTime = originalWs.getCurrentTime(); const masterDuration = originalWs.getDuration(); const progress = masterTime / masterDuration; // Check if original finished if (masterTime >= masterDuration - 0.1) { // Stop all tracks Object.values(wavesurferRefs.current).forEach(ws => ws?.pause()); Object.values(wavesurferRefs.current).forEach(ws => ws?.seekTo(0)); setIsPlaying(false); setCurrentTime(0); return; } // Sync other tracks to master Object.entries(wavesurferRefs.current).forEach(([id, ws]) => { if (ws && id !== "original") { const trackTime = ws.getCurrentTime(); // Only sync if drift is more than 0.05 seconds if (Math.abs(trackTime - masterTime) > 0.05) { ws.seekTo(progress); } } }); setCurrentTime(masterTime); } } animationFrameId = requestAnimationFrame(syncTracks); }; if (isPlaying) { animationFrameId = requestAnimationFrame(syncTracks); } return () => { if (animationFrameId) { cancelAnimationFrame(animationFrameId); } }; }, [isPlaying]); // Sync play/pause across all tracks const togglePlayAll = useCallback(() => { const allReady = Object.values(isReady).every(r => r); if (!allReady) return; if (isPlaying) { // Pause all Object.values(wavesurferRefs.current).forEach(ws => ws?.pause()); setIsPlaying(false); } else { // First sync all tracks to the same position const originalWs = wavesurferRefs.current["original"]; if (originalWs) { const progress = originalWs.getCurrentTime() / originalWs.getDuration(); Object.entries(wavesurferRefs.current).forEach(([id, ws]) => { if (ws && id !== "original") { ws.seekTo(progress); } }); } // Then play all together Object.values(wavesurferRefs.current).forEach(ws => ws?.play()); setIsPlaying(true); } }, [isPlaying, isReady]); const resetToStart = useCallback(() => { Object.values(wavesurferRefs.current).forEach(ws => { if (ws) { ws.pause(); ws.seekTo(0); } }); setIsPlaying(false); setCurrentTime(0); }, 
[]); const toggleMute = useCallback((trackId: string) => { const ws = wavesurferRefs.current[trackId]; if (ws) { const newMuted = !muted[trackId]; ws.setMuted(newMuted); setMuted(prev => ({ ...prev, [trackId]: newMuted })); } }, [muted]); const handleSeek = useCallback((e: React.MouseEvent) => { const rect = e.currentTarget.getBoundingClientRect(); const progress = Math.max(0, Math.min(1, (e.clientX - rect.left) / rect.width)); isSeeking.current = true; // Seek all tracks to the same position Object.values(wavesurferRefs.current).forEach(ws => { if (ws) ws.seekTo(progress); }); setCurrentTime(progress * duration); isSeeking.current = false; }, [duration]); const downloadTrack = (trackId: string, label: string) => { const link = document.createElement("a"); link.href = getAudioUrl(trackId); link.download = `${taskId}_${label.toLowerCase().replace(/\s+/g, "_")}.wav`; document.body.appendChild(link); link.click(); document.body.removeChild(link); }; const formatTime = (seconds: number) => { const mins = Math.floor(seconds / 60); const secs = Math.floor(seconds % 60); return `${mins}:${secs.toString().padStart(2, "0")}`; }; const allReady = Object.values(isReady).every(r => r); return (
{/* Header */}

✨ Separation Complete

"{description}"

{onUploadNew && ( )}
{/* Stats Bar */} {(audioDuration || processingTime || modelSize) && (
{audioDuration !== undefined && (
Audio: {Math.floor(audioDuration / 60)}:{(audioDuration % 60).toFixed(0).padStart(2, "0")}
)} {processingTime !== undefined && (
Processing: {processingTime.toFixed(1)}s
)} {modelSize && (
Model: {modelSize}
)} {audioDuration && processingTime && (
Speed: {(audioDuration / processingTime).toFixed(1)}x
)}
)} {/* Transport Controls */}
{formatTime(currentTime)}
0 ? (currentTime / duration) * 100 : 0}%`, transition: "width 0.1s" }} />
{formatTime(duration)}
{/* Tracks */}
{TRACKS.map((track) => { const TrackIcon = track.icon; const isMuted = muted[track.id]; const trackReady = isReady[track.id]; return (
{/* Mute Button */} {/* Track Label */}
{track.label}
{/* Waveform */}
{ containerRefs.current[track.id] = el; }} style={{ flex: 1, borderRadius: "8px", overflow: "hidden", background: "var(--bg-secondary)", minHeight: "48px" }} > {!trackReady && (
Loading...
)}
{/* Download */}
); })}
{/* Download All */}
); } ================================================ FILE: frontend/src/components/VideoStemMixer.tsx ================================================ "use client"; import { useState, useRef, useEffect, useCallback } from "react"; import { Play, Pause, Download, Volume2, VolumeX, RefreshCw, Ghost, Leaf, SkipBack, Film, type LucideIcon } from "lucide-react"; import WaveSurfer from "wavesurfer.js"; interface VideoStemMixerProps { taskId: string; description: string; onNewSeparation: () => void; onUploadNew?: () => void; audioDuration?: number; processingTime?: number; modelSize?: string; } interface Track { id: "ghost" | "clean"; label: string; icon: LucideIcon; color: string; waveColor: string; } const TRACKS: Track[] = [ { id: "ghost", label: "Isolated Sound", icon: Ghost, color: "#F472B6", waveColor: "#F472B6" }, { id: "clean", label: "Without Isolated Sound", icon: Leaf, color: "#60A5FA", waveColor: "#60A5FA" }, ]; export default function VideoStemMixer({ taskId, description, onNewSeparation, onUploadNew, audioDuration, processingTime, modelSize }: VideoStemMixerProps) { const [isPlaying, setIsPlaying] = useState(false); const [currentTime, setCurrentTime] = useState(0); const [duration, setDuration] = useState(0); const [muted, setMuted] = useState>({ ghost: false, clean: false, }); const [videoMuted, setVideoMuted] = useState(true); const [showVideoDownload, setShowVideoDownload] = useState(false); const [isReady, setIsReady] = useState>({ video: false, ghost: false, clean: false, }); const videoRef = useRef(null); const wavesurferRefs = useRef>({}); const containerRefs = useRef>({}); const isSeeking = useRef(false); const getAudioUrl = (trackId: string) => { return `http://localhost:8000/api/tasks/${taskId}/download/${trackId}`; }; const getVideoUrl = () => { return `http://localhost:8000/api/tasks/${taskId}/download/video`; }; // Initialize video and wavesurfers useEffect(() => { const initWaveSurfers = async () => { for (const track of TRACKS) { const 
container = containerRefs.current[track.id]; if (!container) continue; if (wavesurferRefs.current[track.id]) { wavesurferRefs.current[track.id]?.destroy(); } const ws = WaveSurfer.create({ container, waveColor: `${track.waveColor}40`, progressColor: track.waveColor, cursorColor: "#ffffff", cursorWidth: 1, barWidth: 2, barGap: 2, barRadius: 2, height: 48, normalize: true, interact: true, hideScrollbar: true, }); ws.load(getAudioUrl(track.id)); ws.on("ready", () => { setIsReady(prev => ({ ...prev, [track.id]: true })); if (track.id === "ghost") { setDuration(ws.getDuration()); } ws.setMuted(muted[track.id]); }); // When user clicks on waveform - sync video and other tracks // Use 'interaction' event to detect actual user clicks vs programmatic seeks let isUserInteracting = false; ws.on("interaction", () => { isUserInteracting = true; }); ws.on("seeking", () => { // Only handle user-initiated seeks, not programmatic ones if (!isUserInteracting) return; isUserInteracting = false; if (isSeeking.current) return; isSeeking.current = true; const progress = ws.getCurrentTime() / ws.getDuration(); const newTime = ws.getCurrentTime(); // Sync video if (videoRef.current) { videoRef.current.currentTime = newTime; } // Sync other audio tracks Object.entries(wavesurferRefs.current).forEach(([id, w]) => { if (w && id !== track.id) { w.seekTo(progress); } }); setCurrentTime(newTime); // Reset seeking flag after short delay setTimeout(() => { isSeeking.current = false; }, 150); }); wavesurferRefs.current[track.id] = ws; } }; initWaveSurfers(); return () => { const refs = { ...wavesurferRefs.current }; setTimeout(() => { Object.values(refs).forEach(ws => { if (ws) { try { ws.destroy(); } catch { /* ignore */ } } }); }, 0); }; }, [taskId]); // Handle video events useEffect(() => { const video = videoRef.current; if (!video) return; const handleTimeUpdate = () => { if (!isSeeking.current) { setCurrentTime(video.currentTime); } }; const handleLoadedMetadata = () => { setIsReady(prev => 
({ ...prev, video: true })); setDuration(video.duration); }; const handleEnded = () => { setIsPlaying(false); setCurrentTime(0); // Pause all audio tracks when video ends Object.values(wavesurferRefs.current).forEach(w => { if (w) { w.pause(); w.seekTo(0); } }); }; // When video starts seeking (user dragging video scrubber) const handleSeeking = () => { isSeeking.current = true; }; // When video finishes seeking - sync all audio tracks to video position const handleSeeked = () => { const progress = video.currentTime / video.duration; // Sync all audio tracks to the new video position Object.values(wavesurferRefs.current).forEach(ws => { if (ws) { ws.seekTo(progress); } }); setCurrentTime(video.currentTime); // Small delay before allowing new syncs setTimeout(() => { isSeeking.current = false; }, 100); }; video.addEventListener("timeupdate", handleTimeUpdate); video.addEventListener("loadedmetadata", handleLoadedMetadata); video.addEventListener("ended", handleEnded); video.addEventListener("seeking", handleSeeking); video.addEventListener("seeked", handleSeeked); return () => { video.removeEventListener("timeupdate", handleTimeUpdate); video.removeEventListener("loadedmetadata", handleLoadedMetadata); video.removeEventListener("ended", handleEnded); video.removeEventListener("seeking", handleSeeking); video.removeEventListener("seeked", handleSeeked); }; }, []); // Continuous sync effect - keeps audio tracks aligned with video during playback useEffect(() => { let animationFrameId: number; const syncInterval = 150; // Check less frequently let lastSync = 0; const syncTracks = (timestamp: number) => { // Skip sync during seeking to prevent interference if (isPlaying && !isSeeking.current && timestamp - lastSync > syncInterval) { lastSync = timestamp; const video = videoRef.current; if (video && !video.seeking) { const masterTime = video.currentTime; const masterDuration = video.duration; const progress = masterTime / masterDuration; // Sync audio tracks to video 
only if drift is significant Object.values(wavesurferRefs.current).forEach(ws => { if (ws) { const trackTime = ws.getCurrentTime(); // Only sync if drift is more than 0.15 seconds if (Math.abs(trackTime - masterTime) > 0.15) { ws.seekTo(progress); } } }); setCurrentTime(masterTime); } } animationFrameId = requestAnimationFrame(syncTracks); }; if (isPlaying) { animationFrameId = requestAnimationFrame(syncTracks); } return () => { if (animationFrameId) { cancelAnimationFrame(animationFrameId); } }; }, [isPlaying]); const togglePlayAll = useCallback(() => { const allReady = Object.values(isReady).every(r => r); if (!allReady) return; const video = videoRef.current; if (!video) return; if (isPlaying) { video.pause(); Object.values(wavesurferRefs.current).forEach(ws => ws?.pause()); setIsPlaying(false); } else { // Sync all tracks to video position first const progress = video.currentTime / video.duration; Object.values(wavesurferRefs.current).forEach(ws => { if (ws) ws.seekTo(progress); }); video.play(); Object.values(wavesurferRefs.current).forEach(ws => ws?.play()); setIsPlaying(true); } }, [isPlaying, isReady]); const resetToStart = useCallback(() => { const video = videoRef.current; if (video) { video.pause(); video.currentTime = 0; } Object.values(wavesurferRefs.current).forEach(ws => { if (ws) { ws.pause(); ws.seekTo(0); } }); setIsPlaying(false); setCurrentTime(0); }, []); const toggleMute = useCallback((trackId: string) => { const ws = wavesurferRefs.current[trackId]; if (ws) { const newMuted = !muted[trackId]; ws.setMuted(newMuted); setMuted(prev => ({ ...prev, [trackId]: newMuted })); } }, [muted]); const handleSeek = useCallback((e: React.MouseEvent) => { const rect = e.currentTarget.getBoundingClientRect(); const progress = Math.max(0, Math.min(1, (e.clientX - rect.left) / rect.width)); const newTime = progress * duration; isSeeking.current = true; // Seek video - video's 'seeked' event will reset isSeeking and sync audio if (videoRef.current) { 
videoRef.current.currentTime = newTime; } // Also seek audio tracks immediately for visual feedback Object.values(wavesurferRefs.current).forEach(ws => { if (ws) ws.seekTo(progress); }); setCurrentTime(newTime); // Note: isSeeking is reset by video's 'seeked' event }, [duration]); const downloadTrack = (trackId: string, label: string) => { const link = document.createElement("a"); link.href = getAudioUrl(trackId); link.download = `${taskId}_${label.toLowerCase().replace(/\s+/g, "_")}.wav`; document.body.appendChild(link); link.click(); document.body.removeChild(link); }; const downloadVideoWithAudio = (audioType: "original" | "ghost" | "clean") => { const link = document.createElement("a"); link.href = `http://localhost:8000/api/tasks/${taskId}/download-video-with-audio/${audioType}`; const labels = { original: "original", ghost: "isolated", clean: "without_isolated" }; link.download = `${taskId}_${labels[audioType]}_video.mp4`; document.body.appendChild(link); link.click(); document.body.removeChild(link); setShowVideoDownload(false); }; const formatTime = (seconds: number) => { const mins = Math.floor(seconds / 60); const secs = Math.floor(seconds % 60); return `${mins}:${secs.toString().padStart(2, "0")}`; }; const allReady = Object.values(isReady).every(r => r); return (
{/* Header */}

Video Separation Complete

"{description}"

{onUploadNew && ( )}
{/* Video Player */}

{videoMuted ? "Video is muted. Audio plays from separated stems below." : "Playing original video audio. Stem audio may overlap."}

{/* Stats Bar */} {(audioDuration || processingTime || modelSize) && (
{audioDuration !== undefined && (
Duration: {Math.floor(audioDuration / 60)}:{(audioDuration % 60).toFixed(0).padStart(2, "0")}
)} {processingTime !== undefined && (
Processing: {processingTime.toFixed(1)}s
)} {modelSize && (
Model: {modelSize}
)}
)} {/* Transport Controls */}
{formatTime(currentTime)}
0 ? (currentTime / duration) * 100 : 0}%`, transition: "width 0.1s" }} />
{formatTime(duration)}
{/* Audio Tracks */}
{TRACKS.map((track) => { const TrackIcon = track.icon; const isMuted = muted[track.id]; const trackReady = isReady[track.id]; return (
{/* Mute Button */} {/* Track Label */}
{track.label}
{/* Waveform */}
{ containerRefs.current[track.id] = el; }} style={{ flex: 1, borderRadius: "8px", overflow: "hidden", background: "var(--bg-secondary)", minHeight: "48px" }} > {!trackReady && (
Loading...
)}
{/* Download */}
); })}
{/* Download All */}
{/* Download Video with Audio */}
{/* Inline Options */} {showVideoDownload && (
)}
); } ================================================ FILE: frontend/src/components/WaveformEditor.tsx ================================================ "use client"; import { useEffect, useRef, useState } from "react"; import WaveSurfer from "wavesurfer.js"; import RegionsPlugin from "wavesurfer.js/dist/plugins/regions.js"; import { Play, Pause, RotateCcw, Scissors, X } from "lucide-react"; interface WaveformEditorProps { audioUrl: string; onRegionSelect: (region: { start: number; end: number } | null) => void; selectedRegion: { start: number; end: number } | null; } export default function WaveformEditor({ audioUrl, onRegionSelect, selectedRegion }: WaveformEditorProps) { const containerRef = useRef(null); const wavesurferRef = useRef(null); const regionsRef = useRef(null); const [isPlaying, setIsPlaying] = useState(false); const [currentTime, setCurrentTime] = useState(0); const [duration, setDuration] = useState(0); const [isLoaded, setIsLoaded] = useState(false); useEffect(() => { if (!containerRef.current) return; let isMounted = true; // Create regions plugin const regions = RegionsPlugin.create(); regionsRef.current = regions; // Create wavesurfer instance const wavesurfer = WaveSurfer.create({ container: containerRef.current, waveColor: "rgba(139, 92, 246, 0.5)", progressColor: "#8B5CF6", cursorColor: "#F472B6", cursorWidth: 2, barWidth: 3, barGap: 2, barRadius: 3, height: 128, normalize: true, plugins: [regions], }); wavesurferRef.current = wavesurfer; // Event listeners wavesurfer.on("ready", () => { if (isMounted) { setDuration(wavesurfer.getDuration()); setIsLoaded(true); } }); wavesurfer.on("audioprocess", () => { if (isMounted) { setCurrentTime(wavesurfer.getCurrentTime()); } }); wavesurfer.on("seeking", () => { if (isMounted) { setCurrentTime(wavesurfer.getCurrentTime()); } }); wavesurfer.on("play", () => isMounted && setIsPlaying(true)); wavesurfer.on("pause", () => isMounted && setIsPlaying(false)); // Region events regions.on("region-created", 
(region) => { // Only allow one region at a time regions.getRegions().forEach((r) => { if (r.id !== region.id) { r.remove(); } }); if (isMounted) { onRegionSelect({ start: region.start, end: region.end, }); } }); regions.on("region-updated", (region) => { if (isMounted) { onRegionSelect({ start: region.start, end: region.end, }); } }); // Catch loading errors silently (including AbortError when component unmounts) wavesurfer.on("error", (error) => { // Silently ignore AbortError - this happens when unmounting during load if (error?.name === "AbortError" || String(error).includes("abort")) { return; } console.warn("WaveSurfer error:", error); }); // Load audio wavesurfer.load(audioUrl).catch((error) => { // Silently ignore AbortError if (error?.name === "AbortError" || String(error).includes("abort")) { return; } console.warn("Failed to load audio:", error); }); return () => { isMounted = false; // Use setTimeout to ensure any pending operations complete // before attempting destruction const ws = wavesurfer; setTimeout(() => { try { ws.destroy(); } catch { // Ignore AbortError and other destruction errors // This happens when component unmounts during audio loading } }, 0); }; }, [audioUrl]); const togglePlayPause = () => { wavesurferRef.current?.playPause(); }; const restart = () => { wavesurferRef.current?.seekTo(0); wavesurferRef.current?.play(); }; const createRegion = () => { if (!regionsRef.current || !wavesurferRef.current) return; const currentPos = wavesurferRef.current.getCurrentTime(); const dur = wavesurferRef.current.getDuration(); // Create a region from current position to +5 seconds const start = currentPos; const end = Math.min(currentPos + 5, dur); regionsRef.current.addRegion({ start, end, color: "rgba(244, 114, 182, 0.3)", drag: true, resize: true, }); }; const clearRegion = () => { regionsRef.current?.getRegions().forEach((r) => r.remove()); onRegionSelect(null); }; const formatTime = (seconds: number) => { const mins = Math.floor(seconds / 
60); const secs = Math.floor(seconds % 60); return `${mins}:${secs.toString().padStart(2, "0")}`; }; return (
{/* Header */}

Waveform Editor

{selectedRegion && ( Selected: {formatTime(selectedRegion.start)} - {formatTime(selectedRegion.end)} )}
{/* Waveform */}
{/* Loading skeleton */} {!isLoaded && (
)} {/* Controls */}
{/* Playback Controls */}
{formatTime(currentTime)} / {formatTime(duration)}
{/* Region Controls */}
{selectedRegion && ( )}
{/* Instructions */}

💡 Tip: Select a region to apply Temporal Lock - the AI will focus on that specific time range.

); } ================================================ FILE: frontend/tsconfig.json ================================================ { "compilerOptions": { "target": "ES2017", "lib": ["dom", "dom.iterable", "esnext"], "allowJs": true, "skipLibCheck": true, "strict": true, "noEmit": true, "esModuleInterop": true, "module": "esnext", "moduleResolution": "bundler", "resolveJsonModule": true, "isolatedModules": true, "jsx": "react-jsx", "incremental": true, "plugins": [ { "name": "next" } ], "paths": { "@/*": ["./src/*"] } }, "include": [ "next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts", ".next/dev/types/**/*.ts", "**/*.mts" ], "exclude": ["node_modules"] } ================================================ FILE: install.bat ================================================ @echo off chcp 65001 >nul title AudioGhost AI - One-Click Installer echo. echo ╔══════════════════════════════════════════════════════════════╗ echo ║ AudioGhost AI - One-Click Installer ║ echo ║ v1.0 MVP ║ echo ╚══════════════════════════════════════════════════════════════╝ echo. :: Get the directory where this script is located set "SCRIPT_DIR=%~dp0" cd /d "%SCRIPT_DIR%" :: Check if Conda is installed where conda >nul 2>&1 if %errorlevel% neq 0 ( echo [ERROR] Conda not found. Please install Anaconda or Miniconda first. echo Download from: https://www.anaconda.com/download pause exit /b 1 ) echo [1/8] Downloading Redis for Windows... if not exist "redis\redis-server.exe" ( echo Downloading from GitHub... powershell -Command "Invoke-WebRequest -Uri 'https://github.com/tporadowski/redis/releases/download/v5.0.14.1/Redis-x64-5.0.14.1.zip' -OutFile 'redis.zip'" echo Extracting... powershell -Command "Expand-Archive -Path 'redis.zip' -DestinationPath 'redis' -Force" del redis.zip echo Redis installed to ./redis/ ) else ( echo Redis already exists, skipping... ) echo. echo [2/8] Creating Conda environment 'audioghost' (Python 3.11)... 
call conda create -n audioghost python=3.11 -y if %errorlevel% neq 0 ( echo [WARN] Environment may already exist, continuing... ) echo. echo [3/8] Activating environment... call conda activate audioghost echo. echo [4/8] Installing PyTorch (CUDA 12.6)... echo This may take several minutes... pip install torch==2.9.0+cu126 torchvision==0.24.0+cu126 torchaudio==2.9.0+cu126 --index-url https://download.pytorch.org/whl/cu126 --extra-index-url https://pypi.org/simple echo. echo [5/8] Installing FFmpeg... call conda install -c conda-forge ffmpeg -y echo. echo [6/8] Installing SAM Audio... pip install git+https://github.com/facebookresearch/sam-audio.git echo. echo [7/8] Installing Backend dependencies... cd backend pip install -r requirements.txt cd .. echo. echo [8/8] Installing Frontend dependencies... cd frontend call npm install cd .. echo. echo ╔══════════════════════════════════════════════════════════════╗ echo ║ Installation Complete! ✓ ║ echo ╠══════════════════════════════════════════════════════════════╣ echo ║ ║ echo ║ To start AudioGhost, run: start.bat ║ echo ║ ║ echo ║ No Docker required! Redis is included. ║ echo ║ ║ echo ╚══════════════════════════════════════════════════════════════╝ echo. 
def show_gpu_memory(label: str = ""):
    """Print allocated, reserved and peak CUDA memory in GB.

    Nothing is printed when CUDA is not available.

    Args:
        label: Optional tag added to the log line to identify the call site.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    max_allocated = torch.cuda.max_memory_allocated() / bytes_per_gb
    tag = " - " + label if label else ""
    print(f"[GPU Memory{tag}] "
          f"Allocated: {allocated:.2f}GB | Reserved: {reserved:.2f}GB | Peak: {max_allocated:.2f}GB")


def create_lite_model(model_name: str = "facebook/sam-audio-base", token: str = None):
    """
    Create a memory-optimized SAM Audio model by removing unused components.

    The following parts are dropped after loading (savings are the original
    author's rough estimates, total roughly ~11GB -> ~4GB):
    - vision_encoder, replaced by a stub that returns zeros (~2GB)
    - visual_ranker (~2GB)
    - text_ranker (~2GB)
    - span_predictor and span_predictor_transform (~1-2GB)

    Components the loaded model does not have are skipped.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        token: Optional access token; forwarded to ``SAMAudio.from_pretrained``
            only when it is truthy.

    Returns:
        model: Optimized SAM Audio model
        processor: SAM Audio processor
    """
    import types

    from sam_audio import SAMAudio, SAMAudioProcessor

    print(f"Loading {model_name}...")

    # Load model
    if token:
        model = SAMAudio.from_pretrained(model_name, token=token)
    else:
        model = SAMAudio.from_pretrained(model_name)

    processor = SAMAudioProcessor.from_pretrained(model_name)

    print("Optimizing model for low VRAM...")

    # Read the encoder width before dropping the encoder; 1024 is the fallback
    # when the attribute is missing. Looking the encoder up with getattr keeps
    # this from raising AttributeError on a model that has no vision_encoder,
    # the same way the other components below are guarded.
    vision_encoder = getattr(model, "vision_encoder", None)
    vision_dim = getattr(vision_encoder, "dim", 1024)
    if vision_encoder is not None:
        # Drop the local reference as well, otherwise the encoder stays alive.
        del vision_encoder
        del model.vision_encoder
        gc.collect()
        print(" - Removed vision_encoder")

    # Store the dim for _get_video_features
    model._vision_encoder_dim = vision_dim

    # Replace _get_video_features to not use vision_encoder
    def _get_video_features_lite(self, video, audio_features):
        B, T, _ = audio_features.shape
        # Always return zeros since we're not using video
        return audio_features.new_zeros(B, self._vision_encoder_dim, T)

    # Bind the new method
    model._get_video_features = types.MethodType(_get_video_features_lite, model)

    # Delete rankers and span predictor. Each attribute is re-created as None
    # after deletion so the name still exists on the model
    # (presumably the model tests these for None - verify against sam_audio).
    for component in (
        "visual_ranker",
        "text_ranker",
        "span_predictor",
        "span_predictor_transform",
    ):
        if getattr(model, component, None) is not None:
            delattr(model, component)
            setattr(model, component, None)
            gc.collect()
            print(f" - Removed {component}")

    # Force garbage collection
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print("Model optimization complete!")

    return model, processor
loading model") # Test audio test_audio = "test_chunk.mp3" description = "singing voice" if not Path(test_audio).exists(): print(f"\nNo test file at {test_audio}, creating test tone...") sample_rate = 16000 duration = 10 t = torch.linspace(0, duration, int(sample_rate * duration)) audio = 0.5 * torch.sin(2 * 3.14159 * 200 * t) audio = audio.unsqueeze(0) test_audio = "test_audio.wav" torchaudio.save(test_audio, audio, sample_rate) print(f"\nProcessing: {test_audio}") print(f"Description: '{description}'") # Start timing start_time = time.time() # Load audio sample_rate = processor.audio_sampling_rate audio, orig_sr = torchaudio.load(test_audio) if orig_sr != sample_rate: resampler = torchaudio.transforms.Resample(orig_sr, sample_rate) audio = resampler(audio) # Convert to mono if stereo if audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True) # Calculate audio duration audio_duration = audio.shape[1] / sample_rate print(f"\nAudio duration: {audio_duration:.2f}s") # =============================== # CHUNKING LOGIC (same as worker) # =============================== CHUNK_DURATION = 25.0 # 25 seconds per chunk MAX_CHUNK_SAMPLES = int(sample_rate * CHUNK_DURATION) if audio.shape[1] > MAX_CHUNK_SAMPLES: print(f"Audio is {audio_duration:.1f}s, using chunking ({CHUNK_DURATION}s chunks)") # Split audio into chunks audio_tensor = audio.squeeze(0).to(device, dtype) chunks = torch.split(audio_tensor, MAX_CHUNK_SAMPLES, dim=-1) total_chunks = len(chunks) out_target = [] out_residual = [] show_gpu_memory("Before chunked separation") for i, chunk in enumerate(chunks): print(f"\nProcessing chunk {i+1}/{total_chunks}...") # Skip very short chunks if chunk.shape[-1] < sample_rate: # Less than 1 second print(f" Skipping chunk {i+1} (too short: {chunk.shape[-1]/sample_rate:.2f}s)") continue # Prepare batch for this chunk batch = processor( audios=[chunk.unsqueeze(0)], descriptions=[description] ).to(device) # Run separation with torch.inference_mode(): with 
torch.cuda.amp.autocast(enabled=(device == "cuda")): result = model.separate( batch, predict_spans=False, reranking_candidates=1 ) out_target.append(result.target[0].cpu()) out_residual.append(result.residual[0].cpu()) show_gpu_memory(f"After chunk {i+1}") # Clean up chunk results del batch, result if torch.cuda.is_available(): torch.cuda.empty_cache() # Concatenate all chunks target_audio = torch.cat(out_target, dim=-1).clamp(-1, 1).float().unsqueeze(0) residual_audio = torch.cat(out_residual, dim=-1).clamp(-1, 1).float().unsqueeze(0) del out_target, out_residual, chunks, audio_tensor else: print(f"Audio is {audio_duration:.1f}s, processing as single batch (no chunking needed)") # Prepare inputs print("\nPreparing batch...") batch = processor( audios=[test_audio], descriptions=[description], ).to(device) # Run separation print("Running separation...") show_gpu_memory("Before separation") with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype): result = model.separate( batch, predict_spans=False, reranking_candidates=1 ) show_gpu_memory("After separation") target_audio = result.target[0].float().unsqueeze(0).cpu() residual_audio = result.residual[0].float().unsqueeze(0).cpu() del batch, result # Calculate processing time processing_time = time.time() - start_time # Save results print("\n" + "="*50) print("Saving results...") torchaudio.save("output_target.wav", target_audio, sample_rate) torchaudio.save("output_residual.wav", residual_audio, sample_rate) # Cleanup del target_audio, residual_audio if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() show_gpu_memory("After cleanup") # Summary print("\n" + "="*60) print("SUMMARY") print("="*60) print(f"Audio duration: {audio_duration:.2f}s ({audio_duration/60:.2f} min)") print(f"Processing time: {processing_time:.2f}s") print(f"Speed: {audio_duration/processing_time:.2f}x realtime") print(f"Chunking: {'Yes (' + str(len(chunks) if 'chunks' in dir() else '?') + ' chunks)' if 
audio_duration > CHUNK_DURATION else 'No (single batch)'}") if torch.cuda.is_available(): peak_gb = torch.cuda.max_memory_allocated() / 1024**3 reserved_gb = torch.cuda.max_memory_reserved() / 1024**3 print(f"Peak GPU Memory: {peak_gb:.2f}GB (reserved: {reserved_gb:.2f}GB)") print("="*60) print("Output files:") print(" - output_target.wav: Extracted audio") print(" - output_residual.wav: Residual audio") ================================================ FILE: start.bat ================================================ @echo off chcp 65001 >nul title AudioGhost AI - Launcher echo. echo ╔══════════════════════════════════════════════════════════════╗ echo ║ AudioGhost AI - Launcher ║ echo ║ v1.0 MVP ║ echo ╚══════════════════════════════════════════════════════════════╝ echo. :: Get the directory where this script is located set "SCRIPT_DIR=%~dp0" cd /d "%SCRIPT_DIR%" :: Check if Docker Redis is already running on port 6379 echo [1/4] Checking Redis... netstat -an | findstr ":6379.*LISTENING" >nul 2>&1 if %ERRORLEVEL%==0 ( echo Docker Redis detected - using existing instance goto :redis_ready ) :: Docker Redis not running, try offline Redis if exist "redis\redis-server.exe" ( echo Starting offline Redis... start "AudioGhost Redis" /min cmd /c "cd /d %SCRIPT_DIR%redis && redis-server.exe" timeout /t 2 /nobreak >nul goto :redis_ready ) :: Neither available echo [ERROR] Redis not available. Either: echo - Start Docker: docker-compose up -d echo - Or run install.bat to download offline Redis pause exit /b 1 :redis_ready echo [2/4] Starting Backend API... start "AudioGhost Backend" cmd /k "cd /d %SCRIPT_DIR% && conda activate audioghost && cd backend && uvicorn main:app --reload --port 8000" echo [3/4] Starting Celery Worker... timeout /t 2 /nobreak >nul start "AudioGhost Worker" cmd /k "cd /d %SCRIPT_DIR% && conda activate audioghost && cd backend && celery -A workers.celery_app worker --loglevel=info --pool=solo" echo [4/4] Starting Frontend... 
timeout /t 2 /nobreak >nul
start "AudioGhost Frontend" cmd /k "cd /d %SCRIPT_DIR% && cd frontend && npm run dev"

echo.
echo ╔══════════════════════════════════════════════════════════════╗
echo ║ All Services Started! ✓ ║
echo ╠══════════════════════════════════════════════════════════════╣
echo ║ ║
echo ║ Frontend: http://localhost:3000 ║
echo ║ Backend: http://localhost:8000 ║
echo ║ API Docs: http://localhost:8000/docs ║
echo ║ ║
echo ║ Four windows opened: ║
echo ║ - AudioGhost Redis (minimized) ║
echo ║ - AudioGhost Backend (FastAPI) ║
echo ║ - AudioGhost Worker (Celery) ║
echo ║ - AudioGhost Frontend (Next.js) ║
echo ║ ║
echo ║ Run stop.bat or close all windows to stop services. ║
echo ╚══════════════════════════════════════════════════════════════╝
echo.
echo Opening browser in 3 seconds...
timeout /t 3 /nobreak >nul

:: Open browser automatically
start http://localhost:3000

================================================
FILE: stop.bat
================================================
@echo off
chcp 65001 >nul
title AudioGhost AI - Stop All Services

:: Run from the script's own directory so docker-compose finds
:: docker-compose.yml no matter where stop.bat was invoked from.
cd /d "%~dp0"

echo.
echo ╔══════════════════════════════════════════════════════════════╗
echo ║ AudioGhost AI - Shutdown ║
echo ╚══════════════════════════════════════════════════════════════╝
echo.

echo Stopping all AudioGhost services...
echo.

:: Each service runs in a titled console window. /T terminates the window's
:: whole process tree, so child processes (npm/node, uvicorn reloader, celery)
:: go down with it and unrelated node.exe processes on the machine are left alone.

:: Kill Node.js (Frontend)
echo [1/4] Stopping Frontend...
taskkill /FI "WINDOWTITLE eq AudioGhost Frontend*" /T /F >nul 2>&1

:: Kill Celery (Worker)
echo [2/4] Stopping Celery Worker...
taskkill /FI "WINDOWTITLE eq AudioGhost Worker*" /T /F >nul 2>&1

:: Kill Uvicorn (Backend)
echo [3/4] Stopping Backend API...
taskkill /FI "WINDOWTITLE eq AudioGhost Backend*" /T /F >nul 2>&1

:: Stop Redis: the offline redis-server window (if start.bat opened one)
:: and the Docker container (if docker-compose started one).
echo [4/4] Stopping Redis...
taskkill /FI "WINDOWTITLE eq AudioGhost Redis*" /T /F >nul 2>&1
docker-compose down >nul 2>&1

echo.
echo ╔══════════════════════════════════════════════════════════════╗
echo ║ All Services Stopped! ✓ ║
echo ╚══════════════════════════════════════════════════════════════╝
echo.
pause ================================================ FILE: test_video_only.py ================================================ """ Test Video Only - Simple video-based audio separation using SAM Audio Uses small model with bfloat16 for lower memory usage """ import torch import torchaudio import gc def main(): # Setup device = torch.device("cuda" if torch.cuda.is_available() else "cpu") dtype = torch.bfloat16 if device.type == "cuda" else torch.float32 print(f"Device: {device}, dtype: {dtype}") # Clear GPU memory if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() print(f"GPU Memory before loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") # Load model (small for lower memory) from sam_audio import SAMAudio, SAMAudioProcessor model_name = "facebook/sam-audio-base" print(f"Loading {model_name}...") model = SAMAudio.from_pretrained(model_name).to(device, dtype).eval() processor = SAMAudioProcessor.from_pretrained(model_name) if torch.cuda.is_available(): print(f"GPU Memory after loading: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") # Video file video_file = "office.mp4" description = "walking sound" print(f"\nProcessing video: {video_file}") print(f"Description: '{description}'") # Process inputs = processor(audios=[video_file], descriptions=[description]).to(device) print("Running separation...") if torch.cuda.is_available(): print(f"GPU Memory before separation: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") with torch.inference_mode(), torch.autocast(device_type=device.type, dtype=dtype): result = model.separate(inputs) if torch.cuda.is_available(): print(f"GPU Memory after separation: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") # Save results sample_rate = processor.audio_sampling_rate target_audio = result.target[0].float().unsqueeze(0).cpu() residual_audio = result.residual[0].float().unsqueeze(0).cpu() torchaudio.save("video_target.wav", target_audio, sample_rate) torchaudio.save("video_residual.wav", residual_audio, 
sample_rate) print("\nDone!") print("- video_target.wav: Extracted audio (target)") print("- video_residual.wav: Remaining audio (residual)") # Cleanup del model, processor, inputs, result if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() print(f"GPU Memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.2f} GB") if __name__ == "__main__": main()