Repository: Huanshere/VideoLingo Branch: main Commit: 29e240dcd2a1 Files: 132 Total size: 541.1 KB Directory structure: gitextract_wyfw8khg/ ├── .cursorrules ├── .gitignore ├── .streamlit/ │ └── config.toml ├── Dockerfile ├── LICENSE ├── OneKeyStart.bat ├── README.md ├── VideoLingo_colab.ipynb ├── batch/ │ ├── OneKeyBatch.bat │ ├── README.md │ ├── README.zh.md │ └── utils/ │ ├── batch_processor.py │ ├── settings_check.py │ └── video_processor.py ├── config.yaml ├── core/ │ ├── _10_gen_audio.py │ ├── _11_merge_audio.py │ ├── _12_dub_to_vid.py │ ├── _1_ytdlp.py │ ├── _2_asr.py │ ├── _3_1_split_nlp.py │ ├── _3_2_split_meaning.py │ ├── _4_1_summarize.py │ ├── _4_2_translate.py │ ├── _5_split_sub.py │ ├── _6_gen_sub.py │ ├── _7_sub_into_vid.py │ ├── _8_1_audio_task.py │ ├── _8_2_dub_chunks.py │ ├── _9_refer_audio.py │ ├── __init__.py │ ├── asr_backend/ │ │ ├── __init__.py │ │ ├── audio_preprocess.py │ │ ├── demucs_vl.py │ │ ├── elevenlabs_asr.py │ │ ├── whisperX_302.py │ │ └── whisperX_local.py │ ├── prompts.py │ ├── spacy_utils/ │ │ ├── __init__.py │ │ ├── load_nlp_model.py │ │ ├── split_by_comma.py │ │ ├── split_by_connector.py │ │ ├── split_by_mark.py │ │ └── split_long_by_root.py │ ├── st_utils/ │ │ ├── __init__.py │ │ ├── download_video_section.py │ │ ├── imports_and_utils.py │ │ └── sidebar_setting.py │ ├── translate_lines.py │ ├── tts_backend/ │ │ ├── _302_f5tts.py │ │ ├── azure_tts.py │ │ ├── custom_tts.py │ │ ├── edge_tts.py │ │ ├── estimate_duration.py │ │ ├── fish_tts.py │ │ ├── gpt_sovits_tts.py │ │ ├── openai_tts.py │ │ ├── sf_cosyvoice2.py │ │ ├── sf_fishtts.py │ │ └── tts_main.py │ └── utils/ │ ├── __init__.py │ ├── ask_gpt.py │ ├── config_utils.py │ ├── decorator.py │ ├── delete_retry_dubbing.py │ ├── models.py │ ├── onekeycleanup.py │ └── pypi_autochoose.py ├── custom_terms.xlsx ├── docs/ │ ├── .gitignore │ ├── components/ │ │ ├── landing/ │ │ │ ├── comments.tsx │ │ │ ├── faq.tsx │ │ │ ├── features.tsx │ │ │ ├── github-stats.tsx │ │ │ ├── hero.tsx │ │ │ 
└── index.tsx │ │ └── ui/ │ │ ├── accordion.tsx │ │ ├── badge.tsx │ │ ├── button.tsx │ │ ├── card.tsx │ │ ├── hero-video-dialog.tsx │ │ ├── rainbow-button.tsx │ │ └── tooltip.tsx │ ├── components.json │ ├── lib/ │ │ └── utils.ts │ ├── middleware.js │ ├── next-env.d.ts │ ├── next.config.js │ ├── package.json │ ├── pages/ │ │ ├── _app.mdx │ │ ├── _meta.en-US.json │ │ ├── _meta.ja.json │ │ ├── _meta.zh-CN.json │ │ ├── docs/ │ │ │ ├── _meta.en-US.json │ │ │ ├── _meta.ja.json │ │ │ ├── _meta.zh-CN.json │ │ │ ├── docker.en-US.md │ │ │ ├── docker.zh-CN.md │ │ │ ├── introduction.en-US.md │ │ │ ├── introduction.zh-CN.md │ │ │ ├── start.en-US.md │ │ │ ├── start.zh-CN.md │ │ │ ├── tech.en-US.md │ │ │ └── tech.zh-CN.md │ │ ├── globals.css │ │ ├── index.en-US.mdx │ │ ├── index.ja.mdx │ │ └── index.zh-CN.mdx │ ├── postcss.config.js │ ├── public/ │ │ └── site.webmanifest │ ├── tailwind.config.js │ ├── theme.config.jsx │ └── tsconfig.json ├── install.py ├── launch.py ├── requirements.txt ├── setup.py ├── st.py └── translations/ ├── README.es.md ├── README.fr.md ├── README.ja.md ├── README.ru.md ├── README.zh-TW.md ├── README.zh.md ├── en.json ├── es.json ├── fr.json ├── ja.json ├── ru.json ├── translations.py ├── zh-CN.json └── zh-HK.json ================================================ FILE CONTENTS ================================================ ================================================ FILE: .cursorrules ================================================ 2. 使用 # ------------ # comment # ------------ 进行大块的注释 3. 避免使用复杂的函数内注释,以及函数变量中不要有类型定义 4. 
使用英文注释和print ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. 
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # history and output /output/ /history/ _model_cache/ batch/input/ batch/output/ batch/tasks_setting.xlsx # large files /ffmpeg.exe /ffmpeg /ffprobe.exe /ffprobe .DS_Store AllinOne.ipynb config.backup.yaml # runtime runtime/ dev/ installer_files/ logs/ ================================================ FILE: .streamlit/config.toml ================================================ [server] maxUploadSize = 4096 ================================================ FILE: Dockerfile ================================================ ARG CUDA_VERSION=12.4.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 # Set environment variables ENV DEBIAN_FRONTEND=noninteractive ARG PYTHON_VERSION=3.10 # Change software sources and install basic tools and system dependencies RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \ apt-get update && apt-get install -y --no-install-recommends \ software-properties-common git curl sudo ffmpeg fonts-noto wget \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get 
update -y \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version # Clean apt cache RUN apt-get clean && rm -rf /var/lib/apt/lists/* # Workaround for CUDA compatibility issues RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # Set working directory and clone repository WORKDIR /app RUN git clone https://github.com/Huanshere/VideoLingo.git . # Install PyTorch and torchaudio RUN pip install torch==2.0.0 torchaudio==2.0.0 --index-url https://download.pytorch.org/whl/cu118 # Clean up unnecessary files RUN rm -rf .git # Upgrade pip and install basic dependencies RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple # Install dependencies COPY requirements.txt . RUN pip install -e . # Set CUDA-related environment variables ENV CUDA_HOME=/usr/local/cuda ENV PATH=${CUDA_HOME}/bin:${PATH} ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} # Set CUDA architecture list ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} EXPOSE 8501 CMD ["streamlit", "run", "st.py"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: OneKeyStart.bat ================================================ @echo off chcp 65001 >nul 2>&1 call conda activate videolingo 2>nul set PYTHONWARNINGS=ignore python "%~dp0launch.py" if %errorlevel% neq 0 ( echo. echo Pre-flight checks or Streamlit failed. See logs\ for details. echo. ) pause ================================================ FILE: README.md ================================================
VideoLingo Logo # Connect the World, Frame by Frame Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Overview ([Try VL Now!](https://videolingo.io)) VideoLingo is an all-in-one video translation, localization, and dubbing tool aimed at generating Netflix-quality subtitles. It eliminates stiff machine translations and multi-line subtitles while adding high-quality dubbing, enabling global knowledge sharing across language barriers. Key features: - 🎥 YouTube video download via yt-dlp - **🎙️ Word-level and low-hallucination subtitle recognition with WhisperX** - **📝 NLP and AI-powered subtitle segmentation** - **📚 Custom + AI-generated terminology for coherent translation** - **🔄 3-step Translate-Reflect-Adaptation for cinematic quality** - **✅ Netflix-standard, Single-line subtitles Only** - **🗣️ Dubbing with GPT-SoVITS, Azure, OpenAI, and more** - 🚀 One-click startup and processing in Streamlit - 🌍 Multi-language support in Streamlit UI - 📝 Detailed logging with progress resumption Difference from similar projects: **Single-line subtitles only, superior translation quality, seamless dubbing experience** ## 🎥 Demo
### Dual Subtitles --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Cosy2 Voice Clone --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS with my voice --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Language Support **Input Language Support (more to come):** 🇺🇸 English 🤩 | 🇷🇺 Russian 😊 | 🇫🇷 French 🤩 | 🇩🇪 German 🤩 | 🇮🇹 Italian 🤩 | 🇪🇸 Spanish 🤩 | 🇯🇵 Japanese 😐 | 🇨🇳 Chinese* 😊 > *Chinese uses a separate punctuation-enhanced whisper model, for now... **Translation supports all languages, while dubbing language depends on the chosen TTS method.** ## Installation Running into problems? Chat with our free online AI agent [**here**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) for help. > **Note:** For Windows users with NVIDIA GPU, follow these steps before installation: > 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. Install [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to your system PATH > 4. Restart your computer > **Note:** FFmpeg is required. Please install it via a package manager: > - Windows: ```choco install ffmpeg``` (via [Chocolatey](https://chocolatey.org/)) > - macOS: ```brew install ffmpeg``` (via [Homebrew](https://brew.sh/)) > - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu) 1. Clone the repository ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Install dependencies (requires `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. Start the application ```bash streamlit run st.py ``` ### Docker Alternatively, you can use Docker (requires CUDA 12.4 and NVIDIA Driver version >550), see [Docker docs](/docs/pages/docs/docker.en-US.md): ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## APIs VideoLingo supports the OpenAI-like API format and various TTS interfaces: - LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... 
(sorted by performance, be cautious with gemini-2.5-flash...) - WhisperX: Run whisperX (large-v3) locally or use 302.ai API - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(You can modify your own TTS in custom_tts.py!) > **Note:** VideoLingo works with **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - one API key for all services (LLM, WhisperX, TTS). Or run locally with Ollama and Edge-TTS for free, no API needed! For detailed installation, API configuration, and batch mode instructions, please refer to the documentation: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## Current Limitations 1. WhisperX transcription performance may be affected by video background noise, as it uses a wav2vec model for alignment. For videos with loud background music, please enable Voice Separation Enhancement. Additionally, subtitles ending with numbers or special characters may be truncated early due to wav2vec's inability to map numeric characters (e.g., "1") to their spoken form ("one"). 2. Using weaker models can lead to errors during processing due to strict JSON format requirements for responses (tried my best to prompt llm😊). If this error occurs, please delete the `output` folder and retry with a different LLM, otherwise repeated execution will read the previous erroneous response causing the same error. 3. The dubbing feature may not be 100% perfect due to differences in speech rates and intonation between languages, as well as the impact of the translation step. However, this project has implemented extensive engineering processing for speech rates to ensure the best possible dubbing results. 4. **Multilingual video transcription recognition will only retain the main language**. This is because whisperX uses a specialized model for a single language when forcibly aligning word-level subtitles, and will delete unrecognized languages. 5. 
**For now, multiple characters cannot be dubbed separately**, as whisperX's speaker distinction capability is not sufficiently reliable. ## 📄 License This project is licensed under the Apache 2.0 License. Special thanks to the following open source projects for their contributions: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 Contact Me - Submit [Issues](https://github.com/Huanshere/VideoLingo/issues) or [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) on GitHub - DM me on Twitter: [@Huanshere](https://twitter.com/Huanshere) - Email me at: team@videolingo.io ## ⭐ Star History [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

If you find VideoLingo helpful, please give me a ⭐️!

================================================ FILE: VideoLingo_colab.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": { "id": "RkeSbYF2HoM_" }, "source": [ "# Welcome to VideoLingo! 🎉🚀\n", "#### This colab file allows you to quickly experience the full functionality in just 5 minutes! ⏱️✨ Before you begin, you may need to prepare some keys. 🔑🗝️ Please read https://videolingo.io/docs/start to get ready. 📚👍\n", "#### *Please use a T4 GPU to execute this colab for optimal performance." ] }, { "cell_type": "markdown", "metadata": { "id": "h0jE67Gc-1nO" }, "source": [ "## 1. Clone VideoLingo repo 📥" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NC3i2T7D51oS", "outputId": "19821917-8ee4-4123-a099-fd35405fcdfe" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into 'VideoLingo'...\n", "remote: Enumerating objects: 2578, done.\u001b[K\n", "remote: Counting objects: 100% (595/595), done.\u001b[K\n", "remote: Compressing objects: 100% (221/221), done.\u001b[K\n", "remote: Total 2578 (delta 408), reused 378 (delta 374), pack-reused 1983 (from 1)\u001b[K\n", "Receiving objects: 100% (2578/2578), 10.44 MiB | 12.60 MiB/s, done.\n", "Resolving deltas: 100% (1644/1644), done.\n" ] } ], "source": [ "!git clone https://github.com/Huanshere/VideoLingo.git\n", "%cd VideoLingo" ] }, { "cell_type": "markdown", "metadata": { "id": "A5sIHzRs8JI1" }, "source": [ "## 2. 
Installation 🚀\n", "this takes around 4 mins" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tSymHAEg6Vzr", "outputId": "8c5059e4-37d5-4540-cba5-9c7aa46fe226" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (13.8.1)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich) (2.18.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich) (0.1.2)\n", "\u001b[1;35m╭──────────────────────────╮\u001b[0m\n", "\u001b[1;35m│\u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mStarting installation...\u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m│\u001b[0m\n", "\u001b[1;35m╰──────────────────────────╯\u001b[0m\n", "config.py file has been created. 
Please fill in the API key and base URL in the config.py file.\n", "\u001b[36m╭──────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", "\u001b[36m│\u001b[0m\u001b[36m \u001b[0m\u001b[36mInstalling requests...\u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m│\u001b[0m\n", "\u001b[36m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (2.32.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests) (2024.8.30)\n", "\u001b[3m Whisper Model Selection \u001b[0m\n", "┏━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mOption\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mModel \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDescription \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", "│\u001b[36m \u001b[0m\u001b[36m1 \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mwhisperX 💻 \u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32mlocal model (can also use online model api)\u001b[0m\u001b[32m \u001b[0m│\n", "│\u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mwhisperXapi ☁️\u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32monline model through api only \u001b[0m\u001b[32m \u001b[0m│\n", 
"└────────┴───────────────┴─────────────────────────────────────────────┘\n", "If you're unsure about the differences between models, please see \n", "\u001b[4;94mhttps://github.com/Huanshere/VideoLingo/\u001b[0m\n", "\u001b[36m╭──────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", "\u001b[36m│\u001b[0m\u001b[36m \u001b[0m\u001b[36mInstalling PyTorch with CUDA support...\u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m│\u001b[0m\n", "\u001b[36m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", "Looking in indexes: https://download.pytorch.org/whl/cu118\n", "Collecting torch==2.0.0\n", " Downloading https://download.pytorch.org/whl/cu118/torch-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 GB\u001b[0m \u001b[31m626.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting torchaudio==2.0.0\n", " Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (4.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.16.1)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.1.4)\n", "Collecting triton==2.0.0 (from torch==2.0.0)\n", " Downloading 
https://download.pytorch.org/whl/triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.3/63.3 MB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch==2.0.0) (3.30.3)\n", "Collecting lit (from triton==2.0.0->torch==2.0.0)\n", " Downloading https://download.pytorch.org/whl/lit-15.0.7.tar.gz (132 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.3/132.3 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch==2.0.0) (2.1.5)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch==2.0.0) (1.3.0)\n", "Building wheels for collected packages: lit\n", " Building wheel for lit (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for lit: filename=lit-15.0.7-py3-none-any.whl size=89990 sha256=f9f0ca0ce885e2ddbc5583433ea047a853b4034c8e2712e685d93fbed24f9204\n", " Stored in directory: /root/.cache/pip/wheels/27/2c/b6/3ed2983b1b44fe0dea1bb35234b09f2c22fb8ebb308679c922\n", "Successfully built lit\n", "Installing collected packages: lit, triton, torch, torchaudio\n", " Attempting uninstall: torch\n", " Found existing installation: torch 2.4.1+cu121\n", " Uninstalling torch-2.4.1+cu121:\n", " Successfully uninstalled torch-2.4.1+cu121\n", " Attempting uninstall: torchaudio\n", " Found existing installation: torchaudio 2.4.1+cu121\n", " Uninstalling torchaudio-2.4.1+cu121:\n", " Successfully uninstalled torchaudio-2.4.1+cu121\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "torchvision 0.19.1+cu121 requires torch==2.4.1, but you have torch 2.0.0+cu118 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed lit-15.0.7 torch-2.0.0+cu118 torchaudio-2.0.0+cu118 triton-2.0.0\n", "Installing whisperX...\n", "Obtaining file:///content/VideoLingo/third_party/whisperX\n", " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: torch>=2 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.0.0+cu118)\n", "Requirement already satisfied: torchaudio>=2 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.0.0+cu118)\n", "Collecting faster-whisper==1.0.0 (from whisperx==3.1.1)\n", " Downloading faster_whisper-1.0.0-py3-none-any.whl.metadata (14 kB)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (4.44.2)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.2.2)\n", "Requirement already satisfied: setuptools>=65 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (71.0.4)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (3.8.1)\n", "Collecting pyannote.audio==3.1.1 (from whisperx==3.1.1)\n", " Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)\n", "Collecting av==11.* (from faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)\n", "Collecting ctranslate2<5,>=4.0 (from faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading ctranslate2-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n", "Requirement already satisfied: huggingface-hub>=0.13 in /usr/local/lib/python3.10/dist-packages (from faster-whisper==1.0.0->whisperx==3.1.1) (0.24.7)\n", "Collecting tokenizers<0.16,>=0.13 (from faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", "Collecting onnxruntime<2,>=1.14 (from faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)\n", "Collecting 
asteroid-filterbanks>=0.4 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)\n", "Requirement already satisfied: einops>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (0.8.0)\n", "Collecting lightning>=2.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)\n", "Collecting omegaconf<3.0,>=2.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)\n", "Collecting pyannote.core>=5.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)\n", "Collecting pyannote.database>=5.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pyannote.database-5.1.0-py3-none-any.whl.metadata (1.2 kB)\n", "Collecting pyannote.metrics>=3.2 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)\n", "Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)\n", "Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl.metadata (17 kB)\n", "Requirement already satisfied: rich>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (13.8.1)\n", "Collecting semver>=3.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading semver-3.0.2-py3-none-any.whl.metadata (5.0 kB)\n", "Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.1)\n", "Collecting speechbrain>=0.5.14 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading 
speechbrain-1.0.1-py3-none-any.whl.metadata (24 kB)\n", "Collecting tensorboardX>=2.6 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)\n", "Collecting torch-audiomentations>=0.11.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading torch_audiomentations-0.11.1-py3-none-any.whl.metadata (14 kB)\n", "Collecting torchmetrics>=0.11.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.16.1)\n", "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (4.12.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (1.13.3)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.1.4)\n", "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (2.0.0)\n", "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=2->whisperx==3.1.1) (3.30.3)\n", "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=2->whisperx==3.1.1) (15.0.7)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (8.1.7)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (1.4.2)\n", "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (2024.9.11)\n", 
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (4.66.5)\n", "Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (1.26.4)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2024.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (24.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (6.0.2)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (2.32.3)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (0.4.5)\n", "INFO: pip is looking at multiple versions of transformers to determine which version is compatible with other requirements. 
This could take a while.\n", "Collecting transformers (from whisperx==3.1.1)\n", " Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.44.1-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.43.2-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m552.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hINFO: pip is still looking at multiple versions of transformers to determine which version is compatible with other requirements. 
This could take a while.\n", " Downloading transformers-4.43.1-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.43.0-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.42.2-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hINFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. 
If you want to abort this run, press Ctrl + C.\n", " Downloading transformers-4.42.1-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.42.0-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.6/137.6 kB\u001b[0m \u001b[31m11.9 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.13->faster-whisper==1.0.0->whisperx==3.1.1) (2024.6.1)\n", "Collecting lightning-utilities<2.0,>=0.10.0 (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)\n", "INFO: pip is looking at multiple versions of lightning to determine which version is compatible with other requirements. This could take a while.\n", "Collecting lightning>=2.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading lightning-2.3.3-py3-none-any.whl.metadata (35 kB)\n", "Collecting pytorch-lightning (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)\n", "Collecting antlr4-python3-runtime==4.9.* (from omegaconf<3.0,>=2.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n", "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1) (24.3.25)\n", "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1) (3.20.3)\n", "Requirement already satisfied: sortedcontainers>=2.0.4 in /usr/local/lib/python3.10/dist-packages (from pyannote.core>=5.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (2.4.0)\n", "Requirement already satisfied: scipy>=1.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.core>=5.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.13.1)\n", "Requirement already satisfied: typer>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.database>=5.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.5)\n", "Requirement already satisfied: scikit-learn>=0.17.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.5.2)\n", "Collecting docopt>=0.6.2 (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading docopt-0.6.2.tar.gz (25 kB)\n", " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (0.9.0)\n", "Requirement already satisfied: matplotlib>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.7.1)\n", "Collecting optuna>=3.1 (from pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->whisperx==3.1.1) (1.16.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (2.18.0)\n", "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.17.1)\n", "Collecting hyperpyyaml (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1) (0.2.0)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=2->whisperx==3.1.1) (1.3.0)\n", "Collecting julius<0.3,>=0.2.3 (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading julius-0.2.7.tar.gz (59 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.6/59.6 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: librosa>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.10.2.post1)\n", "Collecting torch-pitch-shift>=1.2.2 (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading torch_pitch_shift-1.2.5-py3-none-any.whl.metadata (2.5 kB)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=2->whisperx==3.1.1) (2.1.5)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (2024.8.30)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.22)\n", "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (3.10.6)\n", "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (3.0.1)\n", "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from 
librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (4.4.2)\n", "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.60.0)\n", "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.8.2)\n", "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.5.0.post1)\n", "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.4)\n", "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.0.8)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.1.2)\n", "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.3.0)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.1)\n", "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (4.54.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from 
matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.4.7)\n", "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (10.4.0)\n", "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.1.4)\n", "Collecting alembic>=1.5.0 (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)\n", "Collecting colorlog (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.0.35)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.17.1->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.5.0)\n", "Collecting primePy>=1.3 (from torch-pitch-shift>=1.2.2->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading primePy-1.3-py3-none-any.whl.metadata (4.8 kB)\n", "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer>=0.12.1->pyannote.database>=5.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.5.4)\n", "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1)\n", " Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n", "Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)\n", 
"INFO: pip is looking at multiple versions of pytorch-lightning to determine which version is compatible with other requirements. This could take a while.\n", "Collecting pytorch-lightning (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading pytorch_lightning-2.3.3-py3-none-any.whl.metadata (21 kB)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.4.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.3.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (24.2.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (6.1.0)\n", "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.12.1)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (4.0.3)\n", "Collecting Mako (from 
alembic>=1.5.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)\n", "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.43.0)\n", "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (4.3.6)\n", "Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n", " Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)\n", "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (3.1.1)\n", "Downloading faster_whisper-1.0.0-py3-none-any.whl (1.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyannote.audio-3.1.1-py2.py3-none-any.whl (208 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m208.7/208.7 kB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m32.9/32.9 MB\u001b[0m \u001b[31m54.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading transformers-4.39.3-py3-none-any.whl (8.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m68.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)\n", "Downloading ctranslate2-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.2/37.2 MB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading lightning-2.3.3-py3-none-any.whl (808 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m808.5/808.5 kB\u001b[0m \u001b[31m39.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.2/13.2 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyannote.core-5.0.0-py3-none-any.whl (58 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyannote.database-5.1.0-py3-none-any.whl (48 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.1/48.1 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyannote.metrics-3.2.1-py3-none-any.whl (51 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.4/51.4 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyannote.pipeline-3.0.1-py3-none-any.whl (31 
kB)\n", "Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl (119 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.3/119.3 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading semver-3.0.2-py3-none-any.whl (17 kB)\n", "Downloading speechbrain-1.0.1-py3-none-any.whl (807 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m807.2/807.2 kB\u001b[0m \u001b[31m47.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.7/101.7 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading torch_audiomentations-0.11.1-py3-none-any.whl (50 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.1/50.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading torchmetrics-1.4.2-py3-none-any.whl (869 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m869.2/869.2 kB\u001b[0m \u001b[31m51.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)\n", "Downloading optuna-4.0.0-py3-none-any.whl (362 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m362.8/362.8 kB\u001b[0m \u001b[31m27.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading torch_pitch_shift-1.2.5-py3-none-any.whl (5.0 kB)\n", "Downloading coloredlogs-15.0.1-py2.py3-none-any.whl 
(46 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)\n", "Downloading pytorch_lightning-2.3.3-py3-none-any.whl (812 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m812.3/812.3 kB\u001b[0m \u001b[31m48.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.2/233.2 kB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading primePy-1.3-py3-none-any.whl (4.0 kB)\n", "Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.8/117.8 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n", "Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.7/526.7 kB\u001b[0m \u001b[31m33.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hBuilding wheels for collected packages: antlr4-python3-runtime, docopt, julius\n", " Building wheel for antlr4-python3-runtime 
(setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=0d45790bcba89ef25b40e28a352826b1e3b8e0a996f3c4c71c77a2e039838c51\n", " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n", " Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=0a394444811d2361dc868cfb7f23c797a47cad9a1ee5485aa86c20551b33a0fe\n", " Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac\n", " Building wheel for julius (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for julius: filename=julius-0.2.7-py3-none-any.whl size=21869 sha256=c4d0f47e1e8d2846ed38f7ba03d944e5ed1068278a16c671c7b946053b7f7446\n", " Stored in directory: /root/.cache/pip/wheels/b9/b2/05/f883527ffcb7f2ead5438a2c23439aa0c881eaa9a4c80256f4\n", "Successfully built antlr4-python3-runtime docopt julius\n", "Installing collected packages: primePy, docopt, antlr4-python3-runtime, tensorboardX, semver, ruamel.yaml.clib, omegaconf, Mako, lightning-utilities, humanfriendly, ctranslate2, colorlog, av, ruamel.yaml, pyannote.core, coloredlogs, alembic, tokenizers, optuna, onnxruntime, hyperpyyaml, transformers, pyannote.database, faster-whisper, pyannote.pipeline, pyannote.metrics, torchmetrics, torch-pitch-shift, pytorch-lightning, julius, torch-audiomentations, speechbrain, pytorch-metric-learning, lightning, asteroid-filterbanks, pyannote.audio, whisperx\n", " Attempting uninstall: tokenizers\n", " Found existing installation: tokenizers 0.19.1\n", " Uninstalling tokenizers-0.19.1:\n", " Successfully uninstalled tokenizers-0.19.1\n", " Attempting uninstall: transformers\n", " Found existing installation: transformers 4.44.2\n", " Uninstalling transformers-4.44.2:\n", " Successfully uninstalled 
transformers-4.44.2\n", " Running setup.py develop for whisperx\n", "Successfully installed Mako-1.3.5 alembic-1.13.3 antlr4-python3-runtime-4.9.3 asteroid-filterbanks-0.4.0 av-11.0.0 coloredlogs-15.0.1 colorlog-6.8.2 ctranslate2-4.4.0 docopt-0.6.2 faster-whisper-1.0.0 humanfriendly-10.0 hyperpyyaml-1.2.2 julius-0.2.7 lightning-2.3.3 lightning-utilities-0.11.7 omegaconf-2.3.0 onnxruntime-1.19.2 optuna-4.0.0 primePy-1.3 pyannote.audio-3.1.1 pyannote.core-5.0.0 pyannote.database-5.1.0 pyannote.metrics-3.2.1 pyannote.pipeline-3.0.1 pytorch-lightning-2.3.3 pytorch-metric-learning-2.6.0 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.8 semver-3.0.2 speechbrain-1.0.1 tensorboardX-2.6.2.2 tokenizers-0.15.2 torch-audiomentations-0.11.1 torch-pitch-shift-1.2.5 torchmetrics-1.4.2 transformers-4.39.3 whisperx-3.1.1\n", "Converting requirements.txt to GBK encoding...\n", "Conversion completed.\n", "Installing dependencies from requirements.txt...\n", "Collecting azure-cognitiveservices-speech==1.40.0 (from -r requirements.txt (line 1))\n", " Downloading azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n", "Requirement already satisfied: librosa==0.10.2.post1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (0.10.2.post1)\n", "Requirement already satisfied: moviepy==1.0.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (1.0.3)\n", "Requirement already satisfied: numpy==1.26.4 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (1.26.4)\n", "Collecting openai==1.47.0 (from -r requirements.txt (line 5))\n", " Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)\n", "Requirement already satisfied: opencv-python==4.10.0.84 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (4.10.0.84)\n", "Requirement already satisfied: openpyxl==3.1.5 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (3.1.5)\n", 
"Collecting pandas==2.2.3 (from -r requirements.txt (line 8))\n", " Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.9/89.9 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pydub==0.25.1 (from -r requirements.txt (line 9))\n", " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n", "Requirement already satisfied: PyYAML==6.0.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 10)) (6.0.2)\n", "Collecting replicate==0.33.0 (from -r requirements.txt (line 11))\n", " Downloading replicate-0.33.0-py3-none-any.whl.metadata (25 kB)\n", "Requirement already satisfied: requests==2.32.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 12)) (2.32.3)\n", "Collecting resampy==0.4.3 (from -r requirements.txt (line 13))\n", " Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)\n", "Requirement already satisfied: spacy==3.7.6 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 14)) (3.7.6)\n", "Collecting streamlit==1.38.0 (from -r requirements.txt (line 15))\n", " Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)\n", "Collecting yt-dlp==2024.8.6 (from -r requirements.txt (line 16))\n", " Downloading yt_dlp-2024.8.6-py3-none-any.whl.metadata (170 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m170.1/170.1 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting json-repair (from -r requirements.txt (line 17))\n", " Downloading json_repair-0.29.7-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (3.0.1)\n", "Requirement already satisfied: scipy>=1.2.0 in 
/usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.13.1)\n", "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.5.2)\n", "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.4.2)\n", "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.4.2)\n", "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.60.0)\n", "Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.12.1)\n", "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.8.2)\n", "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.5.0.post1)\n", "Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.12.2)\n", "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.4)\n", "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.0.8)\n", "Requirement already satisfied: tqdm<5.0,>=4.11.2 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (4.66.5)\n", "Requirement already satisfied: proglog<=1.0.0 in /usr/local/lib/python3.10/dist-packages (from 
moviepy==1.0.3->-r requirements.txt (line 3)) (0.1.10)\n", "Requirement already satisfied: imageio<3.0,>=2.5 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (2.35.1)\n", "Requirement already satisfied: imageio-ffmpeg>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (0.5.1)\n", "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (3.7.1)\n", "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (1.7.0)\n", "Collecting httpx<1,>=0.23.0 (from openai==1.47.0->-r requirements.txt (line 5))\n", " Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n", "Collecting jiter<1,>=0.4.0 (from openai==1.47.0->-r requirements.txt (line 5))\n", " Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n", "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (2.9.2)\n", "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (1.3.1)\n", "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl==3.1.5->-r requirements.txt (line 7)) (1.1.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2024.2)\n", "Requirement already satisfied: packaging in 
/usr/local/lib/python3.10/dist-packages (from replicate==0.33.0->-r requirements.txt (line 11)) (24.1)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (2024.8.30)\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.0.12)\n", "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.0.5)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.0.10)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (2.0.8)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.0.9)\n", "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (8.2.5)\n", "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.1.3)\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from 
spacy==3.7.6->-r requirements.txt (line 14)) (2.4.8)\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (2.0.10)\n", "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (0.4.1)\n", "Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (0.12.5)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.1.4)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (71.0.4)\n", "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.4.1)\n", "Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (4.2.2)\n", "Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (1.4)\n", "Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (5.5.0)\n", "Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (8.1.7)\n", "Requirement already satisfied: pillow<11,>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (10.4.0)\n", "Requirement already satisfied: protobuf<6,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (3.20.3)\n", "Requirement already satisfied: pyarrow>=7.0 in 
/usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (16.1.0)\n", "Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (13.8.1)\n", "Collecting tenacity<9,>=8.1.0 (from streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)\n", "Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (0.10.2)\n", "Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)\n", "Collecting pydeck<1,>=0.8.0b4 (from streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)\n", "Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (6.3.3)\n", "Collecting watchdog<5,>=2.1.5 (from streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)\n", "Collecting brotli (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n", " Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)\n", "Collecting mutagen (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n", " Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)\n", "Collecting pycryptodomex (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n", " Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n", "Collecting websockets>=12.0 (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n", " Downloading 
websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n", "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.4)\n", "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (4.23.0)\n", "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.12.1)\n", "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai==1.47.0->-r requirements.txt (line 5)) (1.2.2)\n", "Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n", "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.47.0->-r requirements.txt (line 5))\n", " Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)\n", "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.47.0->-r requirements.txt (line 5))\n", " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", "Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.10/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.2.0)\n", "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.43.0)\n", "Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.3.6)\n", "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages 
(from pydantic<3,>=1.9.0->openai==1.47.0->-r requirements.txt (line 5)) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai==1.47.0->-r requirements.txt (line 5)) (2.23.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->spacy==3.7.6->-r requirements.txt (line 14)) (2.1.5)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas==2.2.3->-r requirements.txt (line 8)) (1.16.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (2.18.0)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa==0.10.2.post1->-r requirements.txt (line 2)) (3.5.0)\n", "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.17.1)\n", "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r requirements.txt (line 14)) (0.7.11)\n", "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r requirements.txt (line 14)) (0.1.5)\n", "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0.0,>=0.3.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.5.4)\n", "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from 
weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r requirements.txt (line 14)) (0.19.0)\n", "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r requirements.txt (line 14)) (7.0.4)\n", "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (2.22)\n", "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit==1.38.0->-r requirements.txt (line 15))\n", " Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n", "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (24.2.0)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (2023.12.1)\n", "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.35.1)\n", "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.20.0)\n", "Requirement already satisfied: marisa-trie>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.2.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.1.2)\n", "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r 
requirements.txt (line 14)) (1.16.0)\n", "Downloading azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl (40.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.1/40.1 MB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading openai-1.47.0-py3-none-any.whl (375 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m375.6/375.6 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m73.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", "Downloading replicate-0.33.0-py3-none-any.whl (45 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.3/45.3 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m69.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading yt_dlp-2024.8.6-py3-none-any.whl (3.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading json_repair-0.29.7-py3-none-any.whl (17 kB)\n", "Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.3/207.3 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (318 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.9/6.9 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tenacity-8.5.0-py3-none-any.whl (28 kB)\n", "Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl (82 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.9/82.9 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (164 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m66.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading mutagen-1.47.0-py3-none-any.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.4/194.4 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n", "Installing collected packages: pydub, brotli, websockets, watchdog, tenacity, smmap, pycryptodomex, mutagen, json-repair, jiter, h11, azure-cognitiveservices-speech, yt-dlp, resampy, pydeck, pandas, httpcore, gitdb, httpx, gitpython, replicate, openai, streamlit\n", " Attempting uninstall: tenacity\n", " Found existing installation: tenacity 9.0.0\n", " Uninstalling tenacity-9.0.0:\n", " Successfully uninstalled tenacity-9.0.0\n", " Attempting uninstall: pandas\n", " Found existing installation: pandas 2.2.2\n", " Uninstalling pandas-2.2.2:\n", " Successfully uninstalled pandas-2.2.2\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "cudf-cu12 24.6.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.\n", "google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed azure-cognitiveservices-speech-1.40.0 brotli-1.1.0 gitdb-4.0.11 gitpython-3.1.43 h11-0.14.0 httpcore-1.0.6 httpx-0.27.2 jiter-0.5.0 json-repair-0.29.7 mutagen-1.47.0 openai-1.47.0 pandas-2.2.3 pycryptodomex-3.21.0 pydeck-0.9.1 pydub-0.25.1 replicate-0.33.0 resampy-0.4.3 smmap-5.0.1 streamlit-1.38.0 tenacity-8.5.0 watchdog-4.0.2 websockets-13.1 yt-dlp-2024.8.6\n", "Downloading UVR model: HP2_all_vocals.pth...\n", "Downloaded: 0.01%\n", "HP2_all_vocals.pth downloaded successfully.\n", "Downloading UVR model: VR-DeEchoAggressive.pth...\n", "Downloaded: 0.00%\n", "VR-DeEchoAggressive.pth downloaded successfully.\n", "Downloading FFmpeg...\n", "FFmpeg has been downloaded to ffmpeg.tar.xz\n", "Extracting FFmpeg...\n", "Cleaning up...\n", "FFmpeg extraction completed.\n", "\u001b[1;32m╭───────────────────────────────────────╮\u001b[0m\n", "\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mAll installation steps are completed!\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n", "\u001b[1;32m╰───────────────────────────────────────╯\u001b[0m\n", "Please use the following command to start Streamlit:\n", "\u001b[1;36mstreamlit run st.py\u001b[0m\n" ] } ], "source": [ "!python install.py" ] }, { "cell_type": "markdown", "metadata": { "id": "5J2jjEY2BrM3" }, "source": [ "## 3. Register and Obtain Ngrok Token 🔑\n", "\n", "1. Visit the [Ngrok official website](https://ngrok.com/) and register for an account.\n", "2. After logging in, find the \"Your Authtoken\" section on the dashboard page, or directly visit [Ngrok Token](https://dashboard.ngrok.com/get-started/your-authtoken).\n", "3. 
Copy your Ngrok Authtoken.\n", "\n", "After completing these steps, please fill in your ngrok token in the next section of code and proceed.\n", "\n", "---\n", "\n", "## 3. 注册并获取 Ngrok 令牌 🔑\n", "\n", "1. 访问 [Ngrok 官方网站](https://ngrok.com/) 并注册账户。\n", "2. 登录后,在仪表板页面找到\"Your Authtoken\"部分,或直接访问 [Ngrok 令牌](https://dashboard.ngrok.com/get-started/your-authtoken)。\n", "3. 复制您的 Ngrok Authtoken。\n", "\n", "完成这些步骤后,请在下一节代码中填入您的 ngrok 令牌并继续。" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tm25rblnBqhl", "outputId": "93691228-1e7b-48e7-b7f3-c27007a4a13f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pyngrok in /usr/local/lib/python3.10/dist-packages (7.2.0)\n", "Requirement already satisfied: PyYAML>=5.1 in /usr/local/lib/python3.10/dist-packages (from pyngrok) (6.0.2)\n" ] } ], "source": [ "!pip install pyngrok\n", "from pyngrok import ngrok\n", "\n", "#! SET Ngrok Authtoken Here\n", "ngrok.set_auth_token(\"YOUR_TOKEN_HERE\")" ] }, { "cell_type": "markdown", "metadata": { "id": "MDJXtgMuGXMH" }, "source": [ "## 🎈 4. Streamlit GO !!!\n", "Click the NgrokTunnel URL to start your VideoLingo Journey.\n", "\n", "> Tip: You can set your language in the sidebar on the left." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 294 }, "id": "qr_a4mS29k5_", "outputId": "e6915708-d030-45d0-d6a8-cb177960c137" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.\n", "\n" ] }, { "data": { "text/html": [ "
╭───────────────────────────────────╮\n",
              "│ Streamlit is available at Ngrok ⬇️ │\n",
              "╰───────────────────────────────────╯\n",
              "
\n" ], "text/plain": [ "╭───────────────────────────────────╮\n", "│ Streamlit is available at Ngrok ⬇️ │\n", "╰───────────────────────────────────╯\n" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Click 👉 NgrokTunnel: \"https://0308-34-125-196-161.ngrok-free.app\" -> \"http://localhost:8501\"\n", "\n", " You can now view your Streamlit app in your browser.\n", "\n", " Local URL: http://localhost:8501\n", " Network URL: http://172.28.0.12:8501\n", " External URL: http://34.125.196.161:8501\n", "\n", " Stopping...\n", "Interrupted by user, shutting down...\n" ] } ], "source": [ "import subprocess\n", "import threading\n", "import sys\n", "from pyngrok import ngrok\n", "from rich import print as rprint\n", "from rich.panel import Panel\n", "\n", "def print_output(process):\n", " for line in iter(process.stdout.readline, ''):\n", " sys.stdout.write(line)\n", " for line in iter(process.stderr.readline, ''):\n", " sys.stderr.write(line)\n", "\n", "# Start Streamlit\n", "streamlit_process = subprocess.Popen(\n", " [\"streamlit\", \"run\", \"st.py\"],\n", " stdout=subprocess.PIPE,\n", " stderr=subprocess.PIPE,\n", " universal_newlines=True,\n", " bufsize=1\n", ")\n", "\n", "# Create and start the output printing thread\n", "output_thread = threading.Thread(target=print_output, args=(streamlit_process,))\n", "output_thread.start()\n", "\n", "# Create a tunnel using ngrok\n", "public_url = ngrok.connect(8501)\n", "rprint(Panel(f\"Streamlit is available at Ngrok ⬇️\", expand=False))\n", "print(f\"Click 👉 {public_url}\")\n", "\n", "# Keep the program running\n", "ngrok_process = ngrok.get_ngrok_process()\n", "try:\n", " streamlit_process.wait()\n", "except KeyboardInterrupt:\n", " print(\"Interrupted by user, shutting down...\")\n", "finally:\n", " ngrok.kill()\n", " streamlit_process.terminate()\n", " output_thread.join()\n" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] 
}, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: batch/OneKeyBatch.bat ================================================ @echo off cd /D "%~dp0" cd .. call conda activate videolingo @rem 运行批处理脚本 call python batch\utils\batch_processor.py :end pause ================================================ FILE: batch/README.md ================================================ # VideoLingo Batch Mode [English](./README.md) | [简体中文](./README.zh.md) Before utilizing the batch mode, ensure you have used the Streamlit mode and properly configured the parameters in `config.yaml`. ## Usage Guide ### 1. Video File Preparation - Place your video files in the `input` folder - YouTube links can be specified in the next step ### 2. Task Configuration Edit the `tasks_setting.xlsx` file: | Field | Description | Acceptable Values | |-------|-------------|-------------------| | Video File | Video filename (without `input/` prefix) or YouTube URL | - | | Source Language | Source language | 'en', 'zh', ... or leave empty for default | | Target Language | Translation language | Use natural language description, or leave empty for default | | Dubbing | Enable dubbing | 0 or empty: no dubbing; 1: enable dubbing | Example: | Video File | Source Language | Target Language | Dubbing | |------------|-----------------|-----------------|---------| | https://www.youtube.com/xxx | | German | | | Kungfu Panda.mp4 | | | 1 | ### 3. Executing Batch Processing 1. Double-click to run `OneKeyBatch.bat` 2. Output files will be saved in the `output` folder 3. Task status can be monitored in the `Status` column of `tasks_setting.xlsx` > Note: Keep `tasks_setting.xlsx` closed during execution to prevent interruptions due to file access conflicts. 
## Important Considerations ### Handling Interruptions If the command line is closed unexpectedly, language settings in `config.yaml` may be altered. Check settings before retrying. ### Error Management - Failed files will be moved to the `output/ERROR` folder - Error messages are recorded in the `Status` column of `tasks_setting.xlsx` - To retry: 1. Move the single video folder from `ERROR` to the root directory 2. Rename it to `output` 3. Use Streamlit mode to process again ================================================ FILE: batch/README.zh.md ================================================ # VideoLingo Batch Mode [English](./README.md) | [简体中文](./README.zh.md) 在使用批处理模式前,请确保你已经使用过 Streamlit 模式并正确设置了 `config.yaml` 中的参数。 ## 使用方法 ### 1. 准备视频文件 - 将要处理的视频文件放入 `input` 文件夹 - YouTube 链接可在下一步填写 ### 2. 配置任务 编辑 `tasks_setting.xlsx` 文件: | 字段 | 说明 | 可选值 | |------|------|--------| | Video File | 视频文件名(无需 `input/` 前缀)或 YouTube 链接 | - | | Source Language | 源语言 | 'en', 'zh', ... 或留空使用默认设置 | | Target Language | 翻译语言 | 使用自然语言描述,或留空使用默认设置 | | Dubbing | 是否配音 | 0 或留空:不配音;1:配音 | 示例: | Video File | Source Language | Target Language | Dubbing | |------------|-----------------|-----------------|---------| | https://www.youtube.com/xxx | | German | | | Kungfu Panda.mp4 | | | 1 | ### 3. 运行批处理 1. 双击运行 `OneKeyBatch.bat` 2. 输出文件将保存在 `output` 文件夹 3. 任务状态可在 `tasks_setting.xlsx` 的 `Status` 列查看 > 注意在运行时保持 `tasks_setting.xlsx` 关闭,否则会因占用无法写入而中断。 ## 注意事项 ### 中断处理 如果中途关闭命令行,`config.yaml` 中的语言设置可能会改变。重试前请检查设置。 ### 错误处理 - 处理失败的文件会被移至 `output/ERROR` 文件夹 - 错误信息记录在 `tasks_setting.xlsx` 的 `Status` 列 - 如需重试: 1. 将 `ERROR` 下的单个视频文件夹移至根目录 2. 重命名为 `output` 3. 
def record_and_update_config(source_language, target_language):
    """Apply per-task language overrides and return the values they replaced.

    Both current settings are read with ``load_key`` before anything is
    written. A setting is then overwritten with ``update_key`` only when the
    corresponding argument is truthy and not NaN.

    Args:
        source_language: Override for ``whisper.language``; falsy or NaN
            leaves the setting untouched.
        target_language: Override for ``target_language``; falsy or NaN
            leaves the setting untouched.

    Returns:
        tuple: ``(original_source_lang, original_target_lang)`` as they were
        before any override was applied.
    """
    overrides = (
        ('whisper.language', source_language),
        ('target_language', target_language),
    )

    # Snapshot both settings first so the caller can restore them afterwards.
    previous_values = tuple(load_key(config_key) for config_key, _ in overrides)

    for config_key, requested in overrides:
        if requested and not pd.isna(requested):
            update_key(config_key, requested)

    return previous_values
def check_settings():
    """Validate the batch task sheet against the contents of the input folder.

    Reads ``SETTINGS_FILE`` with pandas and reports every problem as a rich
    panel on ``console``:

    * files found in ``INPUT_FOLDER`` that no ``Video File`` cell mentions;
    * rows whose ``Video File`` is neither a string starting with ``http``
      nor the name of an existing file in ``INPUT_FOLDER`` (empty cells are
      reported here instead of crashing);
    * rows whose ``Dubbing`` cell is filled in but is not one of
      ``VALID_DUBBING_VALUES`` (non-numeric values are reported instead of
      crashing).

    ``INPUT_FOLDER`` is created if it does not exist yet.

    Returns:
        bool: ``True`` when every check passed, ``False`` otherwise.
    """
    os.makedirs(INPUT_FOLDER, exist_ok=True)
    df = pd.read_excel(SETTINGS_FILE)

    input_files = set(os.listdir(INPUT_FOLDER))
    excel_files = set(df['Video File'].tolist())
    files_not_in_excel = input_files - excel_files

    all_passed = True
    local_video_tasks = 0
    url_tasks = 0

    if files_not_in_excel:
        # Sorted so the listing is stable from run to run (set order is arbitrary).
        console.print(Panel(
            "\n".join(f"- {file}" for file in sorted(files_not_in_excel)),
            title="[bold red]Warning: Files in input folder not mentioned in Excel sheet",
            expand=False
        ))
        all_passed = False

    for index, row in df.iterrows():
        video_file = row['Video File']
        dubbing = row['Dubbing']
        # NOTE(review): index + 2 presumably maps the 0-based DataFrame index to
        # the spreadsheet row number (header row + 1-based rows) - confirm.
        error_title = f"[bold red]Error in row {index + 2}"

        # An empty cell is read as NaN (a float), which has no ``startswith``;
        # treat anything that is not a string as an invalid entry.
        if isinstance(video_file, str) and video_file.startswith('http'):
            url_tasks += 1
        elif isinstance(video_file, str) and os.path.isfile(os.path.join(INPUT_FOLDER, video_file)):
            local_video_tasks += 1
        else:
            console.print(Panel(f"Invalid video file or URL 「{video_file}」", title=error_title, expand=False))
            all_passed = False

        if not pd.isna(dubbing):
            try:
                dubbing_is_valid = int(dubbing) in VALID_DUBBING_VALUES
            except (TypeError, ValueError):
                # e.g. "yes" typed into the cell: report it rather than abort the check.
                dubbing_is_valid = False
            if not dubbing_is_valid:
                console.print(Panel(f"Invalid dubbing value 「{dubbing}」", title=error_title, expand=False))
                all_passed = False

    if all_passed:
        console.print(Panel(f"✅ All settings passed the check!\nDetected {local_video_tasks} local video tasks and {url_tasks} URL tasks.", title="[bold green]Success", expand=False))

    return all_passed
partial(_2_asr.transcribe)), ("✂️ Splitting sentences", split_sentences), ("📝 Summarizing and translating", summarize_and_translate), ("⚡ Processing and aligning subtitles", process_and_align_subtitles), ("🎬 Merging subtitles to video", _7_sub_into_vid.merge_subtitles_to_video), ] if dubbing: dubbing_steps = [ ("🔊 Generating audio tasks", gen_audio_tasks), ("🎵 Extracting reference audio", _9_refer_audio.extract_refer_audio_main), ("🗣️ Generating audio", _10_gen_audio.gen_audio), ("🔄 Merging full audio", _11_merge_audio.merge_full_audio), ("🎞️ Merging dubbing to video", _12_dub_to_vid.merge_video_audio), ] text_steps.extend(dubbing_steps) current_step = "" for step_name, step_func in text_steps: current_step = step_name for attempt in range(3): try: console.print(Panel( f"[bold green]{step_name}[/]", subtitle=f"Attempt {attempt + 1}/3" if attempt > 0 else None, border_style="blue" )) result = step_func() if result is not None: globals().update(result) break except Exception as e: if attempt == 2: error_panel = Panel( f"[bold red]Error in step '{current_step}':[/]\n{str(e)}", border_style="red" ) console.print(error_panel) cleanup(ERROR_OUTPUT_DIR) return False, current_step, str(e) console.print(Panel( f"[yellow]Attempt {attempt + 1} failed. Retrying...[/]", border_style="yellow" )) console.print(Panel("[bold green]All steps completed successfully! 
def prepare_output_folder(output_folder):
    """Reset ``output_folder`` to a freshly created, empty directory.

    Any existing directory tree at that path is deleted first; the directory
    (and missing parents) is then created again.

    Args:
        output_folder: Path of the directory to wipe and recreate.
    """
    stale_output_present = os.path.exists(output_folder)
    if stale_output_present:
        # Drop everything left over from a previous run.
        shutil.rmtree(output_folder)

    os.makedirs(output_folder)
Note: for zh model will force to use Belle/large-v3 model: 'large-v3' # Whisper specified recognition language ISO 639-1 language: 'en' detected_language: 'en' # Whisper running mode ["local", "cloud", "elevenlabs"]. Specifies where to run, cloud uses 302.ai API runtime: 'local' # 302.ai API key whisperX_302_api_key: 'your_302_api_key' # ElevenLabs API key (experimental) elevenlabs_api_key: 'your_elevenlabs_api_key' # Whether to burn subtitles into the video burn_subtitles: true ## ======================== Advanced Settings ======================== ## # *🔬 h264_nvenc GPU acceleration for ffmpeg, make sure your GPU supports it ffmpeg_gpu: false # *Youtube settings youtube: cookies_path: '' # *Default resolution for downloading YouTube videos [360, 1080, best] ytb_resolution: '1080' subtitle: # *Maximum length of each subtitle line in characters max_length: 75 # *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting target_multiplier: 1.2 # *Summary length, set low to 2k if using local LLM summary_length: 8000 # *Maximum number of words for the first rough cut, below 18 will cut too finely affecting translation, above 22 is too long and will make subsequent subtitle splitting difficult to align max_split_length: 20 # *Whether to reflect the translation result in the original text reflect_translate: true # *Whether to pause after extracting professional terms and before translation, allowing users to manually adjust the terminology table output\log\terminology.json pause_before_translate: false ## ======================== Dubbing Settings ======================== ## # TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts] tts_method: 'azure_tts' # SiliconFlow FishTTS sf_fish_tts: # SiliconFlow API key api_key: 'YOUR_API_KEY' # only for mode "preset" voice: 'anna' # *only for mode "custom", dont set manually custom_name: '' voice_id: '' # preset, custom, dynamic 
mode: "preset" # OpenAI TTS-1 API configuration, 302.ai API only openai_tts: api_key: 'YOUR_302_API_KEY' voice: 'alloy' # Azure configuration, 302.ai API only azure_tts: api_key: 'YOUR_302_API_KEY' voice: 'zh-CN-YunfengNeural' # FishTTS configuration, 302.ai API only fish_tts: api_key: 'YOUR_302_API_KEY' character: 'AD学姐' character_id_dict: 'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1' '丁真': '54a5170264694bfc8e9ad98df7bd89c3' # SiliconFlow CosyVoice2 Clone sf_cosyvoice2: api_key: 'YOUR_SF_KEY' # Edge TTS configuration edge_tts: voice: 'zh-CN-XiaoxiaoNeural' # SoVITS configuration gpt_sovits: character: 'Huanyuv2' refer_mode: 3 f5tts: 302_api: 'YOUR_302_API_KEY' # *Audio speed range speed_factor: min: 1 accept: 1.2 # Maximum acceptable speed max: 1.4 # *Merge audio configuration min_subtitle_duration: 2.5 # Minimum subtitle duration, will be forcibly extended min_trim_duration: 3.5 # Subtitles shorter than this value won't be split tolerance: 1.5 # Allowed extension time to the next subtitle ## ======================== Additional settings ======================== ## # Whisper model directory model_dir: './_model_cache' # Supported upload video formats allowed_video_formats: - 'mp4' - 'mov' - 'avi' - 'mkv' - 'flv' - 'wmv' - 'webm' allowed_audio_formats: - 'wav' - 'mp3' - 'flac' - 'm4a' # Spacy models spacy_model_map: en: 'en_core_web_md' ru: 'ru_core_news_md' fr: 'fr_core_news_md' ja: 'ja_core_news_md' es: 'es_core_news_md' de: 'de_core_news_md' it: 'it_core_news_md' zh: 'zh_core_web_md' # Languages that use space as separator language_split_with_space: - 'en' - 'es' - 'fr' - 'de' - 'it' - 'ru' # Languages that do not use space as separator language_split_without_space: - 'zh' - 'ja' ================================================ FILE: core/_10_gen_audio.py ================================================ import os import time import shutil import subprocess from typing import Tuple import pandas as pd from pydub import AudioSegment from rich.console import Console 
def parse_df_srt_time(time_str: str) -> float:
    """Convert an ``HH:MM:SS[.fraction]`` timestamp to seconds.

    The seconds field is parsed as a decimal number, so the fraction may have
    any number of digits (``"01.5"`` is 1.5 s, not 1.005 s), may be absent
    (``"00:00:07"``), and may use the SRT comma separator (``"03,250"``).

    Args:
        time_str: Timestamp text; surrounding whitespace is ignored.

    Returns:
        The timestamp expressed in seconds.

    Raises:
        ValueError: If the text does not consist of three ``:``-separated
            numeric fields.
    """
    hours, minutes, seconds = time_str.strip().split(':')
    # Accept both "." and "," as the decimal mark and let float() scale the
    # fraction, instead of assuming exactly three millisecond digits.
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds.replace(',', '.'))
def process_row(row: pd.Series, tasks_df: pd.DataFrame) -> Tuple[int, float]:
    """Synthesize every line of one task row and total the resulting audio length.

    For each entry of the row's ``lines`` value, ``tts_main`` is asked to
    write a temp file named from ``TEMP_FILE_TEMPLATE`` with
    ``"{number}_{line_index}"``, and that file's duration (from
    ``get_audio_duration``) is added to the running total.

    Args:
        row: Task row providing ``number`` and ``lines``. ``lines`` may be a
            list or the textual form of a list literal (as read back from a
            spreadsheet).
        tasks_df: Full task table, forwarded unchanged to ``tts_main``.

    Returns:
        ``(number, real_dur)`` where ``real_dur`` is the summed duration of
        the generated files (``0`` when the row has no lines).

    Raises:
        ValueError, SyntaxError: If ``lines`` is a string that is not a plain
            Python literal.
    """
    import ast  # local import: only needed for decoding the stored list literal

    number = row['number']
    raw_lines = row['lines']
    # literal_eval only accepts literals, so spreadsheet text can no longer
    # execute arbitrary code the way eval() would.
    lines = ast.literal_eval(raw_lines) if isinstance(raw_lines, str) else raw_lines

    real_dur = 0
    for line_index, line in enumerate(lines):
        temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}")
        tts_main(line, temp_file, number, tasks_df)
        real_dur += get_audio_duration(temp_file)
    return number, real_dur
def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]:
    """Choose the playback speed factor for one chunk and whether to keep its gaps.

    Reads the ``real_dur``, ``tol_dur``, ``tolerance`` and ``gap`` columns of
    ``chunk_df``. Four cases are tried in order; the first whose audio,
    played at the ``accept`` speed, fits the available time wins:

    1. audio + gaps fit the chunk without the last row's tolerance;
    2. audio alone fits that span (gaps are dropped);
    3. audio + gaps fit once the last row's tolerance is included;
    4. otherwise audio alone is squeezed into the span including the
       tolerance (gaps are dropped).

    The available time is reduced by a fixed 0.1 margin before dividing, and
    the result is never lower than ``min_speed`` in any of the four cases.

    Args:
        chunk_df: Rows of the chunk, in order; the last row's ``tolerance``
            and ``gap`` are treated specially.
        accept: Speed factor regarded as acceptable when testing the cases.
        min_speed: Lower bound for the returned speed factor.

    Returns:
        ``(speed_factor, keep_gaps)`` with ``speed_factor`` rounded to three
        decimals.
    """
    last_row = chunk_df.iloc[-1]
    chunk_durs = chunk_df['real_dur'].sum()
    tol_durs = chunk_df['tol_dur'].sum()
    # Time available without borrowing the tolerance that follows the last row.
    durations = tol_durs - last_row['tolerance']
    # The gap after the last row lies outside the chunk, so it is excluded.
    all_gaps = chunk_df['gap'].sum() - last_row['gap']
    speed_var_error = 0.1

    audio_with_gaps = chunk_durs + all_gaps
    keep_gaps = True

    if audio_with_gaps / accept < durations:
        speed_factor = audio_with_gaps / (durations - speed_var_error)
    elif chunk_durs / accept < durations:
        speed_factor = chunk_durs / (durations - speed_var_error)
        keep_gaps = False
    elif audio_with_gaps / accept < tol_durs:
        speed_factor = audio_with_gaps / (tol_durs - speed_var_error)
    else:
        speed_factor = chunk_durs / (tol_durs - speed_var_error)
        keep_gaps = False

    # Clamp in every case; previously the last case could fall below min_speed
    # when the final tolerance was large.
    speed_factor = max(min_speed, speed_factor)
    return round(speed_factor, 3), keep_gaps
chunk_start_time for i, row in chunk_df.iterrows(): # If i is not 0, which is not the first row of the chunk, cur_time needs to be added with the gap of the previous row, remember to divide by speed_factor if i != 0 and keep_gaps: cur_time += chunk_df.iloc[i-1]['gap']/speed_factor new_sub_times = [] number = row['number'] lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines'] for line_index, line in enumerate(lines): # 🔄 Step2: Start speed change and save as OUTPUT_FILE_TEMPLATE temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}") output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}") adjust_audio_speed(temp_file, output_file, speed_factor) ad_dur = get_audio_duration(output_file) new_sub_times.append([cur_time, cur_time+ad_dur]) cur_time += ad_dur # 🔄 Step3: Find corresponding main DataFrame index and update new_sub_times main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0] tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times # 🎯 Step4: Choose emoji based on speed_factor and accept comparison emoji = "⚡" if speed_factor <= accept else "⚠️" rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]") # 🔄 Step5: Check if the last row exceeds the range if cur_time > chunk_end_time: time_diff = cur_time - chunk_end_time if time_diff <= 0.6: # If exceeding time is within 0.6 seconds, truncate the last audio rprint(f"[yellow]⚠️ Chunk {chunk_start} to {index} exceeds by {time_diff:.3f}s, truncating last audio[/yellow]") # Get the last audio file last_number = tasks_df.iloc[index]['number'] last_lines = eval(tasks_df.iloc[index]['lines']) if isinstance(tasks_df.iloc[index]['lines'], str) else tasks_df.iloc[index]['lines'] last_line_index = len(last_lines) - 1 last_file = OUTPUT_FILE_TEMPLATE.format(f"{last_number}_{last_line_index}") # Calculate the duration to keep audio = AudioSegment.from_wav(last_file) original_duration = len(audio) / 1000 # Convert to 
def gen_audio() -> None:
    """Entry point: synthesize audio for every task and store the adjusted timeline.

    Creates the temp and segment directories, loads the task sheet from
    ``_8_1_AUDIO_TASK``, runs ``generate_tts_audio`` followed by
    ``merge_chunks`` on it, and writes the resulting table back to the same
    spreadsheet.
    """
    rprint("[bold magenta]🚀 Starting audio generation process...[/bold magenta]")

    # Working directories for raw TTS output and speed-adjusted segments
    for working_dir in (_AUDIO_TMP_DIR, _AUDIO_SEGS_DIR):
        os.makedirs(working_dir, exist_ok=True)

    # Task sheet to work from
    loaded_tasks = pd.read_excel(_8_1_AUDIO_TASK)
    rprint("[green]📊 Loaded task file successfully[/green]")

    # TTS synthesis first, then chunk merging on its result
    timed_tasks = merge_chunks(generate_tts_audio(loaded_tasks))

    # Persist the updated table over the original sheet
    timed_tasks.to_excel(_8_1_AUDIO_TASK, index=False)
    rprint("[bold green]🎉 Audio generation completed successfully![/bold green]")
def process_audio_segment(audio_file):
    """Re-encode one audio file to 16 kHz mono 64 kbit/s MP3 and load it with pydub.

    ffmpeg writes the compressed copy to ``"{audio_file}_temp.mp3"``; that
    temporary file is always removed again, whether or not loading succeeds.

    Args:
        audio_file: Path of the audio file to compress.

    Returns:
        The ``AudioSegment`` decoded from the compressed copy.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message
            carries ffmpeg's stderr output.
    """
    temp_file = f"{audio_file}_temp.mp3"
    ffmpeg_cmd = [
        'ffmpeg', '-y',
        '-i', audio_file,
        '-ar', '16000',
        '-ac', '1',
        '-b:a', '64k',
        temp_file
    ]

    completed = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    try:
        if completed.returncode != 0:
            # Fail here with ffmpeg's own diagnostics instead of a confusing
            # decode error on a missing or partial temp file.
            details = completed.stderr.decode('utf-8', errors='replace').strip()
            raise RuntimeError(f"ffmpeg failed to compress {audio_file}: {details}")
        return AudioSegment.from_mp3(temp_file)
    finally:
        # Never leave the temporary mp3 behind, even when decoding raises.
        if os.path.exists(temp_file):
            os.remove(temp_file)
def create_srt_subtitle():
    """Write the dubbing subtitles to ``DUB_SUB_FILE`` in SRT format.

    Lines and their ``[start, end]`` times come from
    ``load_and_flatten_data(_8_1_AUDIO_TASK)``; entries are paired in order
    and numbered from 1. Times are rounded to the nearest millisecond before
    being split into ``HH:MM:SS,mmm`` so that floating-point noise
    (``1.001 * 1000 == 1000.9999999999999``) cannot drop a millisecond.
    """
    def _to_srt_timestamp(seconds):
        # Work in whole milliseconds to keep all four fields consistent.
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    _, lines, new_sub_times = load_and_flatten_data(_8_1_AUDIO_TASK)

    with open(DUB_SUB_FILE, 'w', encoding='utf-8') as f:
        for i, ((start_time, end_time), line) in enumerate(zip(new_sub_times, lines), 1):
            start_str = _to_srt_timestamp(start_time)
            end_str = _to_srt_timestamp(end_time)

            f.write(f"{i}\n")
            f.write(f"{start_str} --> {end_str}\n")
            f.write(f"{line}\n\n")

    rprint(f"[bold green]✅ Subtitle file created: {DUB_SUB_FILE}[/bold green]")
file...[/bold cyan]"): merged_audio = merged_audio.set_frame_rate(16000).set_channels(1) merged_audio.export(DUB_VOCAL_FILE, format="mp3", parameters=["-b:a", "64k"]) console.print(f"[bold green]✅ Audio file successfully merged![/bold green]") console.print(f"[bold green]📁 Output file: {DUB_VOCAL_FILE}[/bold green]") if __name__ == "__main__": merge_full_audio() ================================================ FILE: core/_12_dub_to_vid.py ================================================ import platform import subprocess import cv2 import numpy as np from rich.console import Console from core._1_ytdlp import find_video_files from core.asr_backend.audio_preprocess import normalize_audio_volume from core.utils import * from core.utils.models import * console = Console() DUB_VIDEO = "output/output_dub.mp4" DUB_SUB_FILE = 'output/dub.srt' DUB_AUDIO = 'output/dub.mp3' TRANS_FONT_SIZE = 17 TRANS_FONT_NAME = 'Arial' if platform.system() == 'Linux': TRANS_FONT_NAME = 'NotoSansCJK-Regular' if platform.system() == 'Darwin': TRANS_FONT_NAME = 'Arial Unicode MS' TRANS_FONT_COLOR = '&H00FFFF' TRANS_OUTLINE_COLOR = '&H000000' TRANS_OUTLINE_WIDTH = 1 TRANS_BACK_COLOR = '&H33000000' def merge_video_audio(): """Merge video and audio, and reduce video volume""" VIDEO_FILE = find_video_files() background_file = _BACKGROUND_AUDIO_FILE if not load_key("burn_subtitles"): rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as subtitles are not burned in.[/bold yellow]") # Create a black frame frame = np.zeros((1080, 1920, 3), dtype=np.uint8) fourcc = cv2.VideoWriter_fourcc(*'mp4v') out = cv2.VideoWriter(DUB_VIDEO, fourcc, 1, (1920, 1080)) out.write(frame) out.release() rprint("[bold green]Placeholder video has been generated.[/bold green]") return # Normalize dub audio normalized_dub_audio = 'output/normalized_dub.wav' normalize_audio_volume(DUB_AUDIO, normalized_dub_audio) # Merge video and audio with translated subtitles video = 
def sanitize_filename(filename):
    """Return ``filename`` with characters that are illegal in file names removed.

    The characters ``< > : " / \\ | ? *`` are dropped, then leading/trailing
    dots and spaces are stripped. ``'video'`` is returned when nothing is left.
    """
    # Remove illegal characters
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    # Ensure filename doesn't start or end with a dot or space
    filename = filename.strip('. ')
    # Use default name if filename is empty
    return filename if filename else 'video'


def update_ytdlp():
    """Upgrade yt-dlp through pip and return the ``YoutubeDL`` class.

    A failed upgrade only prints a warning; the currently installed yt-dlp
    is imported and used in that case.
    """
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"])
        # Drop the cached module so the import below picks up the new version.
        if 'yt_dlp' in sys.modules:
            del sys.modules['yt_dlp']
        rprint("[green]yt-dlp updated[/green]")
    except subprocess.CalledProcessError as e:
        # f-prefix was missing, so the message printed a literal "{e}".
        rprint(f"[yellow]Warning: Failed to update yt-dlp: {e}[/yellow]")
    from yt_dlp import YoutubeDL
    return YoutubeDL


def download_video_ytdlp(url, save_path='output', resolution='1080'):
    """Download ``url`` with yt-dlp into ``save_path`` and sanitize file names.

    Args:
        url: Address of the video to download.
        save_path: Target directory; created when missing.
        resolution: Maximum video height, or ``'best'`` for no height limit.
    """
    os.makedirs(save_path, exist_ok=True)
    ydl_opts = {
        'format': 'bestvideo+bestaudio/best' if resolution == 'best' else f'bestvideo[height<={resolution}]+bestaudio/best[height<={resolution}]',
        'outtmpl': f'{save_path}/%(title)s.%(ext)s',
        'noplaylist': True,
        'writethumbnail': True,
        'postprocessors': [{'key': 'FFmpegThumbnailsConvertor', 'format': 'jpg'}],
    }

    # Read Youtube Cookie File (guard against an unset/None config value,
    # which os.path.exists would reject with a TypeError)
    cookies_path = load_key("youtube.cookies_path")
    if cookies_path and os.path.exists(cookies_path):
        ydl_opts["cookiefile"] = str(cookies_path)

    # Get YoutubeDL class after updating
    YoutubeDL = update_ytdlp()
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Check and rename files after download
    for file in os.listdir(save_path):
        if os.path.isfile(os.path.join(save_path, file)):
            filename, ext = os.path.splitext(file)
            new_filename = sanitize_filename(filename)
            if new_filename != filename:
                os.rename(os.path.join(save_path, file), os.path.join(save_path, new_filename + ext))
@check_file_exists(_2_CLEANED_CHUNKS)
def transcribe():
    """Run the speech-recognition stage and save the resulting table.

    Steps: convert the video to audio, optionally run Demucs and normalize
    the vocal track, split the raw audio into segments, transcribe every
    segment with the backend named by ``whisper.runtime`` and save the
    combined result.

    Raises:
        ValueError: If ``whisper.runtime`` is not ``local``, ``cloud`` or
            ``elevenlabs``.
    """
    # 1. video to audio
    video_file = find_video_files()
    convert_video_to_audio(video_file)

    # 2. Demucs vocal separation:
    if load_key("demucs"):
        demucs_audio()
        vocal_audio = normalize_audio_volume(_VOCAL_AUDIO_FILE, _VOCAL_AUDIO_FILE, format="mp3")
    else:
        vocal_audio = _RAW_AUDIO_FILE

    # 3. Extract audio
    segments = split_audio(_RAW_AUDIO_FILE)

    # 4. Transcribe audio by clips
    all_results = []
    runtime = load_key("whisper.runtime")
    if runtime == "local":
        from core.asr_backend.whisperX_local import transcribe_audio as ts
        rprint("[cyan]🎤 Transcribing audio with local model...[/cyan]")
    elif runtime == "cloud":
        from core.asr_backend.whisperX_302 import transcribe_audio_302 as ts
        rprint("[cyan]🎤 Transcribing audio with 302 API...[/cyan]")
    elif runtime == "elevenlabs":
        from core.asr_backend.elevenlabs_asr import transcribe_audio_elevenlabs as ts
        rprint("[cyan]🎤 Transcribing audio with ElevenLabs API...[/cyan]")
    else:
        # Without this branch `ts` stays unbound and the loop below fails
        # with a confusing NameError instead of pointing at the config.
        raise ValueError(f"Unsupported whisper.runtime: {runtime!r}")

    for start, end in segments:
        result = ts(_RAW_AUDIO_FILE, vocal_audio, start, end)
        all_results.append(result)

    # 5. Combine results
    combined_result = {'segments': []}
    for result in all_results:
        combined_result['segments'].extend(result['segments'])

    # 6. Process df
    df = process_transcription(combined_result)
    save_results(df)
def split_sentence(sentence, num_parts, word_limit=20, index=-1, retry_attempt=0):
    """Split a long sentence using GPT and return the result as a string.

    GPT proposes a split marked with ``[br]``; the markers are mapped back
    onto the original text with ``find_split_positions`` so the returned
    string consists only of characters from ``sentence`` plus newlines.

    Args:
        sentence: Text to split.
        num_parts: Number of parts requested in the prompt.
        word_limit: Word limit passed to the prompt builder.
        index: Sentence index used for the progress message; ``-1`` disables it.
        retry_attempt: Number of spaces appended to the prompt, so each retry
            sends a slightly different request (presumably to avoid a cached
            answer — verify against ``ask_gpt``).

    Returns:
        str: ``sentence`` with ``'\\n'`` inserted at every split position, or
        ``sentence`` unchanged when no position could be located.
    """
    split_prompt = get_split_prompt(sentence, num_parts, word_limit)

    def valid_split(response_data):
        # A response without "choice" is reported as invalid instead of
        # raising KeyError inside the validator.
        if "choice" not in response_data:
            return {"status": "error", "message": "Missing required key: `choice`"}
        choice = response_data["choice"]
        if f'split{choice}' not in response_data:
            return {"status": "error", "message": "Missing required key: `split`"}
        if "[br]" not in response_data[f"split{choice}"]:
            return {"status": "error", "message": "Split failed, no [br] found"}
        return {"status": "success", "message": "Split completed"}

    response_data = ask_gpt(split_prompt + " " * retry_attempt, resp_type='json', valid_def=valid_split, log_title='split_by_meaning')
    choice = response_data["choice"]
    gpt_split = response_data[f"split{choice}"]
    split_points = find_split_positions(sentence, gpt_split)

    # Cut the ORIGINAL sentence at the located positions. Building the result
    # from slices also covers the "no split point found" case, which used to
    # return the raw GPT text including its [br] markers.
    bounds = [0, *split_points, len(sentence)]
    best_split = '\n'.join(sentence[lo:hi] for lo, hi in zip(bounds, bounds[1:]))

    if index != -1:
        console.print(f'[green]✅ Sentence {index} has been successfully split[/green]')
    table = Table(title="")
    table.add_column("Type", style="cyan")
    table.add_column("Sentence")
    table.add_row("Original", sentence, style="yellow")
    table.add_row("Split", best_split.replace('\n', ' ||'), style="yellow")
    console.print(table)

    return best_split
@check_file_exists(_3_2_SPLIT_BY_MEANING)
def split_sentences_by_meaning():
    """Entry point: split the NLP-stage sentences by meaning and save them.

    Reads one sentence per line from ``_3_1_SPLIT_BY_NLP``, runs three
    splitting passes and writes the newline-joined result to
    ``_3_2_SPLIT_BY_MEANING``.
    """
    # Load the input, one stripped sentence per line
    with open(_3_1_SPLIT_BY_NLP, 'r', encoding='utf-8') as source_file:
        sentences = [raw_line.strip() for raw_line in source_file]

    nlp = init_nlp()

    # Three passes, so parts that are still too long get split again
    attempt = 0
    while attempt < 3:
        sentences = parallel_split_sentences(
            sentences,
            max_length=load_key("max_split_length"),
            max_workers=load_key("max_workers"),
            nlp=nlp,
            retry_attempt=attempt,
        )
        attempt += 1

    # Persist the result
    with open(_3_2_SPLIT_BY_MEANING, 'w', encoding='utf-8') as target_file:
        target_file.write('\n'.join(sentences))
    console.print('[green]✅ All sentences have been successfully split![/green]')
def combine_chunks():
    """Join all split sentences into one text, cut to ``summary_length`` characters."""
    with open(_3_2_SPLIT_BY_MEANING, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    joined = ' '.join(raw.strip() for raw in raw_lines)
    limit = load_key('summary_length')
    return joined[:limit]  #! Return only the first x characters


def search_things_to_note_in_prompt(sentence):
    """Build the terminology note for the terms that occur in ``sentence``.

    Terms are loaded from ``_4_1_TERMINOLOGY`` and matched case-insensitively
    against ``sentence``. Each matching term yields one line numbered by its
    position in the full term list.

    Returns:
        str | None: The newline-joined note, or ``None`` when nothing matches.
    """
    with open(_4_1_TERMINOLOGY, 'r', encoding='utf-8') as handle:
        terminology = json.load(handle)

    matched_sources = [
        entry['src']
        for entry in terminology['terms']
        if entry['src'].lower() in sentence.lower()
    ]
    if not matched_sources:
        return None

    note_lines = []
    for position, entry in enumerate(terminology['terms']):
        if entry['src'] in matched_sources:
            note_lines.append(
                f'{position+1}. "{entry["src"]}": "{entry["tgt"]}",'
                f' meaning: {entry["note"]}'
            )
    return '\n'.join(note_lines)


def get_summary():
    """Ask GPT for a summary with terminology and save it with the custom terms.

    The first three columns of ``CUSTOM_TERMS_PATH`` are read as source,
    target and note. They are passed to the prompt and appended to the terms
    GPT returns before everything is written to ``_4_1_TERMINOLOGY``.
    """
    src_content = combine_chunks()
    custom_terms = pd.read_excel(CUSTOM_TERMS_PATH)

    term_entries = []
    for _, row in custom_terms.iterrows():
        term_entries.append({
            "src": str(row.iloc[0]),
            "tgt": str(row.iloc[1]),
            "note": str(row.iloc[2])
        })
    custom_terms_json = {"terms": term_entries}

    if len(custom_terms) > 0:
        rprint(f"📖 Custom Terms Loaded: {len(custom_terms)} terms")
        rprint("📝 Terms Content:", json.dumps(custom_terms_json, indent=2, ensure_ascii=False))
    summary_prompt = get_summary_prompt(src_content, custom_terms_json)
    rprint("📝 Summarizing and extracting terminology ...")

    def valid_summary(response_data):
        # Every term must carry all three keys.
        required_keys = {'src', 'tgt', 'note'}
        invalid = {"status": "error", "message": "Invalid response format"}
        if 'terms' not in response_data:
            return invalid
        for term in response_data['terms']:
            if any(key not in term for key in required_keys):
                return invalid
        return {"status": "success", "message": "Summary completed"}

    summary = ask_gpt(summary_prompt, resp_type='json', valid_def=valid_summary, log_title='summary')
    summary['terms'].extend(custom_terms_json['terms'])

    with open(_4_1_TERMINOLOGY, 'w', encoding='utf-8') as out_file:
        json.dump(summary, out_file, ensure_ascii=False, indent=4)

    rprint(f'💾 Summary log saved to → `{_4_1_TERMINOLOGY}`')
# Function to split text into chunks
def split_chunks_by_chars(chunk_size, max_i):
    """Split text into chunks based on character count, return a list of multi-line text chunks.

    Sentences are read line by line from ``_3_2_SPLIT_BY_MEANING``. A chunk is
    closed when adding the next sentence would exceed ``chunk_size``
    characters or when it already holds ``max_i`` sentences.

    Args:
        chunk_size: Maximum number of characters per chunk (newlines included).
        max_i: Maximum number of sentences per chunk.

    Returns:
        list[str]: Chunks whose lines are the original sentences.
    """
    with open(_3_2_SPLIT_BY_MEANING, "r", encoding="utf-8") as file:
        sentences = file.read().strip().split('\n')

    chunks = []
    chunk = ''
    sentence_count = 0
    for sentence in sentences:
        line = sentence + '\n'
        chunk_full = len(chunk) + len(line) > chunk_size or sentence_count == max_i
        # Only flush a chunk that actually has content: a first sentence longer
        # than chunk_size used to emit an empty leading chunk.
        if chunk and chunk_full:
            chunks.append(chunk.strip())
            chunk = line
            sentence_count = 1
        else:
            chunk += line
            sentence_count += 1
    chunks.append(chunk.strip())
    return chunks


# Get context from surrounding chunks
def get_previous_content(chunks, chunk_index):
    """Return the last 3 lines of the preceding chunk, or ``None`` for the first chunk."""
    return None if chunk_index == 0 else chunks[chunk_index - 1].split('\n')[-3:]  # Get last 3 lines


def get_after_content(chunks, chunk_index):
    """Return the first 2 lines of the following chunk, or ``None`` for the last chunk."""
    return None if chunk_index == len(chunks) - 1 else chunks[chunk_index + 1].split('\n')[:2]  # Get first 2 lines


# 🔍 Translate a single chunk
def translate_chunk(chunk, chunks, theme_prompt, i):
    """Translate chunk ``i`` with its neighbouring context and terminology notes.

    Returns:
        tuple: ``(i, english_result, translation)`` where the last two are the
        values returned by ``translate_lines`` (in swapped order).
    """
    things_to_note_prompt = search_things_to_note_in_prompt(chunk)
    previous_content_prompt = get_previous_content(chunks, i)
    after_content_prompt = get_after_content(chunks, i)
    translation, english_result = translate_lines(chunk, previous_content_prompt, after_content_prompt, things_to_note_prompt, theme_prompt, i)
    return i, english_result, translation


# Add similarity calculation function
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``."""
    return SequenceMatcher(None, a, b).ratio()
TextColumn("[progress.description]{task.description}"), transient=True) as progress: task = progress.add_task("[cyan]Translating chunks...", total=len(chunks)) with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor: futures = [] for i, chunk in enumerate(chunks): future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i) futures.append(future) results = [] for future in concurrent.futures.as_completed(futures): results.append(future.result()) progress.update(task, advance=1) results.sort(key=lambda x: x[0]) # Sort results based on original order # 💾 Save results to lists and Excel file src_text, trans_text = [], [] for i, chunk in enumerate(chunks): chunk_lines = chunk.split('\n') src_text.extend(chunk_lines) # Calculate similarity between current chunk and translation results chunk_text = ''.join(chunk_lines).lower() matching_results = [(r, similar(''.join(r[1].split('\n')).lower(), chunk_text)) for r in results] best_match = max(matching_results, key=lambda x: x[1]) # Check similarity and handle exceptions if best_match[1] < 0.9: console.print(f"[yellow]Warning: No matching translation found for chunk {i}[/yellow]") raise ValueError(f"Translation matching failed (chunk {i})") elif best_match[1] < 1.0: console.print(f"[yellow]Warning: Similar match found (chunk {i}, similarity: {best_match[1]:.3f})[/yellow]") trans_text.extend(best_match[0][2].split('\n')) # Trim long translation text df_text = pd.read_excel(_2_CLEANED_CHUNKS) df_text['text'] = df_text['text'].str.strip('"').str.strip() df_translate = pd.DataFrame({'Source': src_text, 'Translation': trans_text}) subtitle_output_configs = [('trans_subs_for_audio.srt', ['Translation'])] df_time = align_timestamp(df_text, df_translate, subtitle_output_configs, output_dir=None, for_display=False) console.print(df_time) # apply check_len_then_trim to df_time['Translation'], only when duration > MIN_TRIM_DURATION. 
# ! You can modify your own weights here
# Weights applied below: CJK ideographs and Japanese kana 1.75, Hangul 1.5,
# Thai 1, full-width symbols 1.75, every other character 1
def calc_len(text: str) -> float:
    """Return the weighted display length of ``text``.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. Every character adds the weight of the code-point range it
    falls into, or 1 when it matches none of them.
    """
    weighted_ranges = (
        (0x4E00, 0x9FFF, 1.75),  # Chinese and Japanese ideographs
        (0x3040, 0x30FF, 1.75),  # Japanese kana
        (0xAC00, 0xD7A3, 1.5),   # Korean syllables
        (0x1100, 0x11FF, 1.5),   # Korean jamo
        (0x0E00, 0x0E7F, 1),     # Thai
        (0xFF01, 0xFF5E, 1.75),  # full-width symbols
    )

    total = 0
    for character in str(text):  # force convert
        code_point = ord(character)
        weight = 1  # other characters (e.g. English and half-width symbols)
        for low, high, range_weight in weighted_ranges:
            if low <= code_point <= high:
                weight = range_weight
                break
        total += weight
    return total
def split_for_sub_main():
    """Split over-long subtitle lines and save the split and re-merged tables.

    Reads ``Source``/``Translation`` from ``_4_2_TRANSLATION``, runs up to
    three rounds of ``split_align_subs`` and writes the split lines to
    ``_5_SPLIT_SUB`` and the source lines with re-merged translations to
    ``_5_REMERGED``.
    """
    console.print("[bold green]🚀 Start splitting subtitles...[/bold green]")

    df = pd.read_excel(_4_2_TRANSLATION)
    src = df['Source'].tolist()
    trans = df['Translation'].tolist()

    subtitle_set = load_key("subtitle")
    MAX_SUB_LENGTH = subtitle_set["max_length"]
    TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"]

    for attempt in range(3):  # split in several rounds
        console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False))
        split_src, split_trans, remerged = split_align_subs(src.copy(), trans)

        # Stop once every subtitle satisfies the length limits
        if all(len(src) <= MAX_SUB_LENGTH for src in split_src) and \
           all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in split_trans):
            break

        # Feed this round's output into the next splitting round
        src, trans = split_src, split_trans

    # Pad the shorter list with None so both have the same length and the
    # DataFrame construction below does not fail
    if len(src) > len(remerged):
        remerged += [None] * (len(src) - len(remerged))
    elif len(remerged) > len(src):
        src += [None] * (len(remerged) - len(src))

    pd.DataFrame({'Source': split_src, 'Translation': split_trans}).to_excel(_5_SPLIT_SUB, index=False)
    pd.DataFrame({'Source': src, 'Translation': remerged}).to_excel(_5_REMERGED, index=False)
def convert_to_srt_format(start_time, end_time):
    """Convert time (in seconds) to the format: hours:minutes:seconds,milliseconds

    Args:
        start_time: Start of the subtitle in seconds.
        end_time: End of the subtitle in seconds.

    Returns:
        str: ``"HH:MM:SS,mmm --> HH:MM:SS,mmm"``.
    """
    def seconds_to_hmsm(seconds):
        # Round to whole milliseconds before splitting into fields; truncating
        # ``seconds * 1000`` loses a millisecond on inputs such as 1.001, whose
        # product is 1000.9999999999999 in floating point.
        total_ms = int(round(seconds * 1000))
        total_seconds, milliseconds = divmod(total_ms, 1000)
        total_minutes, secs = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

    start_srt = seconds_to_hmsm(start_time)
    end_srt = seconds_to_hmsm(end_time)
    return f"{start_srt} --> {end_srt}"


def remove_punctuation(text):
    """Collapse whitespace runs to one space, drop non-word characters and strip."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()


def show_difference(str1, str2):
    """Show the difference positions between two strings

    Prints both strings, a marker line with ``^`` under every differing
    index, and the list of differing indices. Indices beyond the shorter
    string all count as differences.
    """
    min_len = min(len(str1), len(str2))
    max_len = max(len(str1), len(str2))
    diff_positions = [i for i in range(min_len) if str1[i] != str2[i]]

    if len(str1) != len(str2):
        diff_positions.extend(range(min_len, max_len))

    # Set lookup keeps the marker line linear in the string length.
    diff_lookup = set(diff_positions)

    print("Difference positions:")
    print(f"Expected sentence: {str1}")
    print(f"Actual match: {str2}")
    print("Position markers: " + "".join("^" if i in diff_lookup else " " for i in range(max_len)))
    print(f"Difference indices: {diff_positions}")
def align_timestamp(df_text, df_translate, subtitle_output_configs: list, output_dir: str, for_display: bool = True):
    """Align timestamps and add a new timestamp column to df_translate

    Args:
        df_text: Word-level table passed to ``get_sentence_timestamps``.
        df_translate: Sentence table with ``Source`` and ``Translation`` columns.
        subtitle_output_configs: ``(filename, columns)`` pairs describing the
            subtitle files to write.
        output_dir: Directory for the subtitle files; nothing is written when falsy.
        for_display: When true, ``,`` and ``。`` in the translation are replaced
            by spaces.

    Returns:
        A copy of ``df_translate`` with a ``timestamp`` column (SRT time range
        string) and a ``duration`` column (seconds, measured before gaps are closed).
    """
    df_trans_time = df_translate.copy()

    # NOTE: a per-word DataFrame used to be built here and never read; it was
    # dead code and has been removed.

    # Process timestamps ⏰
    time_stamp_list = get_sentence_timestamps(df_text, df_translate)
    df_trans_time['timestamp'] = time_stamp_list
    df_trans_time['duration'] = df_trans_time['timestamp'].apply(lambda x: x[1] - x[0])

    # Remove gaps 🕳️: a pause shorter than one second is closed by extending
    # the current subtitle up to the start of the next one
    for i in range(len(df_trans_time)-1):
        delta_time = df_trans_time.loc[i+1, 'timestamp'][0] - df_trans_time.loc[i, 'timestamp'][1]
        if 0 < delta_time < 1:
            df_trans_time.at[i, 'timestamp'] = (df_trans_time.loc[i, 'timestamp'][0], df_trans_time.loc[i+1, 'timestamp'][0])

    # Convert start and end timestamps to SRT format
    df_trans_time['timestamp'] = df_trans_time['timestamp'].apply(lambda x: convert_to_srt_format(x[0], x[1]))

    # Polish subtitles: replace punctuation in Translation if for_display
    if for_display:
        df_trans_time['Translation'] = df_trans_time['Translation'].apply(lambda x: re.sub(r'[,。]', ' ', x).strip())

    # Output subtitles 📜
    def generate_subtitle_string(df, columns):
        return ''.join([f"{i+1}\n{row['timestamp']}\n{row[columns[0]].strip()}\n{row[columns[1]].strip() if len(columns) > 1 else ''}\n\n" for i, row in df.iterrows()]).strip()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        for filename, columns in subtitle_output_configs:
            subtitle_str = generate_subtitle_string(df_trans_time, columns)
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
                f.write(subtitle_str)

    return df_trans_time


# ✨ Beautify the translation
def clean_translation(x):
    """Return ``x`` with outer ``。``/``,`` stripped and autocorrect formatting applied.

    Missing values (``pd.isna``) become an empty string.
    """
    if pd.isna(x):
        return ''
    cleaned = str(x).strip('。').strip(',')
    return autocorrect.format(cleaned)
def check_gpu_available():
    """Return True when the local ffmpeg lists the ``h264_nvenc`` encoder.

    Any failure to run ffmpeg (for example a missing executable) yields False.
    """
    try:
        result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True)
        return 'h264_nvenc' in result.stdout
    except Exception:
        # Best effort, but unlike a bare ``except:`` this no longer swallows
        # KeyboardInterrupt / SystemExit.
        return False


def merge_subtitles_to_video():
    """Burn the source and translated subtitles into the video with ffmpeg.

    When ``burn_subtitles`` is disabled, a one-frame black placeholder is
    written to ``OUTPUT_VIDEO`` instead. Exits with status 1 when either
    subtitle file is missing.
    """
    video_file = find_video_files()
    os.makedirs(os.path.dirname(OUTPUT_VIDEO), exist_ok=True)

    # Subtitle burning disabled: write a placeholder video and stop
    if not load_key("burn_subtitles"):
        rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as subtitles are not burned in.[/bold yellow]")

        # Create a black frame
        frame = np.zeros((1080, 1920, 3), dtype=np.uint8)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 1, (1920, 1080))
        out.write(frame)
        out.release()

        rprint("[bold green]Placeholder video has been generated.[/bold green]")
        return

    if not os.path.exists(SRC_SRT) or not os.path.exists(TRANS_SRT):
        rprint("Subtitle files not found in the 'output' directory.")
        # ``exit`` is injected by the site module and may be absent;
        # SystemExit is always available and has the same effect.
        raise SystemExit(1)

    # Check resolution
    video = cv2.VideoCapture(video_file)
    TARGET_WIDTH = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    TARGET_HEIGHT = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    rprint(f"[bold green]Video resolution: {TARGET_WIDTH}x{TARGET_HEIGHT}[/bold green]")
    ffmpeg_cmd = [
        'ffmpeg', '-i', video_file,
        '-vf', (
            f"scale={TARGET_WIDTH}:{TARGET_HEIGHT}:force_original_aspect_ratio=decrease,"
            f"pad={TARGET_WIDTH}:{TARGET_HEIGHT}:(ow-iw)/2:(oh-ih)/2,"
            f"subtitles={SRC_SRT}:force_style='FontSize={SRC_FONT_SIZE},FontName={FONT_NAME},"
            f"PrimaryColour={SRC_FONT_COLOR},OutlineColour={SRC_OUTLINE_COLOR},OutlineWidth={SRC_OUTLINE_WIDTH},"
            f"ShadowColour={SRC_SHADOW_COLOR},BorderStyle=1',"
            f"subtitles={TRANS_SRT}:force_style='FontSize={TRANS_FONT_SIZE},FontName={TRANS_FONT_NAME},"
            f"PrimaryColour={TRANS_FONT_COLOR},OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH},"
            f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=27,BorderStyle=4'"
        ).encode('utf-8'),
    ]

    ffmpeg_gpu = load_key("ffmpeg_gpu")
    if ffmpeg_gpu:
        rprint("[bold green]will use GPU acceleration.[/bold green]")
        ffmpeg_cmd.extend(['-c:v', 'h264_nvenc'])
    ffmpeg_cmd.extend(['-y', OUTPUT_VIDEO])

    rprint("🎬 Start merging subtitles to video...")
    start_time = time.time()
    process = subprocess.Popen(ffmpeg_cmd)

    try:
        process.wait()
        if process.returncode == 0:
            rprint(f"\n✅ Done! Time taken: {time.time() - start_time:.2f} seconds")
        else:
            rprint("\n❌ FFmpeg execution error")
    except Exception as e:
        rprint(f"\n❌ Error occurred: {e}")
        # Do not leave ffmpeg running if waiting for it failed
        if process.poll() is None:
            process.kill()
def process_srt():
    """Build the audio task table from the two audio-oriented SRT files.

    Reads ``TRANS_SUBS_FOR_AUDIO_FILE`` (translated subtitles) and
    ``SRC_SUBS_FOR_AUDIO_FILE`` (source subtitles), pairs them by subtitle
    number, cleans the translated text, and then merges or extends subtitles
    whose duration is below the ``min_subtitle_duration`` config value.

    Returns:
        pandas.DataFrame: one row per resulting subtitle with the columns
        ``number``, ``start_time``, ``end_time``, ``duration``, ``text`` and
        ``origin``. ``start_time`` / ``end_time`` are formatted as
        ``HH:MM:SS.mmm`` strings.
    """
    with open(TRANS_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as file:
        content = file.read()

    with open(SRC_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as src_file:
        src_content = src_file.read()

    # The date is only a carrier for combining `time` objects into datetimes,
    # so one value for the whole run is enough.
    today = datetime.date.today()

    subtitles = []
    src_subtitles = {}

    # Index the source subtitles by their SRT number.
    for block in src_content.strip().split('\n\n'):
        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 3:
            continue

        number = int(lines[0])
        src_text = ' '.join(lines[2:])
        src_subtitles[number] = src_text

    for block in content.strip().split('\n\n'):
        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 3:
            continue

        try:
            number = int(lines[0])
            start_time, end_time = lines[1].split(' --> ')
            start_time = datetime.datetime.strptime(start_time, '%H:%M:%S,%f').time()
            end_time = datetime.datetime.strptime(end_time, '%H:%M:%S,%f').time()
            duration = time_diff_seconds(start_time, end_time, today)
            text = ' '.join(lines[2:])
            # Remove content within parentheses, both ASCII "(...)" and
            # full-width "（...）". The full-width pair is written with \u
            # escapes so the pattern cannot degrade into `([^)]*)`, which is
            # a capture group that matches (and would delete) all the text.
            text = re.sub(r'\([^)]*\)', '', text).strip()
            text = re.sub(r'\uff08[^\uff09]*\uff09', '', text).strip()
            # Remove '-' character, can continue to add illegal characters that cause errors
            text = text.replace('-', '')

            # Add the original text from src_subs_for_audio.srt
            origin = src_subtitles.get(number, '')

        except ValueError as e:
            rprint(Panel(f"Unable to parse subtitle block '{block}', error: {str(e)}, skipping this subtitle block.", title="Error", border_style="red"))
            continue

        subtitles.append({'number': number, 'start_time': start_time, 'end_time': end_time, 'duration': duration, 'text': text, 'origin': origin})

    df = pd.DataFrame(subtitles)

    i = 0
    MIN_SUB_DUR = load_key("min_subtitle_duration")
    while i < len(df):
        if df.loc[i, 'duration'] < MIN_SUB_DUR:
            if i < len(df) - 1 and time_diff_seconds(df.loc[i, 'start_time'],df.loc[i+1, 'start_time'],today) < MIN_SUB_DUR:
                # The next subtitle starts too soon: fold it into this one and
                # re-check the same index (no increment) in case it is still short.
                rprint(f"[bold yellow]Merging subtitles {i+1} and {i+2}[/bold yellow]")
                df.loc[i, 'text'] += ' ' + df.loc[i+1, 'text']
                df.loc[i, 'origin'] += ' ' + df.loc[i+1, 'origin']
                df.loc[i, 'end_time'] = df.loc[i+1, 'end_time']
                df.loc[i, 'duration'] = time_diff_seconds(df.loc[i, 'start_time'],df.loc[i, 'end_time'],today)
                df = df.drop(i+1).reset_index(drop=True)
            else:
                if i < len(df) - 1:  # Not the last audio
                    rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUB_DUR} seconds[/bold blue]")
                    df.loc[i, 'end_time'] = (datetime.datetime.combine(today, df.loc[i, 'start_time']) +
                                            datetime.timedelta(seconds=MIN_SUB_DUR)).time()
                    df.loc[i, 'duration'] = MIN_SUB_DUR
                else:
                    rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUB_DUR} seconds, but not extending[/bold red]")
                i += 1
        else:
            i += 1

    df['start_time'] = df['start_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3])
    df['end_time'] = df['end_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3])

    ##! No longer perform secondary trim
    # check and trim subtitle length, for twice to ensure the subtitle length is within the limit, allowing tolerance
    # df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']+x['tolerance']), axis=1)

    return df
merged['est_dur'], merged['tol_dur'], merged['duration'], df.iloc[start_idx + merge_count]['tolerance'] ) if speed_flag <= 0 or merge_count == 2: df.at[start_idx + merge_count, 'cut_off'] = 1 return merge_count + 1 merge_count += 1 # If no suitable merge point is found if merge_count >= MAX_MERGE_COUNT or (start_idx + merge_count) >= len(df): df.at[start_idx + merge_count - 1, 'cut_off'] = 1 return merge_count def analyze_subtitle_timing_and_speed(df): rprint("[🔍 Analyzing] Calculating subtitle timing and speed...") global ESTIMATOR if ESTIMATOR is None: ESTIMATOR = init_estimator() TOLERANCE = load_key("tolerance") whole_dur = get_audio_duration(_RAW_AUDIO_FILE) df['gap'] = 0.0 # Initialize gap column for i in range(len(df) - 1): current_end = datetime.datetime.strptime(df.loc[i, 'end_time'], '%H:%M:%S.%f').time() next_start = datetime.datetime.strptime(df.loc[i + 1, 'start_time'], '%H:%M:%S.%f').time() df.loc[i, 'gap'] = time_diff_seconds(current_end, next_start, datetime.date.today()) # Set the gap for the last line last_end = datetime.datetime.strptime(df.iloc[-1]['end_time'], '%H:%M:%S.%f').time() last_end_seconds = (last_end.hour * 3600 + last_end.minute * 60 + last_end.second + last_end.microsecond / 1000000) df.iloc[-1, df.columns.get_loc('gap')] = whole_dur - last_end_seconds df['tolerance'] = df['gap'].apply(lambda x: TOLERANCE if x > TOLERANCE else x) df['tol_dur'] = df['duration'] + df['tolerance'] df['est_dur'] = df.apply(lambda x: estimate_duration(x['text'], ESTIMATOR), axis=1) ## Calculate speed indicators accept = load_key("speed_factor.accept") # Maximum acceptable speed factor def calc_if_too_fast(row): est_dur = row['est_dur'] tol_dur = row['tol_dur'] duration = row['duration'] tolerance = row['tolerance'] if est_dur / accept > tol_dur: # Even max speed factor cannot adapt return 2 elif est_dur > tol_dur: # Speed adjustment needed within acceptable range return 1 elif est_dur < duration - tolerance: # Speaking speed too slow return -1 else: # 
def _load_srt_texts(path):
    """Return the cleaned text of every subtitle block in the SRT file at *path*."""
    with open(path, "r", encoding="utf-8") as srt_file:
        content = srt_file.read()

    texts = []
    for block in content.strip().split('\n\n'):
        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) >= 3:
            text = ' '.join(lines[2:])
            # Drop parenthesised content, ASCII "(...)" or full-width "（...）".
            # The full-width pair is spelled with \u escapes; an ASCII
            # `([^)]*)` alternative would be a capture group that matches
            # (and deletes) every run of text.
            text = re.sub(r'\([^)]*\)|\uff08[^\uff09]*\uff09', '', text).strip().replace('-', '')
            texts.append(text)
    return texts


def gen_dub_chunks():
    """Annotate the audio task sheet with cut-off points and matching subtitle lines.

    Loads ``_8_1_AUDIO_TASK``, runs ``analyze_subtitle_timing_and_speed`` and
    ``process_cutoffs`` on it, then walks the display subtitles in
    ``TRANS_SRT`` in order and, for every task row, collects the consecutive
    subtitle lines whose concatenated text (ignoring whitespace and
    punctuation) equals the row's ``text``. The matched translated lines are
    stored in the ``lines`` column and the source lines at the same positions
    in ``src_lines``. The sheet is written back to ``_8_1_AUDIO_TASK``.

    Raises:
        ValueError: if some task row cannot be matched against the remaining
            subtitle lines.
    """
    rprint("[🎬 Starting] Generating dubbing chunks...")
    df = pd.read_excel(_8_1_AUDIO_TASK)

    rprint("[📊 Processing] Analyzing timing and speed...")
    df = analyze_subtitle_timing_and_speed(df)

    rprint("[✂️ Processing] Processing cutoffs...")
    df = process_cutoffs(df)

    rprint("[📝 Reading] Loading transcript files...")
    content_lines = _load_srt_texts(TRANS_SRT)
    ori_content_lines = _load_srt_texts(SRC_SRT)

    # Match processing
    df['lines'] = None
    df['src_lines'] = None
    last_idx = 0

    def clean_text(text):
        """clean space and punctuation"""
        if not text or not isinstance(text, str):
            return ''
        return re.sub(r'[^\w\s]|[\s]', '', text)

    for idx, row in df.iterrows():
        target = clean_text(row['text'])
        matches = []
        current = ''
        match_indices = []  # Store indices for matching lines

        for i in range(last_idx, len(content_lines)):
            line = content_lines[i]
            cleaned_line = clean_text(line)
            current += cleaned_line
            matches.append(line)  # keep the original (uncleaned) text
            match_indices.append(i)

            if current == target:
                df.at[idx, 'lines'] = matches
                df.at[idx, 'src_lines'] = [ori_content_lines[j] for j in match_indices]
                last_idx = i + 1
                break
        else:  # If no match is found
            rprint(f"[❌ Error] Matching failed at line {idx}:")
            rprint(f"Target: '{target}'")
            rprint(f"Current: '{current}'")
            raise ValueError("Matching failed")

    # Save results
    df.to_excel(_8_1_AUDIO_TASK, index=False)
    rprint("[✅ Complete] Matching completed successfully!")
def extract_refer_audio_main():
    """Cut one reference clip per audio task out of the vocal track.

    Runs ``demucs_audio()`` first, then — unless the skip condition below is
    met — reads the task sheet ``_8_1_AUDIO_TASK`` and the audio in
    ``_VOCAL_AUDIO_FILE`` and writes ``<number>.wav`` into
    ``_AUDIO_REFERS_DIR`` for every task row, using the row's ``start_time``
    and ``end_time``.

    NOTE(review): the skip check looks for ``1.wav`` in ``_AUDIO_SEGS_DIR``
    while the clips are written to ``_AUDIO_REFERS_DIR`` — confirm that this
    is intended.
    """
    demucs_audio()  #!!! in case demucs not run

    first_segment = os.path.join(_AUDIO_SEGS_DIR, '1.wav')
    if os.path.exists(first_segment):
        rprint(Panel("Audio segments already exist, skipping extraction", title="Info", border_style="blue"))
        return

    # Make sure the destination folder exists before writing clips
    os.makedirs(_AUDIO_REFERS_DIR, exist_ok=True)

    # Task sheet plus the full vocal track as samples
    tasks = pd.read_excel(_8_1_AUDIO_TASK)
    vocal_samples, sample_rate = sf.read(_VOCAL_AUDIO_FILE)

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    )
    with Progress(*progress_columns) as progress:
        bar = progress.add_task("Extracting audio segments...", total=len(tasks))

        for _, task_row in tasks.iterrows():
            clip_path = os.path.join(_AUDIO_REFERS_DIR, f"{task_row['number']}.wav")
            extract_audio(vocal_samples, sample_rate, task_row['start_time'], task_row['end_time'], clip_path)
            progress.update(bar, advance=1)

    rprint(Panel(f"Audio segments saved to {_AUDIO_REFERS_DIR}", title="Success", border_style="green"))
import ( _1_ytdlp, _2_asr, _3_1_split_nlp, _3_2_split_meaning, _4_1_summarize, _4_2_translate, _5_split_sub, _6_gen_sub, _7_sub_into_vid, _8_1_audio_task, _8_2_dub_chunks, _9_refer_audio, _10_gen_audio, _11_merge_audio, _12_dub_to_vid ) from .utils import * from .utils.onekeycleanup import cleanup from .utils.delete_retry_dubbing import delete_dubbing_files except ImportError: pass __all__ = [ 'ask_gpt', 'load_key', 'update_key', 'cleanup', 'delete_dubbing_files', '_1_ytdlp', '_2_asr', '_3_1_split_nlp', '_3_2_split_meaning', '_4_1_summarize', '_4_2_translate', '_5_split_sub', '_6_gen_sub', '_7_sub_into_vid', '_8_1_audio_task', '_8_2_dub_chunks', '_9_refer_audio', '_10_gen_audio', '_11_merge_audio', '_12_dub_to_vid' ] ================================================ FILE: core/asr_backend/__init__.py ================================================ ================================================ FILE: core/asr_backend/audio_preprocess.py ================================================ import os, subprocess import pandas as pd from typing import Dict, List, Tuple from pydub import AudioSegment from core.utils import * from core.utils.models import * from pydub import AudioSegment from pydub.silence import detect_silence from pydub.utils import mediainfo from rich import print as rprint def _ffmpeg_has_encoder(encoder_name: str) -> bool: """Check if the current ffmpeg installation supports a given audio encoder.""" try: result = subprocess.run( ['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10 ) return encoder_name in result.stdout except Exception: return False def normalize_audio_volume(audio_path, output_path, target_db = -20.0, format = "wav"): audio = AudioSegment.from_file(audio_path) change_in_dBFS = target_db - audio.dBFS normalized_audio = audio.apply_gain(change_in_dBFS) normalized_audio.export(output_path, format=format) rprint(f"[green]✅ Audio normalized from {audio.dBFS:.1f}dB to {target_db:.1f}dB[/green]") return output_path def 
def get_audio_duration(audio_file: str) -> float:
    """Return the duration of *audio_file* in seconds, as reported by ffmpeg.

    Runs ``ffmpeg -i <audio_file>`` and parses the ``Duration: HH:MM:SS.xx``
    entry from what ffmpeg prints on stderr.

    Args:
        audio_file: Path of the media file to inspect.

    Returns:
        The duration in seconds, or ``0.0`` (after printing an error) when no
        parsable ``Duration:`` entry is found in ffmpeg's output.
    """
    completed = subprocess.run(
        ['ffmpeg', '-i', audio_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output = completed.stderr.decode('utf-8', errors='ignore')

    try:
        # Require the "Duration: " label itself so an unrelated line that merely
        # contains the word (e.g. a file name) is not picked up by mistake.
        duration_str = next(line for line in output.split('\n') if 'Duration: ' in line)
        duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':')
        return float(duration_parts[0]) * 3600 + float(duration_parts[1]) * 60 + float(duration_parts[2])
    except (StopIteration, IndexError, ValueError) as e:
        # rprint (rich) so the [red] markup is rendered instead of printed literally.
        rprint(f"[red]❌ Error: Failed to get audio duration: {e}[/red]")
        return 0.0
def process_transcription(result: Dict) -> pd.DataFrame:
    """Flatten a segment-level transcription into one row per word.

    Args:
        result: Mapping with a ``'segments'`` list. Each segment has a
            ``'words'`` list and optionally ``'speaker_id'``; each word has
            ``'word'`` and optionally ``'start'`` / ``'end'``. Guillemets are
            stripped from ``word['word']`` in place.

    Returns:
        DataFrame with the columns ``text``, ``start``, ``end`` and
        ``speaker_id`` (``None`` when the segment has no speaker), one row per
        kept word. Words longer than 30 characters are skipped with a warning.

    Raises:
        Exception: when the very first word has no timestamps and no word in
            its segment carries both ``start`` and ``end``.
    """
    all_words = []
    for segment in result['segments']:
        # Get speaker_id, if not exists, set to None
        speaker_id = segment.get('speaker_id', None)

        for word in segment['words']:
            # Check word length
            if len(word["word"]) > 30:
                rprint(f"[yellow]⚠️ Warning: Detected word longer than 30 characters, skipping: {word['word']}[/yellow]")
                continue

            # ! For French, we need to convert guillemets to empty strings
            word["word"] = word["word"].replace('»', '').replace('«', '')

            if 'start' not in word and 'end' not in word:
                if all_words:
                    # No timestamps at all: pin the word to the end of the previous word
                    start = end = all_words[-1]['end']
                else:
                    # If it's the first word, look next for a timestamp then assign it to the current word
                    next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
                    if not next_word:
                        raise Exception(f"No next word with timestamp found for the current word : {word}")
                    start, end = next_word["start"], next_word["end"]
            else:
                # At least one timestamp is present. A missing start falls back to
                # the previous word's end (or 0); a missing end falls back to the
                # start, so a half-timestamped word no longer raises KeyError.
                start = word.get('start', all_words[-1]['end'] if all_words else 0)
                end = word.get('end', start)

            all_words.append({
                'text': f'{word["word"]}',
                'start': start,
                'end': end,
                'speaker_id': speaker_id,
            })

    return pd.DataFrame(all_words)
class PreloadedSeparator(Separator):
    """``Separator`` variant that is handed an already loaded model.

    The constructor stores the model together with its ``audio_channels`` and
    ``samplerate`` on the instance and then configures the separator through
    ``update_parameter``. It does not call ``Separator.__init__``.
    """

    def __init__(self, model: BagOfModels, shifts: int = 1, overlap: float = 0.25,
                 split: bool = True, segment: Optional[int] = None, jobs: int = 0):
        self._model = model
        self._audio_channels = model.audio_channels
        self._samplerate = model.samplerate

        # Device preference: CUDA first, then Apple MPS, otherwise CPU.
        if is_cuda_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"

        self.update_parameter(
            device=device,
            shifts=shifts,
            overlap=overlap,
            split=split,
            segment=segment,
            jobs=jobs,
            progress=True,
            callback=None,
            callback_arg=None,
        )
# Gap threshold between consecutive words (same unit as the word timestamps)
# above which a new segment is started.
SPLIT_GAP = 1


def elev2whisper(elev_json, word_level_timestamp = False):
    """Group an ElevenLabs word list into whisper-style segments.

    Args:
        elev_json: Mapping with a ``"words"`` list; every word provides
            ``"text"``, ``"start"``, ``"end"`` and ``"speaker_id"``.
        word_level_timestamp: When true each segment keeps a ``"words"`` list
            of ``{"text", "start", "end"}`` entries; when false that key is
            removed from the segment.

    Returns:
        ``{"segments": [...]}`` where every segment has ``"text"`` (stripped
        concatenation of its word texts), ``"start"``, ``"end"`` and
        ``"speaker_id"``. A segment ends at the last word, when the pause
        before the next word exceeds ``SPLIT_GAP``, or when the next word's
        speaker differs from the segment's speaker. An input without words
        yields ``{"segments": []}``.
    """
    def open_segment(first_word):
        # Fresh accumulator seeded from the word that starts the segment
        return {
            "text": "",
            "start": first_word["start"],
            "end": first_word["end"],
            "speaker_id": first_word["speaker_id"],
            "words": [],
        }

    words = elev_json.get("words", [])
    if not words:
        return {"segments": []}

    segments = []
    current = open_segment(words[0])
    last_index = len(words) - 1

    for index, word in enumerate(words):
        current["text"] += word["text"]
        current["end"] = word["end"]
        if word_level_timestamp:
            current["words"].append({"text": word["text"], "start": word["start"], "end": word["end"]})

        following = words[index + 1] if index < last_index else None

        # Keep accumulating while the next word belongs to the same segment
        if following is not None and not (
            following["start"] - word["end"] > SPLIT_GAP
            or following["speaker_id"] != current["speaker_id"]
        ):
            continue

        # Close the current segment
        current["text"] = current["text"].strip()
        if not word_level_timestamp:
            current.pop("words")
        segments.append(current)

        if following is not None:
            current = open_segment(following)

    return {"segments": segments}
def transcribe_audio_302(raw_audio_path: str, vocal_audio_path: str, start: float = None, end: float = None):
    """Transcribe a slice of the vocal track through the 302.ai WhisperX endpoint.

    The result is cached as JSON in ``OUTPUT_LOG_DIR``; if the cache file for
    the given ``start`` / ``end`` already exists it is returned without
    calling the API.

    Args:
        raw_audio_path: Not used by this function.
        vocal_audio_path: Audio file that is loaded (resampled to 16 kHz) and sliced.
        start: Slice start in seconds. If ``start`` or ``end`` is ``None`` the
            whole file is transcribed.
        end: Slice end in seconds.

    Returns:
        The decoded JSON response, with every segment and word timestamp
        shifted by ``start`` so they are relative to the full audio.

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """
    os.makedirs(OUTPUT_LOG_DIR, exist_ok=True)
    # Cache name uses the arguments as passed (so "None_None" for a full-file call).
    LOG_FILE = f"{OUTPUT_LOG_DIR}/whisperx302_{start}_{end}.json"
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            return json.load(f)

    WHISPER_LANGUAGE = load_key("whisper.language")
    # NOTE(review): writes back the value just read — presumably a no-op; confirm before removing.
    update_key("whisper.language", WHISPER_LANGUAGE)
    url = "https://api.302.ai/302/whisperx"

    y, sr = librosa.load(vocal_audio_path, sr=16000)
    audio_duration = len(y) / sr

    if start is None or end is None:
        start = 0
        end = audio_duration

    start_sample = int(start * sr)
    end_sample = int(end * sr)
    y_slice = y[start_sample:end_sample]

    # Encode the slice as 16-bit PCM WAV in memory for the upload
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, y_slice, sr, format='WAV', subtype='PCM_16')
    audio_buffer.seek(0)

    files = [('audio_input', ('audio_slice.wav', audio_buffer, 'application/octet-stream'))]
    payload = {"processing_type": "align", "language": WHISPER_LANGUAGE, "output": "raw"}

    start_time = time.time()
    rprint(f"[cyan]🎤 Transcribing audio with language:  <{WHISPER_LANGUAGE}> ...[/cyan]")
    headers = {'Authorization': f'Bearer {load_key("whisper.whisperX_302_api_key")}'}
    response = requests.request("POST", url, headers=headers, data=payload, files=files)
    # Fail on HTTP errors here instead of parsing an error body as a
    # transcription and crashing later on a missing 'segments' key.
    response.raise_for_status()

    response_json = response.json()

    # `start` is always a number at this point; shift timestamps so they are
    # relative to the whole audio rather than to the uploaded slice.
    for segment in response_json['segments']:
        segment['start'] += start
        segment['end'] += start
        for word in segment.get('words', []):
            if 'start' in word:
                word['start'] += start
            if 'end' in word:
                word['end'] += start

    with open(LOG_FILE, "w", encoding="utf-8") as f:
        json.dump(response_json, f, indent=4, ensure_ascii=False)

    elapsed_time = time.time() - start_time
    rprint(f"[green]✓ Transcription completed in {elapsed_time:.2f} seconds[/green]")
    return response_json
@except_handler("WhisperX processing error:")
def transcribe_audio(raw_audio_file, vocal_audio_file, start, end):
    """Transcribe ``[start, end]`` seconds of audio locally with WhisperX.

    The raw audio is used for transcription and the vocal track for word
    alignment. The detected language is written back to the
    ``whisper.language`` config key.

    Args:
        raw_audio_file: Path of the audio used for transcription.
        vocal_audio_file: Path of the audio used for alignment.
        start: Segment start in seconds.
        end: Segment end in seconds.

    Returns:
        The aligned WhisperX result dict; segment and word timestamps are
        shifted by ``start`` so they refer to the full file.

    Raises:
        ValueError: If Chinese is detected while the configured language is
            not ``zh``.
    """
    # check_hf_mirror() is declared with default_return=None, so it can hand
    # back None; os.environ rejects non-str values with a TypeError.
    hf_endpoint = check_hf_mirror()
    if hf_endpoint:
        os.environ['HF_ENDPOINT'] = hf_endpoint

    WHISPER_LANGUAGE = load_key("whisper.language")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rprint(f"🚀 Starting WhisperX using device: {device} ...")

    if device == "cuda":
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        batch_size = 16 if gpu_mem > 8 else 2
        # NOTE(review): bf16 support is used as the switch for float16 — confirm intent.
        compute_type = "float16" if torch.cuda.is_bf16_supported() else "int8"
        rprint(f"[cyan]🎮 GPU memory:[/cyan] {gpu_mem:.2f} GB, [cyan]📦 Batch size:[/cyan] {batch_size}, [cyan]⚙️ Compute type:[/cyan] {compute_type}")
    else:
        batch_size = 1
        compute_type = "int8"
        rprint(f"[cyan]📦 Batch size:[/cyan] {batch_size}, [cyan]⚙️ Compute type:[/cyan] {compute_type}")
    rprint(f"[green]▶️ Starting WhisperX for segment {start:.2f}s to {end:.2f}s...[/green]")

    if WHISPER_LANGUAGE == 'zh':
        model_name = "Huan69/Belle-whisper-large-v3-zh-punct-fasterwhisper"
        local_model = os.path.join(MODEL_DIR, "Belle-whisper-large-v3-zh-punct-fasterwhisper")
    else:
        model_name = load_key("whisper.model")
        local_model = os.path.join(MODEL_DIR, model_name)

    # Prefer a copy already present under MODEL_DIR over the hub name.
    if os.path.exists(local_model):
        rprint(f"[green]📥 Loading local WHISPER model:[/green] {local_model} ...")
        model_name = local_model
    else:
        rprint(f"[green]📥 Using WHISPER model from HuggingFace:[/green] {model_name} ...")

    vad_options = {"vad_onset": 0.500, "vad_offset": 0.363}
    asr_options = {"temperatures": [0], "initial_prompt": ""}
    # 'auto' means: let the model detect the language.
    whisper_language = None if 'auto' in WHISPER_LANGUAGE else WHISPER_LANGUAGE
    rprint("[bold yellow] You can ignore warning of `Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118...`[/bold yellow]")
    model = whisperx.load_model(model_name, device, compute_type=compute_type, language=whisper_language, vad_options=vad_options, asr_options=asr_options, download_root=MODEL_DIR)

    def load_audio_segment(audio_file, start, end):
        # Use whisperx's ffmpeg-based loader instead of librosa.load() which
        # deadlocks inside Streamlit's ScriptRunner thread.
        full_audio = _whisperx_load_audio(audio_file, sr=_WHISPERX_SR)
        start_sample = int(start * _WHISPERX_SR)
        end_sample = int(end * _WHISPERX_SR)
        return full_audio[start_sample:end_sample]

    raw_audio_segment = load_audio_segment(raw_audio_file, start, end)
    vocal_audio_segment = load_audio_segment(vocal_audio_file, start, end)

    # -------------------------
    # 1. transcribe raw audio
    # -------------------------
    transcribe_start_time = time.time()
    rprint("[bold green]Note: You will see Progress if working correctly ↓[/bold green]")
    result = model.transcribe(raw_audio_segment, batch_size=batch_size, print_progress=True)
    transcribe_time = time.time() - transcribe_start_time
    rprint(f"[cyan]⏱️ time transcribe:[/cyan] {transcribe_time:.2f}s")

    # Free GPU resources
    del model
    torch.cuda.empty_cache()

    # Save language
    update_key("whisper.language", result['language'])
    if result['language'] == 'zh' and WHISPER_LANGUAGE != 'zh':
        raise ValueError("Please specify the transcription language as zh and try again!")

    # -------------------------
    # 2. align by vocal audio
    # -------------------------
    align_start_time = time.time()
    # Align timestamps using vocal audio
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, vocal_audio_segment, device, return_char_alignments=False)
    align_time = time.time() - align_start_time
    rprint(f"[cyan]⏱️ time align:[/cyan] {align_time:.2f}s")

    # Free GPU resources again: drop the reference first, otherwise
    # empty_cache() cannot release memory that is still referenced.
    del model_a
    torch.cuda.empty_cache()

    # Adjust timestamps so they are relative to the full file, not the slice.
    for segment in result['segments']:
        segment['start'] += start
        segment['end'] += start
        for word in segment.get('words', []):
            if 'start' in word:
                word['start'] += start
            if 'end' in word:
                word['end'] += start
    return result
Choose the best splitting approach ## Given Text {sentence} ## Output in only JSON format and no other text ```json {{ "analysis": "Brief description of sentence structure, complexity, and key splitting challenges", "split1": "First splitting approach with [br] tags at split positions", "split2": "Alternative splitting approach with [br] tags at split positions", "assess": "Comparison of both approaches highlighting their strengths and weaknesses", "choice": "1 or 2" }} ``` Note: Start you answer with ```json and end with ```, do not add any other text. """.strip() return split_prompt """{{ "analysis": "Brief analysis of the text structure", "split": "Complete sentence with [br] tags at split positions" }}""" ## ================================================================ # @ step4_1_summarize.py def get_summary_prompt(source_content, custom_terms_json=None): src_lang = load_key("whisper.detected_language") tgt_lang = load_key("target_language") # add custom terms note terms_note = "" if custom_terms_json: terms_list = [] for term in custom_terms_json['terms']: terms_list.append(f"- {term['src']}: {term['tgt']} ({term['note']})") terms_note = "\n### Existing Terms\nPlease exclude these terms in your extraction:\n" + "\n".join(terms_list) summary_prompt = f""" ## Role You are a video translation expert and terminology consultant, specializing in {src_lang} comprehension and {tgt_lang} expression optimization. ## Task For the provided {src_lang} video text: 1. Summarize main topic in two sentences 2. Extract professional terms/names with {tgt_lang} translations (excluding existing terms) 3. Provide brief explanation for each term {terms_note} Steps: 1. Topic Summary: - Quick scan for general understanding - Write two sentences: first for main topic, second for key point 2. 
Term Extraction: - Mark professional terms and names (excluding those listed in Existing Terms) - Provide {tgt_lang} translation or keep original - Add brief explanation - Extract less than 15 terms ## INPUT {source_content} ## Output in only JSON format and no other text {{ "theme": "Two-sentence video summary", "terms": [ {{ "src": "{src_lang} term", "tgt": "{tgt_lang} translation or original", "note": "Brief explanation" }}, ... ] }} ## Example {{ "theme": "本视频介绍人工智能在医疗领域的应用现状。重点展示了AI在医学影像诊断和药物研发中的突破性进展。", "terms": [ {{ "src": "Machine Learning", "tgt": "机器学习", "note": "AI的核心技术,通过数据训练实现智能决策" }}, {{ "src": "CNN", "tgt": "CNN", "note": "卷积神经网络,用于医学图像识别的深度学习模型" }} ] }} Note: Start you answer with ```json and end with ```, do not add any other text. """.strip() return summary_prompt ## ================================================================ # @ step5_translate.py & translate_lines.py def generate_shared_prompt(previous_content_prompt, after_content_prompt, summary_prompt, things_to_note_prompt): return f'''### Context Information {previous_content_prompt} {after_content_prompt} ### Content Summary {summary_prompt} ### Points to Note {things_to_note_prompt}''' def get_prompt_faithfulness(lines, shared_prompt): TARGET_LANGUAGE = load_key("target_language") # Split lines by \n line_splits = lines.split('\n') json_dict = {} for i, line in enumerate(line_splits, 1): json_dict[f"{i}"] = {"origin": line, "direct": f"direct {TARGET_LANGUAGE} translation {i}."} json_format = json.dumps(json_dict, indent=2, ensure_ascii=False) src_language = load_key("whisper.detected_language") prompt_faithfulness = f''' ## Role You are a professional Netflix subtitle translator, fluent in both {src_language} and {TARGET_LANGUAGE}, as well as their respective cultures. Your expertise lies in accurately understanding the semantics and structure of the original {src_language} text and faithfully translating it into {TARGET_LANGUAGE} while preserving the original meaning. 
## Task We have a segment of original {src_language} subtitles that need to be directly translated into {TARGET_LANGUAGE}. These subtitles come from a specific context and may contain specific themes and terminology. 1. Translate the original {src_language} subtitles into {TARGET_LANGUAGE} line by line 2. Ensure the translation is faithful to the original, accurately conveying the original meaning 3. Consider the context and professional terminology {shared_prompt} 1. Faithful to the original: Accurately convey the content and meaning of the original text, without arbitrarily changing, adding, or omitting content. 2. Accurate terminology: Use professional terms correctly and maintain consistency in terminology. 3. Understand the context: Fully comprehend and reflect the background and contextual relationships of the text. ## INPUT {lines} ## Output in only JSON format and no other text ```json {json_format} ``` Note: Start you answer with ```json and end with ```, do not add any other text. ''' return prompt_faithfulness.strip() def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt): TARGET_LANGUAGE = load_key("target_language") json_format = { key: { "origin": value["origin"], "direct": value["direct"], "reflect": "your reflection on direct translation", "free": "your free translation" } for key, value in faithfulness_result.items() } json_format = json.dumps(json_format, indent=2, ensure_ascii=False) src_language = load_key("whisper.detected_language") prompt_expressiveness = f''' ## Role You are a professional Netflix subtitle translator and language consultant. Your expertise lies not only in accurately understanding the original {src_language} but also in optimizing the {TARGET_LANGUAGE} translation to better suit the target language's expression habits and cultural background. ## Task We already have a direct translation version of the original {src_language} subtitles. 
Your task is to reflect on and improve these direct translations to create more natural and fluent {TARGET_LANGUAGE} subtitles. 1. Analyze the direct translation results line by line, pointing out existing issues 2. Provide detailed modification suggestions 3. Perform free translation based on your analysis 4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read 5. Do not leave empty lines in the free translation, as the subtitles are for the audience to read {shared_prompt} Please use a two-step thinking process to handle the text line by line: 1. Direct Translation Reflection: - Evaluate language fluency - Check if the language style is consistent with the original text - Check the conciseness of the subtitles, point out where the translation is too wordy 2. {TARGET_LANGUAGE} Free Translation: - Aim for contextual smoothness and naturalness, conforming to {TARGET_LANGUAGE} expression habits - Ensure it's easy for {TARGET_LANGUAGE} audience to understand and accept - Adapt the language style to match the theme (e.g., use casual language for tutorials, professional terminology for technical content, formal language for documentaries) ## INPUT {lines} ## Output in only JSON format and no other text ```json {json_format} ``` Note: Start you answer with ```json and end with ```, do not add any other text. 
''' return prompt_expressiveness.strip() ## ================================================================ # @ step6_splitforsub.py def get_align_prompt(src_sub, tr_sub, src_part): targ_lang = load_key("target_language") src_lang = load_key("whisper.detected_language") src_splits = src_part.split('\n') num_parts = len(src_splits) src_part = src_part.replace('\n', ' [br] ') align_parts_json = ','.join( f''' {{ "src_part_{i+1}": "{src_splits[i]}", "target_part_{i+1}": "Corresponding aligned {targ_lang} subtitle part" }}''' for i in range(num_parts) ) align_prompt = f''' ## Role You are a Netflix subtitle alignment expert fluent in both {src_lang} and {targ_lang}. ## Task We have {src_lang} and {targ_lang} original subtitles for a Netflix program, as well as a pre-processed split version of {src_lang} subtitles. Your task is to create the best splitting scheme for the {targ_lang} subtitles based on this information. 1. Analyze the word order and structural correspondence between {src_lang} and {targ_lang} subtitles 2. Split the {targ_lang} subtitles according to the pre-processed {src_lang} split version 3. Never leave empty lines. If it's difficult to split based on meaning, you may appropriately rewrite the sentences that need to be aligned 4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read ## INPUT {src_lang} Original: "{src_sub}" {targ_lang} Original: "{tr_sub}" Pre-processed {src_lang} Subtitles ([br] indicates split points): {src_part} ## Output in only JSON format and no other text ```json {{ "analysis": "Brief analysis of word order, structure, and semantic correspondence between two subtitles", "align": [ {align_parts_json} ] }} ``` Note: Start you answer with ```json and end with ```, do not add any other text. 
'''.strip() return align_prompt ## ================================================================ # @ step8_gen_audio_task.py @ step10_gen_audio.py def get_subtitle_trim_prompt(text, duration): rule = '''Consider a. Reducing filler words without modifying meaningful content. b. Omitting unnecessary modifiers or pronouns, for example: - "Please explain your thought process" can be shortened to "Please explain thought process" - "We need to carefully analyze this complex problem" can be shortened to "We need to analyze this problem" - "Let's discuss the various different perspectives on this topic" can be shortened to "Let's discuss different perspectives on this topic" - "Can you describe in detail your experience from yesterday" can be shortened to "Can you describe yesterday's experience" ''' trim_prompt = f''' ## Role You are a professional subtitle editor, editing and optimizing lengthy subtitles that exceed voiceover time before handing them to voice actors. Your expertise lies in cleverly shortening subtitles slightly while ensuring the original meaning and structure remain unchanged. ## INPUT Subtitle: "{text}" Duration: {duration} seconds ## Processing Rules {rule} ## Processing Steps Please follow these steps and provide the results in the JSON output: 1. Analysis: Briefly analyze the subtitle's structure, key information, and filler words that can be omitted. 2. Trimming: Based on the rules and analysis, optimize the subtitle by making it more concise according to the processing rules. ## Output in only JSON format and no other text ```json {{ "analysis": "Brief analysis of the subtitle, including structure, key information, and potential processing locations", "result": "Optimized and shortened subtitle in the original subtitle language" }} ``` Note: Start you answer with ```json and end with ```, do not add any other text. 
def get_spacy_model(language: str):
    """Return the spaCy model name configured for *language*.

    The lookup in ``SPACY_MODEL_MAP`` is case-insensitive on the argument
    (it is lower-cased first). When no entry exists, a warning is printed
    and ``"en_core_web_md"`` is returned as the fallback.
    """
    # Use the same normalised key for the lookup and for the "unsupported"
    # check; testing the raw string warned about e.g. "EN" even though its
    # model had been found.
    lang_key = language.lower()
    if lang_key not in SPACY_MODEL_MAP:
        rprint(f"[yellow]Spacy model does not support '{language}', using en_core_web_md model as fallback...[/yellow]")
        return "en_core_web_md"
    return SPACY_MODEL_MAP[lang_key]
def is_valid_phrase(phrase):
    """Return True when *phrase* holds both a subject-like and a verb-like token.

    A token counts as a subject when its dependency label is ``nsubj`` or
    ``nsubjpass`` or when it is tagged ``PRON``; it counts as a verb when it
    is tagged ``VERB`` or ``AUX``.
    """
    subject_found = False
    verb_found = False
    for tok in phrase:
        if tok.dep_ in ("nsubj", "nsubjpass") or tok.pos_ == "PRON":
            subject_found = True
        if tok.pos_ in ("VERB", "AUX"):
            verb_found = True
    return subject_found and verb_found


def analyze_comma(start, doc, token):
    """Decide whether the comma *token* is a good place to cut *doc*.

    Looks at up to nine tokens on each side of the comma (the left window
    never reaches back past *start*). The cut is accepted only when the right
    window is a valid phrase, the left window has more than three
    non-punctuation tokens, and more than three tokens follow the comma
    before the next punctuation mark.
    """
    comma_idx = token.i
    before = doc[max(start, comma_idx - 9):comma_idx]
    after = doc[comma_idx + 1:min(len(doc), comma_idx + 10)]

    # Only the right-hand side has to be a clause of its own.
    if not is_valid_phrase(after):
        return False

    left_count = sum(1 for tok in before if not tok.is_punct)

    # Only the run of words up to the first punctuation mark is counted.
    right_count = 0
    for tok in after:
        if tok.is_punct:
            break
        right_count += 1

    return left_count > 3 and right_count > 3


def split_by_comma(text, nlp):
    """Split *text* at every comma accepted by ``analyze_comma``.

    Both the ASCII and the full-width comma are considered. The comma itself
    is dropped from the output. Returns the list of stripped pieces (always
    at least one element).
    """
    doc = nlp(text)
    pieces = []
    seg_start = 0

    for token in doc:
        if token.text not in (",", ","):
            continue
        if not analyze_comma(seg_start, doc, token):
            continue
        pieces.append(doc[seg_start:token.i].text.strip())
        rprint(f"[yellow]✂️ Split at comma: {doc[seg_start:token.i][-4:]},| {doc[token.i + 1:][:4]}[/yellow]")
        seg_start = token.i + 1

    # Whatever follows the last accepted comma is the final piece.
    pieces.append(doc[seg_start:].text.strip())
    return pieces


def split_by_comma_main(nlp):
    """Apply ``split_by_comma`` to every line of ``SPLIT_BY_MARK_FILE``.

    The pieces are written one per line to ``SPLIT_BY_COMMA_FILE`` and the
    input file is removed afterwards.
    """
    with open(SPLIT_BY_MARK_FILE, "r", encoding="utf-8") as src:
        raw_lines = src.readlines()

    results = []
    for line in raw_lines:
        results.extend(split_by_comma(line.strip(), nlp))

    with open(SPLIT_BY_COMMA_FILE, "w", encoding="utf-8") as dst:
        for piece in results:
            dst.write(piece + "\n")

    # The intermediate input is no longer needed once the output exists.
    os.remove(SPLIT_BY_MARK_FILE)

    rprint(f"[green]💾 Sentences split by commas saved to → `{SPLIT_BY_COMMA_FILE}`[/green]")
Check if the token is one of the target connectors based on the language. 2. For 'that' (English), check if it's part of a contraction (e.g., that's, that'll). 3. For all connectors, check if they function as a specific dependency of a verb or noun. 4. Default to splitting for certain connectors if no other conditions are met. 5. For coordinating conjunctions, check if they connect two independent clauses. """ lang = doc.lang_ if lang == "en": connectors = ["that", "which", "where", "when", "because", "but", "and", "or"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "zh": connectors = ["因为", "所以", "但是", "而且", "虽然", "如果", "即使", "尽管"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "ja": connectors = ["けれども", "しかし", "だから", "それで", "ので", "のに", "ため"] mark_dep = "mark" det_pron_deps = ["case"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "fr": connectors = ["que", "qui", "où", "quand", "parce que", "mais", "et", "ou"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "ru": connectors = ["что", "который", "где", "когда", "потому что", "но", "и", "или"] mark_dep = "mark" det_pron_deps = ["det"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "es": connectors = ["que", "cual", "donde", "cuando", "porque", "pero", "y", "o"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "de": connectors = ["dass", "welche", "wo", "wann", "weil", "aber", "und", "oder"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] elif lang == "it": connectors = ["che", "quale", "dove", "quando", "perché", "ma", "e", "o"] mark_dep = "mark" det_pron_deps = ["det", "pron"] verb_pos = "VERB" noun_pos = ["NOUN", "PROPN"] else: return False, False if token.text.lower() not in connectors: 
def split_by_connectors(text, context_words=5, nlp=None):
    """Repeatedly cut *text* in front of connector words.

    Each pass re-parses every current piece with *nlp* and makes at most one
    cut per piece, so a sentence is never fragmented in several places at
    once. A cut is made before a token when ``analyze_connectors`` flags it,
    the next token is not a clitic such as ``'s``, and at least
    *context_words* non-punctuation tokens are found in the
    *context_words*-token window on each side. Passes continue until one
    produces no cut.

    Returns the list of pieces from the last pass that produced a cut, or
    ``[doc.text]`` when nothing was ever cut.
    """
    clitics = ["'s", "'re", "'ve", "'ll", "'d"]
    sentences = [nlp(text).text]  # init

    while True:
        next_round = []
        cut_made = False

        for piece in sentences:
            doc = nlp(piece)
            begin = 0

            for idx, tok in enumerate(doc):
                should_split, _ = analyze_connectors(doc, tok)

                # Never cut in front of a word that carries a contraction.
                if idx + 1 < len(doc) and doc[idx + 1].text in clitics:
                    continue
                if not should_split:
                    continue

                window_left = doc[max(0, tok.i - context_words):tok.i]
                window_right = doc[tok.i+1:min(len(doc), tok.i + context_words + 1)]
                left = [w.text for w in window_left if not w.is_punct]
                right = [w.text for w in window_right if not w.is_punct]

                # Both sides need enough real words to stand on their own.
                if len(left) < context_words or len(right) < context_words:
                    continue

                rprint(f"[yellow]✂️ Split before '{tok.text}': {' '.join(left)}| {tok.text} {' '.join(right)}[/yellow]")
                next_round.append(doc[begin:tok.i].text.strip())
                begin = tok.i
                cut_made = True
                break

            if begin < len(doc):
                next_round.append(doc[begin:].text.strip())

        if not cut_made:
            break

        sentences = next_round

    return sentences
def split_by_mark(nlp):
    """Split the cleaned transcript into sentences and save them one per line.

    Reads ``output/log/cleaned_chunks.xlsx``, joins the ``text`` column with
    the language-specific joiner, lets *nlp* segment the result into
    sentences, and writes them to ``SPLIT_BY_MARK_FILE``.

    Two merge rules are applied:

    * A sentence that starts with ``-`` or ``...``, or that follows a
      sentence ending in ``-`` or ``...``, is joined to the preceding one
      with a space.
    * A sentence consisting only of a single punctuation mark (this happens
      in Chinese, Japanese, etc.) is appended to the previous line; the next
      sentence starts on a new line.
    """
    whisper_language = load_key("whisper.language")
    # An explicitly configured language wins over the detected one.
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language
    joiner = get_joiner(language)
    rprint(f"[blue]🔍 Using {language} language joiner: '{joiner}'[/blue]")
    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    # NOTE(review): .strip("") removes nothing — confirm whether .strip() was meant.
    chunks.text = chunks.text.apply(lambda x: x.strip('"').strip(""))

    # join with joiner
    input_text = joiner.join(chunks.text.to_list())

    doc = nlp(input_text)
    assert doc.has_annotation("SENT_START")

    # Group sentences so that '-' and '...' do not act as sentence breaks.
    sentences_by_mark = []
    current_sentence = []
    for sent in doc.sents:
        text = sent.text.strip()
        if current_sentence and (
            text.startswith(('-', '...'))
            or current_sentence[-1].endswith(('-', '...'))
        ):
            current_sentence.append(text)
        else:
            if current_sentence:
                sentences_by_mark.append(' '.join(current_sentence))
                current_sentence = []
            current_sentence.append(text)

    # add the last sentence
    if current_sentence:
        sentences_by_mark.append(' '.join(current_sentence))

    # Merge punctuation-only sentences in memory. Rewinding the text-mode
    # file with seek(tell() - 1) is undefined for text streams, leaves a
    # stray '\r' where '\n' is written as '\r\n', and glued the following
    # sentence onto the same line because no newline was re-written.
    punctuation_only = (',', '.', ',', '。', '?', '!')
    merged = []
    for sentence in sentences_by_mark:
        if merged and sentence.strip() in punctuation_only:
            merged[-1] += sentence
        else:
            merged.append(sentence)

    with open(SPLIT_BY_MARK_FILE, "w", encoding="utf-8") as output_file:
        output_file.writelines(line + "\n" for line in merged)

    rprint(f"[green]💾 Sentences split by punctuation marks saved to → `{SPLIT_BY_MARK_FILE}`[/green]")
def split_extremely_long_sentence(doc):
    """Cut *doc* into roughly equal parts by token count.

    The number of parts is ``ceil(n / 60)`` for ``n`` tokens; every part
    except the last has ``n // parts`` tokens and the last one takes the
    remainder. Tokens of a part are re-joined with the language-specific
    joiner.

    Returns:
        The list of part strings; an empty list for an empty *doc*.
    """
    tokens = [token.text for token in doc]
    n = len(tokens)
    # An empty doc would give num_parts == 0 and a division by zero below.
    if n == 0:
        return []

    num_parts = (n + 59) // 60  # round up
    part_length = n // num_parts

    whisper_language = load_key("whisper.language")
    # An explicitly configured language wins over the detected one.
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language
    joiner = get_joiner(language)

    sentences = []
    for i in range(num_parts):
        start = i * part_length
        end = start + part_length if i < num_parts - 1 else n
        sentences.append(joiner.join(tokens[start:end]))
    return sentences


def split_long_by_root_main(nlp):
    """Split over-long lines of ``SPLIT_BY_CONNECTOR_FILE`` and save the result.

    Lines with more than 60 tokens go through ``split_long_sentence``; if any
    resulting piece is still longer than 60 tokens, every piece is passed
    through ``split_extremely_long_sentence``. Empty or punctuation-only
    lines are appended to the previous output line (or dropped when there is
    none). The result is written to ``_3_1_SPLIT_BY_NLP`` and the input file
    is removed.
    """
    with open(SPLIT_BY_CONNECTOR_FILE, "r", encoding="utf-8") as input_file:
        sentences = input_file.readlines()

    all_split_sentences = []
    for sentence in sentences:
        doc = nlp(sentence.strip())
        if len(doc) > 60:
            split_sentences = split_long_sentence(doc)
            if any(len(nlp(sent)) > 60 for sent in split_sentences):
                split_sentences = [subsent for sent in split_sentences for subsent in split_extremely_long_sentence(nlp(sent))]
            all_split_sentences.extend(split_sentences)
            rprint(f"[yellow]✂️ Splitting long sentences by root: {sentence[:30]}...[/yellow]")
        else:
            all_split_sentences.append(sentence.strip())

    punctuation = string.punctuation + "'" + '"'  # include all punctuation and apostrophe ' and "

    # Merge before writing: the previous line used to be on disk already when
    # the punctuation was appended to it, so the merge had no effect and the
    # punctuation was silently lost.
    output_lines = []
    for i, sentence in enumerate(all_split_sentences):
        stripped_sentence = sentence.strip()
        if not stripped_sentence or all(char in punctuation for char in stripped_sentence):
            rprint(f"[yellow]⚠️ Warning: Empty or punctuation-only line detected at index {i}[/yellow]")
            if output_lines:
                output_lines[-1] += sentence
            continue
        output_lines.append(sentence)

    with open(_3_1_SPLIT_BY_NLP, "w", encoding="utf-8") as output_file:
        output_file.writelines(line + "\n" for line in output_lines)

    # delete the original file
    os.remove(SPLIT_BY_CONNECTOR_FILE)

    rprint(f"[green]💾 Long sentences split by root saved to → {_3_1_SPLIT_BY_NLP}[/green]")
Download or Upload Video")) with st.container(border=True): try: video_file = find_video_files() st.video(video_file) if st.button(t("Delete and Reselect"), key="delete_video_button"): os.remove(video_file) if os.path.exists(OUTPUT_DIR): shutil.rmtree(OUTPUT_DIR) sleep(1) st.rerun() return True except: col1, col2 = st.columns([3, 1]) with col1: url = st.text_input(t("Enter YouTube link:")) with col2: res_dict = { "360p": "360", "1080p": "1080", "Best": "best" } target_res = load_key("ytb_resolution") res_options = list(res_dict.keys()) default_idx = list(res_dict.values()).index(target_res) if target_res in res_dict.values() else 0 res_display = st.selectbox(t("Resolution"), options=res_options, index=default_idx) res = res_dict[res_display] if st.button(t("Download Video"), key="download_button", width="stretch"): if url: with st.spinner("Downloading video..."): download_video_ytdlp(url, resolution=res) st.rerun() uploaded_file = st.file_uploader(t("Or upload video"), type=load_key("allowed_video_formats") + load_key("allowed_audio_formats")) if uploaded_file: if os.path.exists(OUTPUT_DIR): shutil.rmtree(OUTPUT_DIR) os.makedirs(OUTPUT_DIR, exist_ok=True) raw_name = uploaded_file.name.replace(' ', '_') name, ext = os.path.splitext(raw_name) clean_name = re.sub(r'[^\w\-_\.]', '', name) + ext.lower() with open(os.path.join(OUTPUT_DIR, clean_name), "wb") as f: f.write(uploaded_file.getbuffer()) if ext.lower() in load_key("allowed_audio_formats"): convert_audio_to_video(os.path.join(OUTPUT_DIR, clean_name)) st.rerun() else: return False def convert_audio_to_video(audio_file: str) -> str: output_video = os.path.join(OUTPUT_DIR, 'black_screen.mp4') if not os.path.exists(output_video): print(f"🎵➡️🎬 Converting audio to video with FFmpeg ......") ffmpeg_cmd = ['ffmpeg', '-y', '-f', 'lavfi', '-i', 'color=c=black:s=640x360', '-i', audio_file, '-shortest', '-c:v', 'libx264', '-c:a', 'aac', '-pix_fmt', 'yuv420p', output_video] subprocess.run(ffmpeg_cmd, check=True, 
def download_subtitle_zip_button(text: str):
    """Render a Streamlit download button that serves all subtitles as one zip.

    Every ``.srt`` file found directly inside the ``output`` directory is
    added to an in-memory archive named ``subtitles.zip``.

    Args:
        text: Label shown on the download button.
    """
    output_dir = "output"
    srt_names = [entry for entry in os.listdir(output_dir) if entry.endswith(".srt")]

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as archive:
        for srt_name in srt_names:
            with open(os.path.join(output_dir, srt_name), "rb") as srt_file:
                archive.writestr(srt_name, srt_file.read())

    # Rewind so the button streams the archive from its first byte.
    zip_buffer.seek(0)
    st.download_button(
        label=text,
        data=zip_buffer,
        file_name="subtitles.zip",
        mime="application/zip",
    )
Star on GitHub 🌟
""" button_style = """ """ ================================================ FILE: core/st_utils/sidebar_setting.py ================================================ import streamlit as st from translations.translations import translate as t from translations.translations import DISPLAY_LANGUAGES from core.utils import * def config_input(label, key, help=None): """Generic config input handler""" val = st.text_input(label, value=load_key(key), help=help) if val != load_key(key): update_key(key, val) return val def page_setting(): display_language = st.selectbox("Display Language 🌐", options=list(DISPLAY_LANGUAGES.keys()), index=list(DISPLAY_LANGUAGES.values()).index(load_key("display_language"))) if DISPLAY_LANGUAGES[display_language] != load_key("display_language"): update_key("display_language", DISPLAY_LANGUAGES[display_language]) st.rerun() # with st.expander(t("Youtube Settings"), expanded=True): # config_input(t("Cookies Path"), "youtube.cookies_path") with st.expander(t("LLM Configuration"), expanded=True): config_input(t("API_KEY"), "api.key") config_input(t("BASE_URL"), "api.base_url", help=t("Openai format, will add /v1/chat/completions automatically")) c1, c2 = st.columns([4, 1]) with c1: config_input(t("MODEL"), "api.model", help=t("click to check API validity")+ " 👉") with c2: if st.button("📡", key="api"): st.toast(t("API Key is valid") if check_api() else t("API Key is invalid"), icon="✅" if check_api() else "❌") llm_support_json = st.toggle(t("LLM JSON Format Support"), value=load_key("api.llm_support_json"), help=t("Enable if your LLM supports JSON mode output")) if llm_support_json != load_key("api.llm_support_json"): update_key("api.llm_support_json", llm_support_json) st.rerun() with st.expander(t("Subtitles Settings"), expanded=True): c1, c2 = st.columns(2) with c1: langs = { "🇺🇸 English": "en", "🇨🇳 简体中文": "zh", "🇪🇸 Español": "es", "🇷🇺 Русский": "ru", "🇫🇷 Français": "fr", "🇩🇪 Deutsch": "de", "🇮🇹 Italiano": "it", "🇯🇵 日本語": "ja" } lang = 
st.selectbox( t("Recog Lang"), options=list(langs.keys()), index=list(langs.values()).index(load_key("whisper.language")) ) if langs[lang] != load_key("whisper.language"): update_key("whisper.language", langs[lang]) st.rerun() runtime = st.selectbox(t("WhisperX Runtime"), options=["local", "cloud", "elevenlabs"], index=["local", "cloud", "elevenlabs"].index(load_key("whisper.runtime")), help=t("Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key")) if runtime != load_key("whisper.runtime"): update_key("whisper.runtime", runtime) st.rerun() if runtime == "cloud": config_input(t("WhisperX 302ai API"), "whisper.whisperX_302_api_key") if runtime == "elevenlabs": config_input(("ElevenLabs API"), "whisper.elevenlabs_api_key") with c2: target_language = st.text_input(t("Target Lang"), value=load_key("target_language"), help=t("Input any language in natural language, as long as llm can understand")) if target_language != load_key("target_language"): update_key("target_language", target_language) st.rerun() demucs = st.toggle(t("Vocal separation enhance"), value=load_key("demucs"), help=t("Recommended for videos with loud background noise, but will increase processing time")) if demucs != load_key("demucs"): update_key("demucs", demucs) st.rerun() burn_subtitles = st.toggle(t("Burn-in Subtitles"), value=load_key("burn_subtitles"), help=t("Whether to burn subtitles into the video, will increase processing time")) if burn_subtitles != load_key("burn_subtitles"): update_key("burn_subtitles", burn_subtitles) st.rerun() with st.expander(t("Dubbing Settings"), expanded=True): tts_methods = ["azure_tts", "openai_tts", "fish_tts", "sf_fish_tts", "edge_tts", "gpt_sovits", "custom_tts", "sf_cosyvoice2", "f5tts"] select_tts = st.selectbox(t("TTS Method"), options=tts_methods, index=tts_methods.index(load_key("tts_method"))) if select_tts != load_key("tts_method"): update_key("tts_method", select_tts) st.rerun() # sub 
settings for each tts method if select_tts == "sf_fish_tts": config_input(t("SiliconFlow API Key"), "sf_fish_tts.api_key") # Add mode selection dropdown mode_options = { "preset": t("Preset"), "custom": t("Refer_stable"), "dynamic": t("Refer_dynamic") } selected_mode = st.selectbox( t("Mode Selection"), options=list(mode_options.keys()), format_func=lambda x: mode_options[x], index=list(mode_options.keys()).index(load_key("sf_fish_tts.mode")) if load_key("sf_fish_tts.mode") in mode_options.keys() else 0 ) if selected_mode != load_key("sf_fish_tts.mode"): update_key("sf_fish_tts.mode", selected_mode) st.rerun() if selected_mode == "preset": config_input("Voice", "sf_fish_tts.voice") elif select_tts == "openai_tts": config_input("302ai API", "openai_tts.api_key") config_input(t("OpenAI Voice"), "openai_tts.voice") elif select_tts == "fish_tts": config_input("302ai API", "fish_tts.api_key") fish_tts_character = st.selectbox(t("Fish TTS Character"), options=list(load_key("fish_tts.character_id_dict").keys()), index=list(load_key("fish_tts.character_id_dict").keys()).index(load_key("fish_tts.character"))) if fish_tts_character != load_key("fish_tts.character"): update_key("fish_tts.character", fish_tts_character) st.rerun() elif select_tts == "azure_tts": config_input("302ai API", "azure_tts.api_key") config_input(t("Azure Voice"), "azure_tts.voice") elif select_tts == "gpt_sovits": st.info(t("Please refer to Github homepage for GPT_SoVITS configuration")) config_input(t("SoVITS Character"), "gpt_sovits.character") refer_mode_options = {1: t("Mode 1: Use provided reference audio only"), 2: t("Mode 2: Use first audio from video as reference"), 3: t("Mode 3: Use each audio from video as reference")} selected_refer_mode = st.selectbox( t("Refer Mode"), options=list(refer_mode_options.keys()), format_func=lambda x: refer_mode_options[x], index=list(refer_mode_options.keys()).index(load_key("gpt_sovits.refer_mode")), help=t("Configure reference audio mode for GPT-SoVITS") ) 
def valid_translate_result(result: dict, required_keys: list, required_sub_keys: list):
    """Validate the JSON object returned by the LLM for a translation step.

    Args:
        result: Parsed LLM response; expected to map item ids to objects.
        required_keys: Item ids (strings) that must be present in ``result``.
        required_sub_keys: Keys that every item object must contain.

    Returns:
        ``{"status": "success", ...}`` when the structure is valid, otherwise
        ``{"status": "error", "message": ...}`` describing the first problem.
        Malformed input never raises, so the caller can always retry.
    """
    # The response comes from an LLM, so its shape cannot be trusted.
    if not isinstance(result, dict):
        return {"status": "error", "message": f"Expected a JSON object but got {type(result).__name__}"}

    # Check for the required key (kept in the requested order so the message is deterministic)
    missing_keys = [key for key in required_keys if key not in result]
    if missing_keys:
        return {"status": "error", "message": f"Missing required key(s): {', '.join(missing_keys)}"}

    # Check for required sub-keys in all items
    for key, item in result.items():
        # A plain string item would otherwise be checked by substring match
        # ("direct" in "direct translation") and wrongly accepted.
        if not isinstance(item, dict):
            return {"status": "error", "message": f"Item {key} must be an object with sub-key(s): {', '.join(required_sub_keys)}"}
        missing_sub_keys = [sub_key for sub_key in required_sub_keys if sub_key not in item]
        if missing_sub_keys:
            return {"status": "error", "message": f"Missing required sub-key(s) in item {key}: {', '.join(missing_sub_keys)}"}

    return {"status": "success", "message": "Translation completed"}
the specified key is missing def retry_translation(prompt, length, step_name): def valid_faith(response_data): return valid_translate_result(response_data, [str(i) for i in range(1, length+1)], ['direct']) def valid_express(response_data): return valid_translate_result(response_data, [str(i) for i in range(1, length+1)], ['free']) for retry in range(3): if step_name == 'faithfulness': result = ask_gpt(prompt+retry* " ", resp_type='json', valid_def=valid_faith, log_title=f'translate_{step_name}') elif step_name == 'expressiveness': result = ask_gpt(prompt+retry* " ", resp_type='json', valid_def=valid_express, log_title=f'translate_{step_name}') if len(lines.split('\n')) == len(result): return result if retry != 2: console.print(f'[yellow]⚠️ {step_name.capitalize()} translation of block {index} failed, Retry...[/yellow]') raise ValueError(f'[red]❌ {step_name.capitalize()} translation of block {index} failed after 3 retries. Please check `output/gpt_log/error.json` for more details.[/red]') ## Step 1: Faithful to the Original Text prompt1 = get_prompt_faithfulness(lines, shared_prompt) faith_result = retry_translation(prompt1, len(lines.split('\n')), 'faithfulness') for i in faith_result: faith_result[i]["direct"] = faith_result[i]["direct"].replace('\n', ' ') # If reflect_translate is False or not set, use faithful translation directly reflect_translate = load_key('reflect_translate') if not reflect_translate: # If reflect_translate is False or not set, use faithful translation directly translate_result = "\n".join([faith_result[i]["direct"].strip() for i in faith_result]) table = Table(title="Translation Results", show_header=False, box=box.ROUNDED) table.add_column("Translations", style="bold") for i, key in enumerate(faith_result): table.add_row(f"[cyan]Origin: {faith_result[key]['origin']}[/cyan]") table.add_row(f"[magenta]Direct: {faith_result[key]['direct']}[/magenta]") if i < len(faith_result) - 1: table.add_row("[yellow]" + "-" * 50 + "[/yellow]") 
console.print(table) return translate_result, lines ## Step 2: Express Smoothly prompt2 = get_prompt_expressiveness(faith_result, lines, shared_prompt) express_result = retry_translation(prompt2, len(lines.split('\n')), 'expressiveness') table = Table(title="Translation Results", show_header=False, box=box.ROUNDED) table.add_column("Translations", style="bold") for i, key in enumerate(express_result): table.add_row(f"[cyan]Origin: {faith_result[key]['origin']}[/cyan]") table.add_row(f"[magenta]Direct: {faith_result[key]['direct']}[/magenta]") table.add_row(f"[green]Free: {express_result[key]['free']}[/green]") if i < len(express_result) - 1: table.add_row("[yellow]" + "-" * 50 + "[/yellow]") console.print(table) translate_result = "\n".join([express_result[i]["free"].replace('\n', ' ').strip() for i in express_result]) if len(lines.split('\n')) != len(translate_result.split('\n')): console.print(Panel(f'[red]❌ Translation of block {index} failed, Length Mismatch, Please check `output/gpt_log/translate_expressiveness.json`[/red]')) raise ValueError(f'Origin ···{lines}···,\nbut got ···{translate_result}···') return translate_result, lines if __name__ == '__main__': # test e.g. lines = '''All of you know Andrew Ng as a famous computer science professor at Stanford. He was really early on in the development of neural networks with GPUs. Of course, a creator of Coursera and popular courses like deeplearning.ai. 
def upload_file_to_302(file_path):
    """Upload a local file to the 302.ai file endpoint.

    Args:
        file_path: Path of the file to upload.

    Returns:
        The ``data`` field of the JSON response when both the HTTP status and
        the response ``code`` are 200, otherwise ``None``.
    """
    api_key = load_key("f5tts.302_api")
    url = "https://api.302.ai/302/upload-file"
    headers = {'Authorization': f'Bearer {api_key}'}

    # Open the file in a context manager so the handle is closed even when
    # the request raises; the previous version leaked it on every call.
    with open(file_path, 'rb') as file_obj:
        files = [('file', (os.path.basename(file_path), file_obj, 'application/octet-stream'))]
        response = requests.request("POST", url, headers=headers, data={}, files=files)

    if response.status_code != 200:
        return None

    response_data = response.json()
    if response_data.get('code') == 200:
        return response_data.get('data')
    return None
def _merge_audio(files, output: str) -> bool:
    """Concatenate WAV files into ``output``, separated by short silences.

    Each input is followed by 100 ms of silence and one extra 100 ms pause is
    appended at the very end. The result is exported as 16 kHz mono 16-bit PCM.

    Returns:
        True when a non-empty file was written, False on any failure.
    """
    try:
        pause = AudioSegment.silent(duration=100)  # 100ms silence
        merged = AudioSegment.empty()

        # Append every clip followed by its pause
        for wav_path in files:
            clip = AudioSegment.from_wav(wav_path)
            merged += clip + pause
        merged += pause

        merged.export(output, format="wav", parameters=["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"])

        if os.path.getsize(output) > 0:
            rprint(f"[green]Successfully merged audio files")
            return True

        rprint(f"[red]Output file size is 0")
        return False
    except Exception as e:
        rprint(f"[red]Failed to merge audio: {str(e)}")
        return False
def f5_tts_for_videolingo(text: str, save_as: str, number: int, task_df):
    """Generate speech for ``text`` with F5-TTS, reusing one uploaded reference clip.

    On the first call the reference audio is selected from ``task_df``,
    volume-normalized and uploaded; the resulting URL is cached in the
    module-level ``UPLOADED_REFER_URL`` for later calls.

    Args:
        text: Text to synthesize.
        save_as: Destination path of the generated audio.
        number: Task number (unused here, kept for the common TTS signature).
        task_df: Task table passed to ``_get_ref_audio``.

    Returns:
        True when the audio was generated and saved, False otherwise.
    """
    global UPLOADED_REFER_URL

    # Only process the reference audio if we haven't uploaded it yet
    if UPLOADED_REFER_URL is None:
        refer_path = _get_ref_audio(task_df)
        # _get_ref_audio returns None/False when no usable reference exists;
        # passing that on would crash inside normalize_audio_volume.
        if not refer_path:
            rprint(f"[red]❌ No reference audio available, cannot run F5-TTS")
            return False

        normalized_refer_path = normalize_audio_volume(refer_path, f"{_AUDIO_REFERS_DIR}/refer_normalized.wav")
        uploaded_url = upload_file_to_302(normalized_refer_path)
        # upload_file_to_302 returns None on failure; leave the cache empty so
        # the next call retries instead of sending a null reference URL.
        if not uploaded_url:
            rprint(f"[red]❌ Failed to upload reference audio")
            return False

        UPLOADED_REFER_URL = uploaded_url
        rprint(f"[green]✅ Reference audio uploaded, URL cached for reuse")

    try:
        success = _f5_tts(text=text, refer_url=UPLOADED_REFER_URL, save_path=save_as)
        return success
    except Exception as e:
        print(f"Error in f5_tts_for_videolingo: {str(e)}")
        return False
def custom_tts(text, save_path):
    """
    Template hook for plugging in a user-provided TTS engine.

    The parent directory of ``save_path`` is created if needed; the actual
    synthesis is left to be implemented by the user.

    Args:
        text (str): Text to be converted to speech
        save_path (str): Path to save the audio file

    Returns:
        None

    Example:
        custom_tts("Hello world", "output.wav")
    """
    # Make sure the destination folder is there before any engine writes to it
    target = Path(save_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    try:
        # TODO: Implement your custom TTS logic here
        # 1. Initialize your TTS client/model
        # 2. Convert text to speech
        # 3. Save the audio file to the specified path
        print(f"Audio saved to {target}")
    except Exception as e:
        print(f"Error occurred during TTS conversion: {str(e)}")
class AdvancedSyllableEstimator:
    """Estimate speech duration of (possibly mixed-language) text from syllable counts."""

    def __init__(self):
        self.g2p_en = G2p()
        # Seconds per syllable for each language; 'default' is the fallback rate.
        self.duration_params = {'en': 0.225, 'zh': 0.21, 'ja': 0.21, 'fr': 0.22, 'es': 0.22, 'ko': 0.21, 'default': 0.22}
        # Checked in insertion order by _detect_language: first match wins.
        self.lang_patterns = {
            'zh': r'[\u4e00-\u9fff]',
            'ja': r'[\u3040-\u309f\u30a0-\u30ff]',
            'fr': r'[àâçéèêëîïôùûüÿœæ]',
            'es': r'[áéíóúñ¿¡]',
            'en': r'[a-zA-Z]+',
            'ko': r'[\uac00-\ud7af\u1100-\u11ff]'}
        # String used to re-join segments of the same language.
        self.lang_joiners = {'zh': '', 'ja': '', 'en': ' ', 'fr': ' ', 'es': ' ', 'ko': ' '}
        self.punctuation = {
            'mid': r'[,;:,;、]+',
            'end': r'[。!?.!?]+',
            'space': r'\s+',
            'pause': {'space': 0.15, 'default': 0.1}
        }

    def estimate_duration(self, text: str, lang: Optional[str] = None) -> float:
        """Return the estimated duration in seconds of ``text``.

        Languages without an entry in ``duration_params`` (and ``lang=None``)
        use the ``'default'`` rate instead of failing on a missing key.
        """
        syllable_count = self.count_syllables(text, lang)
        seconds_per_syllable = self.duration_params.get(lang, self.duration_params['default'])
        return syllable_count * seconds_per_syllable

    def count_syllables(self, text: str, lang: Optional[str] = None) -> int:
        """Count syllables in ``text``; the language is auto-detected when ``lang`` is None."""
        if not text.strip():
            return 0
        lang = lang or self._detect_language(text)

        vowels_map = {
            'fr': 'aeiouyàâéèêëîïôùûüÿœæ',
            'es': 'aeiouáéíóúü'
        }

        if lang == 'en':
            return self._count_english_syllables(text)
        elif lang == 'zh':
            text = re.sub(r'[^\u4e00-\u9fff]', '', text)
            return len(pinyin(text, style=Style.NORMAL))
        elif lang == 'ja':
            # Collapse contracted sounds (e.g. きょ) to their base kana so they
            # count as ONE unit. The former 'X' placeholder was not matched by
            # the counting regex below, so these sounds were counted as zero.
            text = re.sub(r'([きぎしじちぢにひびぴみり])[ょゅゃ]', r'\1', text)
            text = re.sub(r'[っー]', '', text)
            return len(re.findall(r'[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff]', text))
        elif lang in ('fr', 'es'):
            # French: drop word-final mute 'e' before counting vowel groups.
            text = re.sub(r'e\b', '', text.lower()) if lang == 'fr' else text.lower()
            return max(1, len(re.findall(f'[{vowels_map[lang]}]+', text)))
        elif lang == 'ko':
            return len(re.findall(r'[\uac00-\ud7af]', text))
        # Unknown language: fall back to one unit per whitespace-separated word.
        return len(text.split())

    def _count_english_syllables(self, text: str) -> int:
        """Count English syllables word by word, falling back to G2P phonemes."""
        total = 0
        for word in text.strip().split():
            try:
                total += syllables.estimate(word)
            except Exception:
                # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
                phones = self.g2p_en(word)
                total += max(1, len([p for p in phones if any(c in p for c in 'aeiou')]))
        return max(1, total)

    def _detect_language(self, text: str) -> str:
        """Return the first language whose pattern matches ``text`` ('en' if none)."""
        for lang, pattern in self.lang_patterns.items():
            if re.search(pattern, text):
                return lang
        return 'en'

    def process_mixed_text(self, text: str) -> dict:
        """Break ``text`` into language segments and estimate its total duration.

        Returns a dict with ``language_breakdown``, ``total_syllables``,
        ``punctuation``, ``spaces`` and ``estimated_duration`` (seconds).
        """
        if not text or not isinstance(text, str):
            return {
                'language_breakdown': {},
                'total_syllables': 0,
                'punctuation': [],
                'spaces': [],
                'estimated_duration': 0
            }

        result = {'language_breakdown': {}, 'total_syllables': 0, 'punctuation': [], 'spaces': []}
        # The capture group keeps the delimiters in the split result.
        segments = re.split(f"({self.punctuation['space']}|{self.punctuation['mid']}|{self.punctuation['end']})", text)
        total_duration = 0

        for i, segment in enumerate(segments):
            if not segment:
                continue

            if re.match(self.punctuation['space'], segment):
                prev_lang = self._detect_language(segments[i-1]) if i > 0 else None
                next_lang = self._detect_language(segments[i+1]) if i < len(segments)-1 else None
                # A space only counts as a pause when it touches a language
                # that is written without spaces (empty joiner).
                if prev_lang and next_lang and (self.lang_joiners[prev_lang] == '' or self.lang_joiners[next_lang] == ''):
                    result['spaces'].append(segment)
                    total_duration += self.punctuation['pause']['space']
            elif re.match(f"{self.punctuation['mid']}|{self.punctuation['end']}", segment):
                result['punctuation'].append(segment)
                total_duration += self.punctuation['pause']['default']
            else:
                lang = self._detect_language(segment)
                if lang:
                    # Named syllable_count so the `syllables` module is not shadowed.
                    syllable_count = self.count_syllables(segment, lang)
                    if lang not in result['language_breakdown']:
                        result['language_breakdown'][lang] = {'syllables': 0, 'text': ''}
                    breakdown = result['language_breakdown'][lang]
                    breakdown['syllables'] += syllable_count
                    breakdown['text'] += (self.lang_joiners[lang] + segment if breakdown['text'] else segment)
                    result['total_syllables'] += syllable_count
                    total_duration += syllable_count * self.duration_params.get(lang, self.duration_params['default'])

        result['estimated_duration'] = total_duration
        return result


def init_estimator():
    """Create a ready-to-use ``AdvancedSyllableEstimator``."""
    return AdvancedSyllableEstimator()
def estimate_duration(text: str, estimator: "AdvancedSyllableEstimator"):
    """Return the estimated spoken duration of ``text`` in seconds.

    Anything that is not a non-empty string yields 0; otherwise the value is
    the ``estimated_duration`` reported by ``estimator.process_mixed_text``.
    """
    if text and isinstance(text, str):
        breakdown = estimator.process_mixed_text(text)
        return breakdown['estimated_duration']
    return 0
def check_lang(text_lang, prompt_lang):
    """Normalize the text and prompt languages to the codes GPT-SoVITS accepts.

    Args:
        text_lang: Language of the text to synthesize; a natural-language
            name (e.g. "简体中文", "English") or a language code.
        prompt_lang: Language of the reference prompt, same accepted forms.

    Returns:
        Tuple ``(text_lang, prompt_lang)`` where each element is 'zh' or 'en'.

    Raises:
        ValueError: If either language is neither Chinese nor English.
    """
    # only support zh and en
    text_key = text_lang.lower().strip()
    # The bare code 'en' is matched exactly (or as an 'en-XX' / 'en_XX' locale)
    # rather than as a substring: free-form names such as "French" contain
    # "en" and must still be rejected.
    is_english_code = text_key == 'en' or text_key.startswith(('en-', 'en_'))
    if any(lang in text_key for lang in ['zh', 'cn', '中文', 'chinese']):
        text_lang = 'zh'
    elif is_english_code or any(lang in text_key for lang in ['英文', '英语', 'english']):
        text_lang = 'en'
    else:
        raise ValueError("Unsupported text language. Only Chinese and English are supported.")

    prompt_key = prompt_lang.lower()
    if any(lang in prompt_key for lang in ['en', 'english', '英文', '英语']):
        prompt_lang = 'en'
    elif any(lang in prompt_key for lang in ['zh', 'cn', '中文', 'chinese']):
        prompt_lang = 'zh'
    else:
        raise ValueError("Unsupported prompt language. Only Chinese and English are supported.")

    return text_lang, prompt_lang
def find_and_check_config_path(dubbing_character):
    """Locate the GPT-SoVITS-v2 install and the config of ``dubbing_character``.

    The directory four levels above this file is scanned for the first
    sub-directory whose name starts with ``GPT-SoVITS-v2``; the character
    config is expected at ``GPT_SoVITS/configs/<dubbing_character>.yaml``
    inside it.

    Returns:
        Tuple ``(gpt_sovits_dir, config_path)``.

    Raises:
        FileNotFoundError: If the directory or the config file is missing.
    """
    project_root = Path(__file__).resolve().parent.parent.parent
    search_root = project_root.parent

    # Look for the GPT-SoVITS-v2 directory; the first match wins
    for candidate in search_root.iterdir():
        if candidate.is_dir() and candidate.name.startswith('GPT-SoVITS-v2'):
            gpt_sovits_dir = candidate
            break
    else:
        raise FileNotFoundError("GPT-SoVITS-v2 directory not found in the parent directory.")

    config_path = gpt_sovits_dir / "GPT_SoVITS" / "configs" / f"{dubbing_character}.yaml"
    if config_path.exists():
        return gpt_sovits_dir, config_path

    raise FileNotFoundError(f"Config file not found at {config_path}")
use sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) result = sock.connect_ex(('127.0.0.1', 9880)) if result == 0: sock.close() return None sock.close() rprint("[bold yellow]🚀 Initializing GPT-SoVITS Server...[/bold yellow]") rprint("[bold yellow]🚀 正在初始化 GPT-SoVITS 服务器...[/bold yellow]") rprint("""[bold red]⏳ Please wait approximately 1 minute • A new command prompt will appear for the GPT-SoVITS API • Any `404 not found` warnings during startup are normal, please be patient[/bold red]""") rprint("""[bold red]⏳ 请等待大约1分钟 • GPT-SoVITS API 将会打开一个新的命令提示符窗口 • 启动过程中出现 `404 not found` 警告是正常的,请耐心等待[/bold red]""") # Find and check config path gpt_sovits_dir, config_path = find_and_check_config_path(load_key("gpt_sovits.character")) # Change to the GPT-SoVITS-v2 directory os.chdir(gpt_sovits_dir) # Start the GPT-SoVITS server if sys.platform == "win32": cmd = [ "runtime\\python.exe", "api_v2.py", "-a", "127.0.0.1", "-p", "9880", "-c", str(config_path) ] # Open the command in a new window on Windows process = subprocess.Popen(cmd, creationflags=subprocess.CREATE_NEW_CONSOLE) elif sys.platform == "darwin": # macOS print("Please manually start the GPT-SoVITS server at http://127.0.0.1:9880, refer to api_v2.py.") while True: user_input = input("Have you started the server? (y/n): ").lower() if user_input == 'y': process = None break elif user_input == 'n': raise Exception("Please start the server before continuing.") else: raise OSError("Unsupported operating system. Only Windows and macOS are supported.") # Change back to the original directory os.chdir(current_dir) # Wait for the server to start (max 30 seconds) start_time = time.time() while time.time() - start_time < 50: try: time.sleep(15) response = requests.get('http://127.0.0.1:9880/ping') if response.status_code == 200: print("GPT-SoVITS server is ready.") return process except requests.exceptions.RequestException: pass raise Exception("GPT-SoVITS server failed to start within 50 seconds. 
Please check if GPT-SoVITS-v2-xxx folder is set correctly.") ================================================ FILE: core/tts_backend/openai_tts.py ================================================ from pathlib import Path import requests import json from core.utils import load_key, except_handler BASE_URL = "https://api.302.ai/v1/audio/speech" VOICE_LIST = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] # voice options: alloy, echo, fable, onyx, nova, and shimmer # refer to: https://platform.openai.com/docs/guides/text-to-speech/quickstart @except_handler("Failed to generate audio using OpenAI TTS", retry=3, delay=1) def openai_tts(text, save_path): API_KEY = load_key("openai_tts.api_key") voice = load_key("openai_tts.voice") payload = json.dumps({ "model": "tts-1", "input": text, "voice": voice, "response_format": "wav" }) if voice not in VOICE_LIST: raise ValueError(f"Invalid voice: {voice}. Please choose from {VOICE_LIST}") headers = {'Authorization': f"Bearer {API_KEY}", 'Content-Type': 'application/json'} speech_file_path = Path(save_path) speech_file_path.parent.mkdir(parents=True, exist_ok=True) response = requests.post(BASE_URL, headers=headers, data=payload) if response.status_code == 200: with open(speech_file_path, 'wb') as f: f.write(response.content) print(f"Audio saved to {speech_file_path}") else: print(f"Error: {response.status_code}") print(response.text) if __name__ == "__main__": openai_tts("Hi! 
def wav_to_base64(wav_file_path):
    """Return the bytes of ``wav_file_path`` base64-encoded as a ``str``."""
    raw_bytes = Path(wav_file_path).read_bytes()
    return base64.b64encode(raw_bytes).decode('utf-8')


@except_handler("Failed to generate audio using SiliconFlow TTS")
def cosyvoice_tts_for_videolingo(text, save_as, number, task_df):
    """Generate speech for one task with CosyVoice2 on SiliconFlow.

    Args:
        text: Text to synthesize.
        save_as: Destination path of the generated WAV file.
        number: Task number; selects the ``origin`` text in ``task_df`` and
            the preferred reference clip ``output/audio/refers/<number>.wav``.
        task_df: DataFrame with at least the columns ``number`` and ``origin``.

    Returns:
        True once the audio has been streamed to ``save_as``.
    """
    prompt_text = task_df.loc[task_df['number'] == number, 'origin'].values[0]
    api_key = load_key("sf_cosyvoice2.api_key")

    # Set the reference audio path: prefer the clip belonging to this task.
    refers_dir = Path.cwd() / "output/audio/refers"
    ref_audio_path = refers_dir / f"{number}.wav"

    # If that clip does not exist, use the first clip as a fallback.
    # NOTE(review): prompt_text still comes from row `number` in that case,
    # not from row 1 - confirm this mismatch is intended.
    if not ref_audio_path.exists():
        ref_audio_path = refers_dir / "1.wav"

    # Still missing: try to (re)extract the reference clips.
    if not ref_audio_path.exists():
        try:
            from core._9_refer_audio import extract_refer_audio_main
            print(f"参考音频文件不存在,尝试提取: {ref_audio_path}")
            extract_refer_audio_main()
        except Exception as e:
            print(f"提取参考音频失败: {str(e)}")
            raise

    reference_base64 = wav_to_base64(ref_audio_path)
    reference = {"audio": f"data:audio/wav;base64,{reference_base64}", "text": prompt_text}

    client = OpenAI(api_key=api_key, base_url="https://api.siliconflow.cn/v1")

    save_path = Path(save_as)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    with client.audio.speech.with_streaming_response.create(
        model="FunAudioLLM/CosyVoice2-0.5B",
        voice="",
        input=text,
        response_format="wav",
        extra_body={"references": [reference]},
    ) as response:
        response.stream_to_file(save_path)

    print(f"音频已成功保存至: {save_path}")
    return True
def _http_error_details(response):
    """Return the decoded JSON body of ``response``, or its raw text if it is not JSON."""
    try:
        return response.json()
    except ValueError:  # the JSON decode errors raised by requests subclass ValueError
        return response.text


@except_handler("Failed to generate audio using SiliconFlow Fish TTS", retry=2, delay=1)
def siliconflow_fish_tts(text, save_path, mode="preset", voice_id=None, ref_audio=None, ref_text=None, check_duration=False):
    """Synthesize ``text`` with SiliconFlow Fish-Speech and save it as WAV.

    Args:
        text: Text to synthesize.
        save_path: Output path; its suffix is replaced by ``.wav``.
        mode: ``"preset"`` (voice from the ``sf_fish_tts.voice`` config key),
            ``"custom"`` (requires ``voice_id``) or ``"dynamic"`` (requires
            ``ref_audio`` and ``ref_text``).
        voice_id: Voice identifier used in custom mode.
        ref_audio: Path of the reference audio used in dynamic mode.
        ref_text: Transcript of ``ref_audio`` used in dynamic mode.
        check_duration: When True, print the duration of the generated file.

    Returns:
        True when the audio was generated and written.

    Raises:
        ValueError: For an unknown mode or missing mode-specific arguments.
        RuntimeError: When the API answers with a non-200 status. Previously
            this path crashed with a NameError on undefined ``attempt`` /
            ``max_retries`` names. Raising keeps the "exception on failure"
            behaviour, so the decorator still retries, but reports the real
            HTTP error.
    """
    sf_fish_set = load_key("sf_fish_tts")
    headers = {"Authorization": f'Bearer {sf_fish_set["api_key"]}', "Content-Type": "application/json"}
    payload = {"model": MODEL_NAME, "response_format": "wav", "stream": False, "input": text}

    if mode == "preset":
        payload["voice"] = f"fishaudio/fish-speech-1.4:{sf_fish_set['voice']}"
    elif mode == "custom":
        if not voice_id:
            raise ValueError("custom mode requires voice_id")
        payload["voice"] = voice_id
    elif mode == "dynamic":
        if not ref_audio or not ref_text:
            raise ValueError("dynamic mode requires ref_audio and ref_text")
        with open(ref_audio, 'rb') as f:
            audio_base64 = base64.b64encode(f.read()).decode('utf-8')
        # Dynamic mode sends no named voice, only the inline reference clip.
        payload["voice"] = None
        payload["references"] = [{"audio": f"data:audio/wav;base64,{audio_base64}", "text": ref_text}]
    else:
        raise ValueError("Invalid mode")

    response = requests.post(API_URL_SPEECH, json=payload, headers=headers)
    if response.status_code == 200:
        wav_file_path = Path(save_path).with_suffix('.wav')
        wav_file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(wav_file_path, 'wb') as f:
            f.write(response.content)

        if check_duration:
            duration = get_audio_duration(wav_file_path)
            rprint(f"[blue]Audio Duration: {duration:.2f} seconds")

        rprint(f"[green]Successfully generated audio file: {wav_file_path}")
        return True

    error_msg = _http_error_details(response)
    rprint(f"[red]Failed to generate audio | HTTP {response.status_code}")
    rprint(f"[red]Text: {text}")
    rprint(f"[red]Error details: {error_msg}")
    raise RuntimeError(f"SiliconFlow Fish TTS request failed | HTTP {response.status_code}, Error details: {error_msg}")


@except_handler("Failed to create custom voice")
def create_custom_voice(audio_path, text, custom_name=None):
    """Upload a reference clip to SiliconFlow and register it as a custom voice.

    Args:
        audio_path: Path of the reference audio file.
        text: Transcript of the reference audio.
        custom_name: Name for the voice; defaults to the first 8 characters
            of a random UUID.

    Returns:
        The ``uri`` field of the API response, used as the voice id.

    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
        ValueError: If the API answers with a non-200 status.
    """
    if not Path(audio_path).exists():
        raise FileNotFoundError(f"Audio file not found at {audio_path}")

    # Read through a context manager so the file handle is always closed.
    with open(audio_path, 'rb') as f:
        audio_base64 = f"data:audio/wav;base64,{base64.b64encode(f.read()).decode('utf-8')}"
    rprint(f"[yellow]✅ Successfully encoded audio file")

    payload = {
        "audio": audio_base64,
        "model": MODEL_NAME,
        "customName": custom_name or str(uuid.uuid4())[:8],
        "text": text
    }

    rprint(f"[yellow]🚀 Sending request to create voice...")
    # load_key is a function: call it. It was previously subscripted, which
    # raised TypeError before any request was sent.
    api_key = load_key("sf_fish_tts")["api_key"]
    headers = {"Authorization": f'Bearer {api_key}', "Content-Type": "application/json"}
    response = requests.post(API_URL_VOICE, json=payload, headers=headers)

    if response.status_code == 200:
        voice_id = response.json().get('uri')
        status_text = Text()
        status_text.append("✨ Successfully created custom voice!\n", style="green")
        status_text.append(f"🎙️ Voice ID: {voice_id}\n", style="green")
        status_text.append(f"⌛ Creation Time: {time.strftime('%Y-%m-%d %H:%M:%S')}", style="green")
        rprint(Panel(status_text, title="Voice Creation Status"))
        return voice_id

    # Error bodies are not guaranteed to be JSON; fall back to the raw text.
    response_json = _http_error_details(response)
    error_text = Text()
    error_text.append("❌ Failed to create custom voice\n", style="red")
    error_text.append(f"⚠️ HTTP Status: {response.status_code}\n", style="red")
    error_text.append(f"💬 Error Details: {response_json}", style="red")
    rprint(Panel(error_text, title="Error", border_style="red"))
    raise ValueError(f"Failed to create custom voice 🚫 HTTP {response.status_code}, Error details: {response_json}")
def get_ref_audio(task_df):
    """Pick leading task rows as a reference clip and merge their audio.

    Rows are scanned in order. Rows whose ``origin`` text is longer than
    ``REFER_MAX_LENGTH`` characters are skipped until the first row that
    fits. Following rows are then appended (texts joined by a single space)
    until adding one would exceed ``REFER_MAX_LENGTH``, or until the summed
    ``duration`` of the selected rows is greater than 10 (the log prints this
    value with an "s" suffix).

    Args:
        task_df: DataFrame with the columns ``origin``, ``duration`` and
            ``number``.

    Returns:
        ``(combined_audio_path, combined_text)``, or ``(None, None)`` when no
        row fits or ``merge_audio`` reports failure.
    """
    rprint(f"[blue]🎯 Starting reference audio selection process...")

    duration = 0
    selected = []
    combined_text = ""
    found_first = False

    for _, row in task_df.iterrows():
        current_text = row['origin']

        # If no valid record has been found yet
        if not found_first:
            if len(current_text) <= REFER_MAX_LENGTH:
                selected.append(row)
                combined_text = current_text
                duration += row['duration']
                found_first = True
                rprint(f"[yellow]📝 Found first valid row: {current_text[:50]}...")
            else:
                rprint(f"[yellow]⏭️ Skipping long row: {current_text[:50]}... ({len(current_text)} chars)")
            # Either way this row is done; the duration cap below is therefore
            # only evaluated from the second selected row onwards.
            continue

        # Check subsequent rows
        new_text = combined_text + " " + current_text
        if len(new_text) > REFER_MAX_LENGTH:
            break

        selected.append(row)
        combined_text = new_text
        duration += row['duration']
        rprint(f"[yellow]📝 Added row: {current_text[:50]}...")

        if duration > 10:
            break

    if not selected:
        rprint(f"[red]❌ No valid segments found (all texts exceed {REFER_MAX_LENGTH} characters)")
        return None, None

    rprint(f"[blue]📊 Selected {len(selected)} segments, total duration: {duration:.2f}s")

    # One reference WAV per selected row, named after the row's task number.
    audio_files = [f"{_AUDIO_REFERS_DIR}/{row['number']}.wav" for row in selected]
    rprint(f"[yellow]🎵 Audio files to merge: {audio_files}")

    combined_audio = f"{_AUDIO_REFERS_DIR}/combined_reference.wav"
    success = merge_audio(audio_files, combined_audio)

    if not success:
        rprint(f"[red]❌ Error: Failed to merge audio files")
        return None, None

    rprint(f"[green]✅ Successfully created combined audio: {combined_audio}")
    rprint(f"[green]📝 Final combined text: {combined_text} | Length: {len(combined_text)}")

    return combined_audio, combined_text


def siliconflow_fish_tts_for_videolingo(text, save_as, number, task_df):
    """Run SiliconFlow Fish TTS in the mode set by the ``sf_fish_tts.mode`` config key.

    - ``preset``: use the configured preset voice.
    - ``custom``: use a custom voice named after the first 8 hex characters
      of the MD5 of the video file path. The voice is created from
      ``get_ref_audio`` when the stored ``sf_fish_tts.custom_name`` differs,
      and the new id and name are written back to the config.
    - ``dynamic``: use ``<refers dir>/<number>.wav`` and that row's
      ``origin`` text as an inline reference.

    Custom and dynamic mode fall back to preset mode when their reference
    material is unavailable.

    Args:
        text: Text to synthesize.
        save_as: Output path handed to ``siliconflow_fish_tts``.
        number: Task number of the current row.
        task_df: DataFrame with the task rows.

    Returns:
        Whatever ``siliconflow_fish_tts`` returns.

    Raises:
        ValueError: If the configured mode is not one of the three above.
    """
    sf_fish_set = load_key("sf_fish_tts")
    MODE = sf_fish_set["mode"]

    if MODE == "preset":
        return siliconflow_fish_tts(text, save_as, mode="preset")
    elif MODE == "custom":
        video_file = find_video_files()
        # Stable per-video voice name derived from the video file path.
        custom_name = hashlib.md5(video_file.encode()).hexdigest()[:8]
        rprint(f"[yellow]Using custom name: {custom_name}")
        log_name = load_key("sf_fish_tts.custom_name")

        # A different stored name means no voice exists yet for this video.
        if log_name != custom_name:
            # Get the merged reference audio and text
            ref_audio, ref_text = get_ref_audio(task_df)
            if ref_audio is None or ref_text is None:
                rprint(f"[red]Failed to get reference audio and text, falling back to preset mode")
                return siliconflow_fish_tts(text, save_as, mode="preset")

            voice_id = create_custom_voice(ref_audio, ref_text, custom_name)
            update_key("sf_fish_tts.voice_id", voice_id)
            update_key("sf_fish_tts.custom_name", custom_name)
        else:
            voice_id = load_key("sf_fish_tts.voice_id")
        return siliconflow_fish_tts(text=text, save_path=save_as, mode="custom", voice_id=voice_id)
    elif MODE == "dynamic":
        ref_audio_path = f"{_AUDIO_REFERS_DIR}/{number}.wav"
        if not Path(ref_audio_path).exists():
            rprint(f"[red]Reference audio not found: {ref_audio_path}, falling back to preset mode")
            return siliconflow_fish_tts(text, save_as, mode="preset")

        ref_text = task_df[task_df['number'] == number]['origin'].iloc[0]
        return siliconflow_fish_tts(text=text, save_path=save_as, mode="dynamic", ref_audio=str(ref_audio_path), ref_text=ref_text)
    else:
        raise ValueError("Invalid mode. Choose 'preset', 'custom', or 'dynamic'")
# Characters removed from text before it is sent to a TTS backend.
_TTS_UNSAFE_CHARS = str.maketrans('', '', '&®™©')


def clean_text_for_tts(text):
    """Remove problematic characters for TTS and strip surrounding whitespace."""
    return text.translate(_TTS_UNSAFE_CHARS).strip()


def tts_main(text, save_as, number, task_df):
    """Generate the audio file for one subtitle line with the configured backend.

    Text that is empty or a single character (after removing punctuation)
    gets a 100 ms silent WAV instead of a TTS call. An existing ``save_as``
    is left untouched. Otherwise the backend named by the ``tts_method``
    config key is tried up to three times; on the last attempt the text is
    first rewritten via ``ask_gpt``. If the last attempt still yields
    zero-length audio, a 100 ms silent WAV is written.

    Args:
        text: Text to synthesize.
        save_as: Path of the WAV file to create.
        number: Task number, forwarded to backends that use reference audio.
        task_df: Task DataFrame, forwarded to those same backends.

    Raises:
        ValueError: If ``tts_method`` names no known backend. Previously no
            branch matched and the retries failed with an unrelated error.
        Exception: If every attempt raised; the last error is chained as the
            cause.
    """
    text = clean_text_for_tts(text)
    # Check if text is empty or single character, single character voiceovers are prone to bugs
    cleaned_text = re.sub(r'[^\w\s]', '', text).strip()
    if len(cleaned_text) <= 1:
        silence = AudioSegment.silent(duration=100)  # 100ms = 0.1s
        silence.export(save_as, format="wav")
        rprint(f"Created silent audio for empty/single-char text: {save_as}")
        return

    # Skip if file exists
    if os.path.exists(save_as):
        return

    print(f"Generating <{text}...>")
    TTS_METHOD = load_key("tts_method")

    # Backends that need only the text and the output path.
    text_only_backends = {
        'openai_tts': openai_tts,
        'fish_tts': fish_tts,
        'azure_tts': azure_tts,
        'edge_tts': edge_tts,
        'custom_tts': custom_tts,
    }
    # Backends that also receive the task number and the task table.
    task_aware_backends = {
        'gpt_sovits': gpt_sovits_tts_for_videolingo,
        'sf_fish_tts': siliconflow_fish_tts_for_videolingo,
        'sf_cosyvoice2': cosyvoice_tts_for_videolingo,
        'f5tts': f5_tts_for_videolingo,
    }
    if TTS_METHOD not in text_only_backends and TTS_METHOD not in task_aware_backends:
        supported = sorted([*text_only_backends, *task_aware_backends])
        raise ValueError(f"Unsupported tts_method: {TTS_METHOD}. Please choose from {supported}")

    max_retries = 3
    for attempt in range(max_retries):
        try:
            # Last attempt: let GPT rewrite the text before trying again.
            if attempt >= max_retries - 1:
                print("Asking GPT to correct text...")
                correct_text = ask_gpt(get_correct_text_prompt(text), resp_type="json", log_title='tts_correct_text')
                text = correct_text['text']

            if TTS_METHOD in text_only_backends:
                text_only_backends[TTS_METHOD](text, save_as)
            else:
                task_aware_backends[TTS_METHOD](text, save_as, number, task_df)

            # Check generated audio duration
            duration = get_audio_duration(save_as)
            if duration > 0:
                break

            # Zero-length output: discard it and retry, or give up with silence.
            if os.path.exists(save_as):
                os.remove(save_as)
            if attempt == max_retries - 1:
                print(f"Warning: Generated audio duration is 0 for text: {text}")
                # Create silent audio file
                silence = AudioSegment.silent(duration=100)  # 100ms silence
                silence.export(save_as, format="wav")
                return
            print(f"Attempt {attempt + 1} failed, retrying...")
        except Exception as e:
            if attempt == max_retries - 1:
                raise Exception(f"Failed to generate audio after {max_retries} attempts: {str(e)}") from e
            print(f"Attempt {attempt + 1} failed, retrying...")
# ------------
# cache gpt response
# ------------

LOCK = Lock()
GPT_LOG_FOLDER = 'output/gpt_log'


def _save_cache(model, prompt, resp_content, resp_type, resp, message=None, log_title="default"):
    """Append one GPT exchange to ``<GPT_LOG_FOLDER>/<log_title>.json``.

    The log file holds a JSON list. It is read, extended by one record and
    rewritten while ``LOCK`` is held.
    """
    record = {
        "model": model,
        "prompt": prompt,
        "resp_content": resp_content,
        "resp_type": resp_type,
        "resp": resp,
        "message": message,
    }
    with LOCK:
        log_path = os.path.join(GPT_LOG_FOLDER, f"{log_title}.json")
        os.makedirs(os.path.dirname(log_path), exist_ok=True)

        history = []
        if os.path.exists(log_path):
            with open(log_path, 'r', encoding='utf-8') as reader:
                history = json.load(reader)
        history.append(record)

        with open(log_path, 'w', encoding='utf-8') as writer:
            json.dump(history, writer, ensure_ascii=False, indent=4)


def _load_cache(prompt, resp_type, log_title):
    """Return the cached ``resp`` of the first matching record, else ``False``.

    A record matches when both its ``prompt`` and its ``resp_type`` equal the
    arguments. ``False`` is also returned when the log file does not exist.
    """
    with LOCK:
        log_path = os.path.join(GPT_LOG_FOLDER, f"{log_title}.json")
        if not os.path.exists(log_path):
            return False
        with open(log_path, 'r', encoding='utf-8') as reader:
            history = json.load(reader)
        return next(
            (
                entry["resp"]
                for entry in history
                if entry["prompt"] == prompt and entry["resp_type"] == resp_type
            ),
            False,
        )
def load_key(key):
    """Read a value from the YAML config by dotted path (e.g. ``"api.key"``).

    The config file is re-read on every call while ``lock`` is held.

    Raises:
        KeyError: If any path segment is missing or its parent is not a dict.
    """
    with lock, open(CONFIG_PATH, 'r', encoding='utf-8') as cfg:
        node = yaml.load(cfg)

    for part in key.split('.'):
        if not (isinstance(node, dict) and part in node):
            raise KeyError(f"Key '{part}' not found in configuration")
        node = node[part]
    return node


def update_key(key, new_value):
    """Overwrite an existing config value addressed by dotted path.

    The read-modify-write cycle runs entirely under ``lock``.

    Returns:
        True after the file has been rewritten; False (without writing) when
        an intermediate path segment is missing.

    Raises:
        KeyError: If the final path segment does not already exist.
    """
    with lock:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as cfg:
            data = yaml.load(cfg)

        *parents, leaf = key.split('.')
        node = data
        for part in parents:
            if not (isinstance(node, dict) and part in node):
                return False
            node = node[part]

        if not (isinstance(node, dict) and leaf in node):
            raise KeyError(f"Key '{leaf}' not found in configuration")

        node[leaf] = new_value
        with open(CONFIG_PATH, 'w', encoding='utf-8') as cfg:
            yaml.dump(data, cfg)
        return True
# ------------------------------
# retry decorator
# ------------------------------

def except_handler(error_msg, retry=0, delay=1, default_return=None):
    """Build a decorator that retries a function with exponential backoff.

    The wrapped function is called up to ``retry + 1`` times. After failure
    number ``i`` (0-based) the wrapper sleeps ``delay * 2**i`` seconds before
    the next call.

    Args:
        error_msg: Prefix of the message printed after every failure.
        retry: Number of additional attempts after the first one.
        delay: Base delay in seconds for the backoff.
        default_return: Value returned instead of raising once all attempts
            failed. ``None`` means "re-raise the last exception".

    Returns:
        A decorator; the decorated function returns the wrapped function's
        result, or ``default_return`` as described above.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            total_attempts = retry + 1
            for i in range(total_attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    # Report "attempt k of n". The old "retry: i+1/retry"
                    # counter overshot (e.g. "4/3", or "1/0" without retries).
                    rprint(f"[red]{error_msg}: {e}, attempt: {i + 1}/{total_attempts}[/red]")
                    if i == retry:
                        if default_return is not None:
                            return default_return
                        # Bare raise re-raises the active exception as-is.
                        raise
                    time.sleep(delay * (2**i))
        return wrapper
    return decorator


# ------------------------------
# check file exists decorator
# ------------------------------

def check_file_exists(file_path):
    """Build a decorator that skips the function when ``file_path`` exists.

    The check happens on every call. When the file exists a notice is printed
    and ``None`` is returned without calling the function.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if os.path.exists(file_path):
                rprint(f"[yellow]⚠️ File <{file_path}> already exists, skip <{func.__name__}> step.[/yellow]")
                return
            return func(*args, **kwargs)
        return wrapper
    return decorator
# ------------------------------------------
# Intermediate output files
# (the numeric prefix appears to match the core/_N_*.py step that produces
#  the file - presumably; confirm against those modules)
# ------------------------------------------

_2_CLEANED_CHUNKS = "output/log/cleaned_chunks.xlsx"
_3_1_SPLIT_BY_NLP = "output/log/split_by_nlp.txt"
_3_2_SPLIT_BY_MEANING = "output/log/split_by_meaning.txt"
_4_1_TERMINOLOGY = "output/log/terminology.json"
_4_2_TRANSLATION = "output/log/translation_results.xlsx"
_5_SPLIT_SUB = "output/log/translation_results_for_subtitles.xlsx"
_5_REMERGED = "output/log/translation_results_remerged.xlsx"
_8_1_AUDIO_TASK = "output/audio/tts_tasks.xlsx"

# ------------------------------------------
# Audio files and directories
# ------------------------------------------

_OUTPUT_DIR = "output"
_AUDIO_DIR = "output/audio"
_RAW_AUDIO_FILE = "output/audio/raw.mp3"
_VOCAL_AUDIO_FILE = "output/audio/vocal.mp3"
_BACKGROUND_AUDIO_FILE = "output/audio/background.mp3"
_AUDIO_REFERS_DIR = "output/audio/refers"
_AUDIO_SEGS_DIR = "output/audio/segs"
_AUDIO_TMP_DIR = "output/audio/tmp"

# ------------------------------------------
# Exports
# The names start with an underscore, so they must be listed here for
# `from core.utils.models import *` to pick them up.
# ------------------------------------------

__all__ = [
    "_2_CLEANED_CHUNKS",
    "_3_1_SPLIT_BY_NLP",
    "_3_2_SPLIT_BY_MEANING",
    "_4_1_TERMINOLOGY",
    "_4_2_TRANSLATION",
    "_5_SPLIT_SUB",
    "_5_REMERGED",
    "_8_1_AUDIO_TASK",
    "_OUTPUT_DIR",
    "_AUDIO_DIR",
    "_RAW_AUDIO_FILE",
    "_VOCAL_AUDIO_FILE",
    "_BACKGROUND_AUDIO_FILE",
    "_AUDIO_REFERS_DIR",
    "_AUDIO_SEGS_DIR",
    "_AUDIO_TMP_DIR"
]
# Characters that are not allowed in file names, each mapped to "_".
_INVALID_FILENAME_CHARS = str.maketrans({char: '_' for char in '<>:"/\\|?*'})


def cleanup(history_dir="history"):
    """Archive everything under ``output/`` into ``<history_dir>/<video name>/``.

    The video name is the sanitized base name (without extension) of the
    path returned by ``find_video_files()``. Entries of ``output/log`` and
    ``output/gpt_log`` are moved into sub-folders of the same name; every
    other entry of ``output/`` is moved to the top of the history folder.
    Output directories that end up empty are removed.

    Args:
        history_dir: Root folder that receives the per-video archive.
    """
    # Get video file name. basename() works for a bare file name, nested
    # directories and native Windows separators; split("/")[1] worked only
    # for exactly "<dir>/<file>".
    video_file = find_video_files()
    video_name = os.path.basename(video_file)
    video_name = os.path.splitext(video_name)[0]
    video_name = sanitize_filename(video_name)

    # Create required folders
    os.makedirs(history_dir, exist_ok=True)
    video_history_dir = os.path.join(history_dir, video_name)
    log_dir = os.path.join(video_history_dir, "log")
    gpt_log_dir = os.path.join(video_history_dir, "gpt_log")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(gpt_log_dir, exist_ok=True)

    # Move non-log entries. Only the two log directories are handled
    # separately; compare the entry name exactly so that files that merely
    # end in "log" (e.g. "run.log") are archived too.
    for file in glob.glob("output/*"):
        if os.path.basename(file) not in ('log', 'gpt_log'):
            move_file(file, video_history_dir)

    # Move log files
    for file in glob.glob("output/log/*"):
        move_file(file, log_dir)

    # Move gpt_log files
    for file in glob.glob("output/gpt_log/*"):
        move_file(file, gpt_log_dir)

    # Delete empty output directories. Each removal is attempted on its own
    # so that one missing or non-empty directory does not stop the others.
    for directory in ("output/log", "output/gpt_log", "output"):
        try:
            os.rmdir(directory)
        except OSError:
            pass  # Ignore errors when deleting directories


def move_file(src, dst):
    """Move ``src`` into the directory ``dst``, replacing an existing target.

    The target name is the sanitized base name of ``src``. Failures are
    printed, never raised. On ``PermissionError`` a copy-then-delete fallback
    is attempted.

    Args:
        src: File or directory to move.
        dst: Destination directory.
    """
    try:
        # Get the source file name
        src_filename = os.path.basename(src)

        # Use os.path.join to ensure correct path and include file name
        dst = os.path.join(dst, sanitize_filename(src_filename))

        if os.path.exists(dst):
            if os.path.isdir(dst):
                # If destination is a folder, try to delete its contents
                shutil.rmtree(dst, ignore_errors=True)
            else:
                # If destination is a file, try to delete it
                os.remove(dst)

        shutil.move(src, dst, copy_function=shutil.copy2)
        print(f"✅ Moved: {src} -> {dst}")
    except PermissionError:
        print(f"⚠️ Permission error: Cannot delete {dst}, attempting to overwrite")
        try:
            shutil.copy2(src, dst)
            os.remove(src)
            print(f"✅ Copied and deleted source file: {src} -> {dst}")
        except Exception as e:
            print(f"❌ Move failed: {src} -> {dst}")
            print(f"Error message: {str(e)}")
    except Exception as e:
        print(f"❌ Move failed: {src} -> {dst}")
        print(f"Error message: {str(e)}")


def sanitize_filename(filename):
    """Replace each character that is invalid in file names with ``_``."""
    return filename.translate(_INVALID_FILENAME_CHARS)
progress.add_task("", total=None) # Indeterminate spinner with concurrent.futures.ThreadPoolExecutor(max_workers=get_optimal_thread_count()) as executor: future_to_mirror = {executor.submit(test_mirror_speed, name, url): name for name, url in MIRRORS.items()} for future in concurrent.futures.as_completed(future_to_mirror): name = future_to_mirror[future] try: name, speed = future.result() if speed != float('inf'): speeds[name] = speed except Exception as exc: print(f'{name} generated an exception: {exc}') # Results display table = Table(show_header=False) table.add_column(style="cyan") table.add_column(justify="right", style="magenta") for name, speed in sorted(speeds.items(), key=lambda x: x[1]): table.add_row(name, f"{speed:.0f}ms") console.print(table) if speeds: fastest_mirror = min(speeds, key=speeds.get) fastest_url = MIRRORS[fastest_mirror] if set_pip_mirror(fastest_url): current_mirror = get_current_pip_mirror() if current_mirror == fastest_url: console.print(f"✅ Switched to {fastest_mirror}\n🔗 {fastest_url}", style="green") else: console.print(f"❌ Switch failed\nExpected: {fastest_url}\nCurrent: {current_mirror}\n💡 Try running with admin privileges", style="red") else: console.print(f"❌ Failed to switch mirror\n💡 Check permissions and try again", style="red") else: console.print("❌ All mirrors unreachable\n💡 Check network connection", style="red") if __name__ == "__main__": main() ================================================ FILE: docs/.gitignore ================================================ .DS_Store .next/ node_modules/ *.log dist/ .turbo/ out/ # Theme styles packages/nextra-theme-*/style.css # Stork related */**/public/*.st */**/public/*.toml .vercel .idea/ .eslintcache .env tsup.config.bundled* tsconfig.tsbuildinfo ================================================ FILE: docs/components/landing/comments.tsx ================================================ import { Card, CardContent } from '@/components/ui/card' type Comment = { content: string 
author: string title: string } type Props = { items: Comment[] title: string } export default function Comments({ items, title }: Props) { return (

{title}

{items && items.map((comment, index) => (

"{comment.content}"

{comment.author}, {comment.title}

))}
) } ================================================ FILE: docs/components/landing/faq.tsx ================================================ import { Accordion, AccordionContent, AccordionItem, AccordionTrigger } from '@/components/ui/accordion' export interface FAQItem { question: string answer: string } interface FAQProps { items: FAQItem[] title: string } export default function FAQ({ items, title }: FAQProps) { return (

{title}

{items && items.map((item, index) => ( {item.question} {item.answer} ))}
) } ================================================ FILE: docs/components/landing/features.tsx ================================================ import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card' import { CheckCircle, ArrowRight } from 'lucide-react' // 定义Feature类型 type Feature = { title: string description: string icon: 'CheckCircle' | 'ArrowRight' } // 定义组件props类型 type FeaturesProps = { items: Feature[] title: string } // 修改组件为接受props的形式 export default function Features({ items, title }: FeaturesProps) { // 创建图标映射 const iconMap = { CheckCircle: , ArrowRight: , } return (

{title}

{items && items.map((feature, index) => (
{iconMap[feature.icon]} {feature.title}

{feature.description}

))}
) } ================================================ FILE: docs/components/landing/github-stats.tsx ================================================ import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip' import { Button } from '@/components/ui/button' import Link from 'next/link' export default function GitHubStats({ stars, recentStargazers }) { return (

GitHub

{stars} Stars
{recentStargazers && recentStargazers.length > 0 ? recentStargazers.map((user, index) => ( {user.login}

{user.login}

)) : ''}
) } ================================================ FILE: docs/components/landing/hero.tsx ================================================ import Link from 'next/link' import { Button } from '@/components/ui/button' import HeroVideoDialog from "@/components/ui/hero-video-dialog"; import Script from 'next/script' interface HeroProps { title: string description: string videoSrc: string } export default function Hero({ title, description, videoSrc }: HeroProps) { return (

{title}

{description}

{/* 视频演示组件 */}
) } ================================================ FILE: docs/pages/_meta.en-US.json ================================================ { "index": { "type": "page", "title": "VideoLingo", "display": "hidden", "theme": { "layout": "raw" } }, "docs": { "type": "page", "title": "Documents" } } ================================================ FILE: docs/pages/_meta.ja.json ================================================ { "index": { "type": "page", "title": "VideoLingo", "display": "hidden", "theme": { "layout": "raw" } }, "docs": { "type": "page", "title": "ドキュメント" } } ================================================ FILE: docs/pages/_meta.zh-CN.json ================================================ { "index": { "type": "page", "title": "VideoLingo", "display": "hidden", "theme": { "layout": "raw" } }, "docs": { "type": "page", "title": "文档" } } ================================================ FILE: docs/pages/docs/_meta.en-US.json ================================================ { "introduction": { "title": "Introduction" }, "start": { "title": "Start" }, "tech": { "title": "Tech" }, "docker": { "title": "Docker Installation" } } ================================================ FILE: docs/pages/docs/_meta.ja.json ================================================ { "introduction": { "title": "紹介" }, "start": { "title": "スタート" }, "tech": { "title": "技術" } } ================================================ FILE: docs/pages/docs/_meta.zh-CN.json ================================================ { "introduction": { "title": "介绍" }, "start": { "title": "使用文档" }, "tech": { "title": "技术文档" }, "docker": { "title": "Docker 文档" } } ================================================ FILE: docs/pages/docs/docker.en-US.md ================================================ # Docker Installation VideoLingo provides a Dockerfile that you can use to build the current VideoLingo package. 
Here are detailed instructions for building and running: ## System Requirements - CUDA version > 12.4 - NVIDIA Driver version > 550 ## Building and Running the Docker Image or Pulling from DockerHub ```bash # Build the Docker image docker build -t videolingo . # Run the Docker container docker run -d -p 8501:8501 --gpus all videolingo ``` ### Pulling from DockerHub You can directly pull the pre-built VideoLingo image from DockerHub: ```bash docker pull rqlove/videolingo:latest ``` After pulling, use the following command to run the container: ```bash docker run -d -p 8501:8501 --gpus all rqlove/videolingo:latest ``` Note: - The `-d` parameter runs the container in the background - `-p 8501:8501` maps port 8501 of the container to port 8501 of the host - `--gpus all` enables support for all available GPUs - Make sure to use the full image name `rqlove/videolingo:latest` ## Models The Whisper model is not included in the image and will be automatically downloaded when the container is first run. If you want to skip the automatic download process, you can download the model weights from [here](https://drive.google.com/file/d/10gPu6qqv92WbmIMo1iJCqQxhbd1ctyVw/view?usp=drive_link) or [Baidu Netdisk](https://pan.baidu.com/s/1hZjqSGVn3z_WSg41-6hCqA?pwd=2kgs) (Passcode: 2kgs). After downloading, use the following command to run the container, mounting the model file into the container: ```bash docker run -d -p 8501:8501 --gpus all -v /path/to/your/model:/app/_model_cache rqlove/videolingo:latest ``` Please replace `/path/to/your/model` with the actual local path where you downloaded the model file. ## Additional Information - Base image: nvidia/cuda:12.4.1-devel-ubuntu20.04 - Python version: 3.10 - Pre-installed software: git, curl, sudo, ffmpeg, fonts-noto, etc. 
- PyTorch version: 2.8.0 (CUDA 12.x build target) > ⚠️ The install script uses `nvidia-smi` to detect the driver's CUDA version and selects the best wheel (cu129 for RTX 50 series / Blackwell GPUs, cu128 or cu126 for older GPUs). We use cu12x wheels instead of cu130/cu131 because ctranslate2 (a core whisperX dependency) is only compiled for CUDA 12 and requires `cublas64_12.dll`, which only cu12x wheels ship. NVIDIA drivers are backward-compatible, so a CUDA 13.x host runs cu12x wheels perfectly. - Exposed port: 8501 (Streamlit application) For more detailed information, please refer to the Dockerfile. ## Future Plans - Continue to improve the Dockerfile to reduce image size - Push the Docker image to Docker Hub - Support mounting required models to the host machine using the -v parameter ================================================ FILE: docs/pages/docs/docker.zh-CN.md ================================================ # Docker安装 VideoLingo 提供了Dockerfile,可自行使用Dockerfile打包目前VideoLingo。以下是打包和运行的详细说明: ## 系统要求 - CUDA版本 > 12.4 - NVIDIA Driver版本 > 550 ## 构建和运行Docker镜像或者从DockerHub拉取 ```bash # 构建Docker镜像 docker build -t videolingo . 
# 运行Docker容器 docker run -d -p 8501:8501 --gpus all videolingo ``` ### 从DockerHub拉取 您可以直接从DockerHub拉取预构建的VideoLingo镜像: ```bash docker pull rqlove/videolingo:latest ``` 拉取完成后,使用以下命令运行容器: ```bash docker run -d -p 8501:8501 --gpus all rqlove/videolingo:latest ``` 注意: - `-d` 参数使容器在后台运行 - `-p 8501:8501` 将容器的8501端口映射到主机的8501端口 - `--gpus all` 启用所有可用的GPU支持 - 确保使用完整的镜像名称 `rqlove/videolingo:latest` ## 模型 whisper 模型不包含在镜像中,会在容器首次运行时自动下载。如果您希望跳过自动下载过程,可以从以下链接下载模型权重: - [Google Drive链接](https://drive.google.com/file/d/10gPu6qqv92WbmIMo1iJCqQxhbd1ctyVw/view?usp=drive_link) - [百度网盘链接](https://pan.baidu.com/s/1hZjqSGVn3z_WSg41-6hCqA?pwd=2kgs) 下载后,使用以下命令运行容器,将模型文件挂载到容器中: ```bash docker run -d -p 8501:8501 --gpus all -v /path/to/your/model:/app/_model_cache rqlove/videolingo:latest ``` 请注意将 `/path/to/your/model` 替换为您实际下载模型文件的本地路径。 ## 其他说明 - 基础镜像: nvidia/cuda:12.4.1-devel-ubuntu20.04 - Python版本: 3.10 - 预装软件: git, curl, sudo, ffmpeg, fonts-noto等 - PyTorch版本: 2.8.0 (CUDA 12.x 编译目标) > ⚠️ 安装脚本通过 `nvidia-smi` 检测驱动的 CUDA 版本,自动选择最佳轮子(RTX 50 系列 / Blackwell 显卡使用 cu129,旧显卡使用 cu128 或 cu126)。使用 cu12x 而非 cu130/cu131 的原因是 ctranslate2(whisperX 的核心依赖)仅为 CUDA 12 编译,需要 `cublas64_12.dll`,只有 cu12x 轮子才包含此文件。NVIDIA 驱动向后兼容,因此 CUDA 13.x 宿主机可以正常运行 cu12x 轮子。 - 暴露端口: 8501 (Streamlit应用) 如需更多详细信息,请参考Dockerfile。 ================================================ FILE: docs/pages/docs/introduction.en-US.md ================================================ # VideoLingo: Connecting the World, Frame by Frame ## 🌟 Overview ([Try VideoLingo Now!](https://videolingo.io)) VideoLingo is an all-in-one video translation, localization, and dubbing tool aimed at generating Netflix-quality subtitles. It eliminates stiff machine translations and multi-line subtitles while adding high-quality dubbing, enabling global knowledge sharing across language barriers. 
Key features: - 🎥 YouTube video download via yt-dlp - **🎙️ Word-level subtitle recognition with WhisperX** - **📝 NLP and GPT-based subtitle segmentation** - **📚 GPT-generated terminology for coherent translation** - **🔄 3-step direct translation, reflection, and adaptation for professional-level quality** - **✅ Netflix-standard single-line subtitles only** - **🗣️ Dubbing alignment with GPT-SoVITS and other methods** - 🚀 One-click startup and output in Streamlit - 📝 Detailed logging with progress resumption Difference from similar projects: **Single-line subtitles only, superior translation quality, seamless dubbing experience** ## 🎥 Demo
### Russian Translation --- https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7 ### GPT-SoVITS Dubbing --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Language Support **Input Language Support(more to come):** 🇺🇸 English 🤩 | 🇷🇺 Russian 😊 | 🇫🇷 French 🤩 | 🇩🇪 German 🤩 | 🇮🇹 Italian 🤩 | 🇪🇸 Spanish 🤩 | 🇯🇵 Japanese 😐 | 🇨🇳 Chinese* 😊 > *Chinese uses a separate punctuation-enhanced whisper model, for now... **Translation supports all languages, while dubbing language depends on the chosen TTS method.** ## Installation > **Note:** To use NVIDIA GPU acceleration on Windows, please complete the following steps first: > 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) or newer (12.8 / 13.x all work — the install script auto-adapts) > 2. Install [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to your system PATH > 4. Restart your computer > **Note:** For Windows and macOS users, it's recommended to install FFmpeg via package managers (Chocolatey/Homebrew): > ```choco install ffmpeg``` (Windows) or ```brew install ffmpeg``` (macOS). > ⚠️ Do NOT use conda-forge ffmpeg (lacks libmp3lame encoder). Use the system package manager to install a full build. 1. Clone the repository ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Install dependencies(requires `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. Start the application ```bash streamlit run st.py ``` ### Docker Alternatively, you can use Docker (requires CUDA 12.4 and NVIDIA Driver version >550), see [Docker docs](/docs/pages/docs/docker.en-US.md): ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## API The project supports OpenAI-Like API format and various dubbing interfaces: - `claude-sonnet-4-6`, `gpt-5.2`, `gemini-3-flash`, `deepseek-v3`, `minimax-m2.5`, `kimi-k2.5`, ... 
(sorted by performance) - `azure-tts`, `openai-tts`, `siliconflow-fishtts`, `fish-tts`, `GPT-SoVITS` For detailed installation, API configuration, and batch mode instructions, please refer to the documentation: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## Current Limitations 1. WhisperX transcription performance may be affected by video background noise, as it uses a wav2vec model for alignment. For videos with loud background music, please enable Voice Separation Enhancement. Additionally, subtitles ending with numbers or special characters may be truncated early due to wav2vec's inability to map numeric characters (e.g., "1") to their spoken form ("one"). 2. Using weaker models can lead to errors during intermediate processes due to strict JSON format requirements for responses. If this error occurs, please delete the `output` folder and retry with a different LLM, otherwise repeated execution will read the previous erroneous response causing the same error. 3. The dubbing feature may not be 100% perfect due to differences in speech rates and intonation between languages, as well as the impact of the translation step. However, this project has implemented extensive engineering processing for speech rates to ensure the best possible dubbing results. 4. **Multilingual video transcription recognition will only retain the main language**. This is because whisperX uses a specialized model for a single language when forcibly aligning word-level subtitles, and will delete unrecognized languages. 5. **Cannot dub multiple characters separately**, as whisperX's speaker distinction capability is not sufficiently reliable. ## 📄 License This project is licensed under the Apache 2.0 License. 
Special thanks to the following open source projects for their contributions: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 Contact Us - Join our Discord: https://discord.gg/9F2G92CWPp - Submit [Issues](https://github.com/Huanshere/VideoLingo/issues) or [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) on GitHub - Follow me on Twitter: [@Huanshere](https://twitter.com/Huanshere) - Email me at: team@videolingo.io ## ⭐ Star History [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

If you find VideoLingo helpful, please give us a ⭐️!

================================================ FILE: docs/pages/docs/introduction.zh-CN.md ================================================ # VideoLingo: 连接世界的每一帧 **QQ 群:875297969** ## 🌟 简介([在线体验!](https://videolingo.io)) VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的克隆配音,让全世界的知识能够跨越语言的障碍共享。 主要特点和功能: - 🎥 使用 yt-dlp 从 Youtube 链接下载视频 - **🎙️ 使用 WhisperX 进行单词级时间轴字幕识别** - **📝 使用 NLP 和 GPT 根据句意进行字幕分割** - **📚 GPT 总结提取术语知识库,上下文连贯翻译** - **🔄 三步直译、反思、意译,媲美字幕组精翻效果** - **✅ 按照 Netflix 标准检查单行长度,绝无双行字幕** - **🗣️ 使用 GPT-SoVITS 等方法对齐克隆配音** - 🚀 整合包一键启动,在 streamlit 中一键出片 - 📝 详细记录每步操作日志,支持随时中断和恢复进度 与同类项目相比的优势:**绝无多行字幕,最佳的翻译质量,无缝的配音体验** ## 🎥 效果演示
### 俄语翻译 --- https://github.com/user-attachments/assets/25264b5b-6931-4d39-948c-5a1e4ce42fa7 ### GPT-SoVITS配音 --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### 语言支持 **输入语言支持:** 🇺🇸 英语 🤩 | 🇷🇺 俄语 😊 | 🇫🇷 法语 🤩 | 🇩🇪 德语 🤩 | 🇮🇹 意大利语 🤩 | 🇪🇸 西班牙语 🤩 | 🇯🇵 日语 😐 | 🇨🇳 中文* 😊 > *中文使用单独的标点增强后的 whisper 模型 **翻译语言支持所有语言,配音语言取决于选取的TTS。** ## 安装 > **注意:** 在 Windows 上使用 NVIDIA GPU 加速需要先完成以下步骤: > 1. 安装 [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) 或更高版本(12.8 / 13.x 均可,安装脚本会自动适配) > 2. 安装 [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. 将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加到系统环境变量 PATH 中 > 4. 重启电脑 > **注意:** Windows 和 macOS 用户建议通过包管理器(Chocolatey/Homebrew)安装 FFmpeg: > ```choco install ffmpeg```(Windows)或 ```brew install ffmpeg```(macOS)。 > ⚠️ 不要使用 conda-forge 的 ffmpeg(缺少 libmp3lame 编码器),建议用系统包管理器安装完整版。 1. 克隆仓库 ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. 安装依赖(需要 `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. 启动应用 ```bash streamlit run st.py ``` ### Docker 还可以选择使用 Docker(要求 CUDA 12.4 和 NVIDIA Driver 版本 >550),详见[Docker文档](/docs/pages/docs/docker.zh-CN.md): ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## API 本项目支持 OpenAI-Like 格式的 api 和多种配音接口: - `claude-sonnet-4-6`, `gpt-5.2`, `gemini-3-flash`, `deepseek-v3`, `minimax-m2.5`, `kimi-k2.5`, ...(按效果排序) - `azure-tts`, `openai-tts`, `siliconflow-fishtts`, `fish-tts`, `GPT-SoVITS` 详细的安装、 API 配置、汉化、批量说明可以参见文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) ## 当前限制 1. WhisperX 转录效果可能受到视频背景声影响,因为使用了 wav2vec 模型进行对齐。对于背景音乐较大的视频,请开启人声分离增强。另外,如果字幕以数字或特殊符号结尾,可能会导致提前截断,这是因为 wav2vec 无法将数字字符(如"1")映射到其发音形式("one")。 2. 使用较弱模型时容易在中间过程报错,这是因为对响应的 json 格式要求较为严格。如果出现此错误,请删除 `output` 文件夹后更换 llm 重试,否则重复执行会读取上次错误的响应导致同样错误。 3. 配音功能由于不同语言的语速和语调差异,还受到翻译步骤的影响,可能不能 100% 完美,但本项目做了非常多的语速上的工程处理,尽可能保证配音效果。 4. 
**多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。 5. **无法多角色分别配音**,whisperX 的说话人区分效果不够好用。 ## 📄 许可证 本项目采用 Apache 2.0 许可证,衷心感谢以下开源项目的贡献: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 联系我们 - 加入我们的 QQ 群寻求解答:875297969 - 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) - 关注我的 Twitter:[@Huanshere](https://twitter.com/Huanshere) - 联系邮箱:team@videolingo.io ## ⭐ Star History [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ================================================ FILE: docs/pages/docs/start.en-US.md ================================================ # 🚀 Getting Started ## 📋 API Configuration VideoLingo requires an LLM and TTS(optional). For the best quality, use claude-3-5-sonnet-20240620 with Azure TTS. Alternatively, for a fully local setup with no API key needed, use Ollama for the LLM and Edge TTS for dubbing. In this case, set `max_workers` to 1 and `summary_length` to a low value like 2000 in `config.yaml`. ### 1. 
**Get API_KEY for LLM**: | Recommended Model | Vendor | Quality | Cost-efficiency | |:-----|:---------|:-----|:---------| | claude-sonnet-4-6 | [Anthropic](https://www.anthropic.com) | 🤩 | ⭐⭐⭐ | | claude-opus-4-6 | [Anthropic](https://www.anthropic.com) | 🏆 | ⭐⭐ | | gpt-5.2 | [OpenAI](https://openai.com) | 🤩 | ⭐⭐⭐ | | gemini-3-flash | [Google](https://ai.google.dev) | 😃 | ⭐⭐⭐⭐⭐ | | gemini-3.1-pro | [Google](https://ai.google.dev) | 🤩 | ⭐⭐⭐ | | minimax-m2.5 | [MiniMax](https://www.minimax.io) | 😃 | ⭐⭐⭐⭐⭐ | | kimi-k2.5 | [Moonshot AI](https://www.moonshot.cn) | 😃 | ⭐⭐⭐⭐ | | deepseek-v3 | [DeepSeek](https://www.deepseek.com) | 🥳 | ⭐⭐⭐⭐ | | qwen3-32b | [Ollama](https://ollama.ai) self-hosted | 😃 | ♾️ Free | > **Tip:** Model pricing changes frequently. Check each vendor's website for current rates. [models.dev](https://models.dev) offers cross-vendor price and capability comparison. > > **API proxy:** If you cannot access overseas APIs directly, [OpenRouter](https://openrouter.ai) is recommended (supports all models above, unified OpenAI-format API, pay-per-use with no monthly fee). Note: Supports OpenAI format, you can try different models at your risk. However, the process involves multi-step reasoning chains and complex JSON formats, **not recommended to use models smaller than 30B**. ### 2. **TTS API** VideoLingo provides multiple TTS integration methods. 
Here's a comparison (skip if only using translation without dubbing) | TTS Solution | Provider | Pros | Cons | Chinese Effect | Non-Chinese Effect | |:---------|:---------|:-----|:-----|:---------|:-----------| | 🔊 Azure TTS ⭐ | [302AI](https://gpt302.saaslink.net/C2oHR9) | Natural effect | Limited emotions | 🤩 | 😃 | | 🎙️ OpenAI TTS | [302AI](https://gpt302.saaslink.net/C2oHR9) | Realistic emotions | Chinese sounds foreign | 😕 | 🤩 | | 🎤 Fish TTS | [302AI](https://gpt302.saaslink.net/C2oHR9) | Authentic native | Limited official models | 🤩 | 😂 | | 🎙️ SiliconFlow FishTTS | [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE) | Voice Clone | Unstable cloning effect | 😃 | 😃 | | 🗣 Edge TTS | Local | Completely free | Average effect | 😐 | 😐 | | 🗣️ GPT-SoVITS | Local | Best voice cloning | Only supports Chinese/English, requires local inference, complex setup | 🏆 | 🚫 | - For SiliconFlow FishTTS, get a key from [SiliconFlow](https://cloud.siliconflow.cn/i/ttKDEsxE); note that the cloning feature requires paid credits; - For OpenAI TTS, Azure TTS, and Fish TTS, use [302AI](https://gpt302.saaslink.net/C2oHR9) - one API key provides access to all three services > Want to use your own TTS? Implement it in `core/tts_backend/custom_tts.py`!
SiliconFlow FishTTS Tutorial Currently supports 3 modes: 1. `preset`: Uses fixed voice, can preview on [Official Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608), default is `anna`. 2. `clone(stable)`: Corresponds to fishtts api's `custom`, uses voice from uploaded audio, automatically samples first 10 seconds of video for voice, better voice consistency. 3. `clone(dynamic)`: Corresponds to fishtts api's `dynamic`, uses each sentence as reference audio during TTS, may have inconsistent voice but better effect.
How to choose OpenAI voices? Voice list can be found on the [official website](https://platform.openai.com/docs/guides/text-to-speech/voice-options), such as `alloy`, `echo`, `nova`, etc. Modify `openai_tts.voice` in `config.yaml`.
How to choose Azure voices? Recommended to try voices in the [online demo](https://speech.microsoft.com/portal/voicegallery). You can find the voice code in the code on the right, e.g. `zh-CN-XiaoxiaoMultilingualNeural`
How to choose Fish TTS voices? Go to the [official website](https://fish.audio/en/) to listen and choose voices. Find the voice code in the URL, e.g. Dingzhen is `54a5170264694bfc8e9ad98df7bd89c3`. Popular voices are already added in `config.yaml`. To use other voices, modify the `fish_tts.character_id_dict` dictionary in `config.yaml`.
GPT-SoVITS-v2 Tutorial 1. Check requirements and download the package from [official Yuque docs](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO). 2. Place `GPT-SoVITS-v2-xxx` and `VideoLingo` in the same directory. **Note they should be parallel folders.** 3. Choose one of the following ways to configure the model: a. Self-trained model: - After training, `tts_infer.yaml` under `GPT-SoVITS-v2-xxx\GPT_SoVITS\configs` will have your model path auto-filled. Copy and rename it to `your_preferred_english_character_name.yaml` - In the same directory as the `yaml` file, place reference audio named `your_preferred_english_character_name_reference_audio_text.wav` or `.mp3`, e.g. `Huanyuv2_Hello, this is a test audio.wav` - In VideoLingo's sidebar, set `GPT-SoVITS Character` to `your_preferred_english_character_name`. b. Use pre-trained model: - Download my model from [here](https://vip.123pan.cn/1817874751/8137723), extract and overwrite to `GPT-SoVITS-v2-xxx`. - Set `GPT-SoVITS Character` to `Huanyuv2`. c. Use other trained models: - Place `xxx.ckpt` in `GPT_weights_v2` folder and `xxx.pth` in `SoVITS_weights_v2` folder. - Following method a, rename `tts_infer.yaml` and modify `t2s_weights_path` and `vits_weights_path` under `custom` to point to your models, e.g.: ```yaml # Example config for method b: t2s_weights_path: GPT_weights_v2/Huanyu_v2-e10.ckpt version: v2 vits_weights_path: SoVITS_weights_v2/Huanyu_v2_e10_s150.pth ``` - Following method a, place reference audio in the same directory as the `yaml` file, named `your_preferred_english_character_name_reference_audio_text.wav` or `.mp3`, e.g. `Huanyuv2_Hello, this is a test audio.wav`. The program will auto-detect and use it. - ⚠️ Warning: **Please use English for `character_name`** to avoid errors. `reference_audio_text` can be in Chinese. Currently in beta, may produce errors. ``` # Expected directory structure: . ├── VideoLingo │ └── ... 
└── GPT-SoVITS-v2-xxx ├── GPT_SoVITS │ └── configs │ ├── tts_infer.yaml │ ├── your_preferred_english_character_name.yaml │ └── your_preferred_english_character_name_reference_audio_text.wav ├── GPT_weights_v2 │ └── [your GPT model file] └── SoVITS_weights_v2 └── [your SoVITS model file] ``` After configuration, select `Reference Audio Mode` in the sidebar (see Yuque docs for details). During dubbing, VideoLingo will automatically open GPT-SoVITS inference API port in the command line, which can be closed manually after completion. Note that stability depends on the base model chosen.
## 🛠️ Quick Start VideoLingo supports Windows, macOS and Linux systems, and can run on CPU or GPU. > **Note:** To use NVIDIA GPU acceleration on Windows, please complete the following steps first: > 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) or newer (12.8 / 12.9 / 13.x all work — the install script auto-adapts) > 2. Install [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to your system PATH > 4. Restart your computer > > ⚠️ **Pitfall:** The install script uses `nvidia-smi` to detect your driver's CUDA version and auto-selects the best PyTorch wheel (cu129 / cu128 / cu126). For RTX 50 series (Blackwell) GPUs, cu129 wheels with sm_100 kernels are selected automatically. **Do NOT manually install cu130/cu131 PyTorch** — this causes ctranslate2 to fail with `cublas64_12.dll not found`. > **Note:** FFmpeg is required. Please install it via package managers: > - Windows: ```choco install ffmpeg``` (via [Chocolatey](https://chocolatey.org/)) > - macOS: ```brew install ffmpeg``` (via [Homebrew](https://brew.sh/)) > - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu) or ```sudo dnf install ffmpeg``` (Fedora) > > ⚠️ **Pitfall:** Do NOT use conda-forge ffmpeg (it lacks the libmp3lame encoder). Use the system package manager to install a full build. Before installing VideoLingo, ensure you have installed Git and Anaconda. 1. Clone the project: ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Create and activate virtual environment (**must be python=3.10.0**): ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo ``` > ⚠️ **Pitfall:** Make sure pip is using the conda env's site-packages. On Windows, if the `site-packages` directory is not writable (e.g. 
under `C:\ProgramData\anaconda3\`), pip silently installs to the user directory instead. If this happens, run the terminal as administrator. 3. Run installation script: ```bash python install.py ``` > ⚠️ **Install order matters:** `install.py` installs dependencies in the correct order: PyTorch first (locks CUDA version), then demucs with `--no-deps` (prevents torchaudio downgrade), then the rest. **Do not rearrange manually.** 4. 🎉 Launch Streamlit app by running the command or double-clicking `OneKeyStart.bat`: ```bash streamlit run st.py ``` 5. Set your API key in the sidebar of the web page that opens, and start using it~ ![tutorial](./en_page.png) 6. (Optional) More settings can be manually modified in `config.yaml`; watch the command line output during operation. To use custom terms, add them to `custom_terms.xlsx` before processing, e.g. `Baguette | French bread | Not just any bread!`. > Need help? Our [AI Assistant](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) is here to guide you through any issues! ## 🏭 Batch Mode (beta) Document: [English](/batch/README.md) | [Chinese](/batch/README.zh.md) Note: This section is still in early development and may have limited functionality ## 🚨 Common Errors & Pitfalls 1. **'All arrays must be of the same length' or 'KeyError' during translation**: - Reason 1: Weaker models have poor JSON format compliance causing response parsing errors. - Reason 2: LLM may refuse to translate sensitive content. Solution: Check `response` and `msg` fields in `output/gpt_log/error.json`, delete the `output/gpt_log` folder and retry. 2. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: Usually network issues. Solution: Users in mainland China please switch network nodes and retry. 3. **local_files_only=True**: Model download failure due to network issues, need to verify network can ping `huggingface.co`. 4. **`cublas64_12.dll not found`**: Installed CUDA 13.x and used cu130/cu131 PyTorch wheels. 
**Solution:** Must use cu129, cu128, or cu126 wheels (`install.py` handles this automatically via `nvidia-smi` detection) because ctranslate2 only supports CUDA 12. Re-run `python install.py`. 5. **Whisper model loading segfaults silently**: ctranslate2 version mismatches cuDNN version. **Solution:** Ensure `ctranslate2>=4.5.0` (supports cuDNN 9, which PyTorch 2.6+ ships with). 6. **`RuntimeError: Weights only load failed`**: PyTorch ≥2.6 changed `torch.load` default behavior. **Solution:** Already fixed via monkey-patch in `whisperX_local.py`. If you see this, your code is not up to date. 7. **WhisperX transcription hangs in Streamlit (CPU/GPU idle)**: `librosa.load()` deadlocks in Streamlit's non-main thread. **Solution:** Already fixed by replacing with `whisperx.audio.load_audio()` (ffmpeg subprocess). If you see this, your code is not up to date. 8. **spacy `Can't find model 'xx_core_web_md'` (but pip says installed)**: pip installed the model to user directory instead of conda env. **Solution:** Run terminal as administrator, or manually install with conda env's python: ```bash python -m pip install xx-core-web-md --no-user --force-reinstall --no-deps ``` 9. **torchaudio version drops to 1.x or 2.1.x after pip install**: demucs's `torchaudio<2.2` constraint causes downgrade. **Solution:** Never `pip install demucs` directly — must use `--no-deps`. `install.py` handles this correctly. ================================================ FILE: docs/pages/docs/start.zh-CN.md ================================================ # 🚀 开始使用 ## 📋 API 配置指南 本项目需使用大模型和 TTS。追求最佳质量请使用 claude-3-5-sonnet-20240620 与 Azure TTS。也可以选择完全本地化体验,使用 Ollama 作为大模型,Edge TTS 作为配音,无需任何 API key(此时需要在 `config.yaml` 中将 `max_workers` 设为 1,`summary_length` 调低至 2000)。 ### 1. 
**大模型的 API_KEY**: | 推荐模型 | 厂商 | 效果 | 性价比 | |:-----|:---------|:-----|:---------| | claude-sonnet-4-6 | [Anthropic](https://www.anthropic.com) | 🤩 | ⭐⭐⭐ | | claude-opus-4-6 | [Anthropic](https://www.anthropic.com) | 🏆 | ⭐⭐ | | gpt-5.2 | [OpenAI](https://openai.com) | 🤩 | ⭐⭐⭐ | | gemini-3-flash | [Google](https://ai.google.dev) | 😃 | ⭐⭐⭐⭐⭐ | | gemini-3.1-pro | [Google](https://ai.google.dev) | 🤩 | ⭐⭐⭐ | | minimax-m2.5 | [MiniMax](https://www.minimax.io) | 😃 | ⭐⭐⭐⭐⭐ | | kimi-k2.5 | [Moonshot AI](https://www.moonshot.cn) | 😃 | ⭐⭐⭐⭐ | | deepseek-v3 | [DeepSeek](https://www.deepseek.com) | 🥳 | ⭐⭐⭐⭐ | | qwen3-32b | [Ollama](https://ollama.ai) 本地部署 | 😃 | ♾️ 免费 | > **提示:** 模型价格变动频繁,请前往各厂商官网查看最新定价。[models.dev](https://models.dev) 可横向比较各家模型的价格和能力。 > > **API 中转推荐:** 如果无法直接访问海外 API,推荐使用 [OpenRouter](https://openrouter.ai)(支持上述所有海外模型,统一 OpenAI 格式接口,按量付费无月费)。 注:支持 OpenAI 格式接口,可自行尝试不同模型。但处理过程涉及多步思维链和复杂的json格式,**不建议使用小于 30B 的模型**。 ### 2. **TTS 的 API** VideoLingo提供了多种 tts 接入方式,以下是对比(如不使用配音可跳过) | TTS 方案 | 提供商 | 优点 | 缺点 | 中文效果 | 非中文效果 | |:---------|:---------|:-----|:-----|:---------|:-----------| | 🔊 Azure TTS ⭐ | [302AI](https://gpt302.saaslink.net/C2oHR9) | 效果自然 | 情感不够丰富 | 🤩 | 😃 | | 🎙️ OpenAI TTS | [302AI](https://gpt302.saaslink.net/C2oHR9) | 情感真实 | 中文听起来像外国人 | 😕 | 🤩 | | 🎤 Fish TTS | [302AI](https://gpt302.saaslink.net/C2oHR9) | 真是本地人 | 官方模型有限 | 🤩 | 😂 | | 🎙️ SiliconFlow FishTTS | [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) | 语音克隆 | 克隆效果不稳定 | 😃 | 😃 | | 🗣 Edge TTS | 本地 | 完全免费 | 效果一般 | 😐 | 😐 | | 🗣️ GPT-SoVITS | 本地 | 最强语音克隆 | 只支持中英文,需要本地训练推理,配置麻烦 | 🏆 | 🚫 | - SiliconFlow FishTTS 请在 [硅基流动](https://cloud.siliconflow.cn/i/ttKDEsxE) 获取key,注意克隆功能需要付费充值积分; - OpenAI TTS、Azure TTS 和 Fish TTS,仅支持 [302AI](https://gpt302.saaslink.net/C2oHR9) - 一个 API key 即可使用所有服务 > 现在还可以在 `core/tts_backend/custom_tts.py` 里自定义tts渠道!
SiliconFlow FishTTS 使用教程 目前支持 3 种模式: 1. `preset`: 使用固定音色,可以在 [官网Playground](https://cloud.siliconflow.cn/playground/text-to-speech/17885302608) 试听,默认 `anna`。 2. `clone(stable)`: 对应 fishtts api 的 `custom`,使用一段上传音频的音色,会自动采集视频前十秒声音作为音色使用,音色一致性更好。 3. `clone(dynamic)`: 对应 fishtts api 的 `dynamic`,在 tts 过程使用每一句作为参考音频,可能出现音色不一致,但效果更好。
OpenAI 声音怎么选? 声音列表可以在 [官网](https://platform.openai.com/docs/guides/text-to-speech/voice-options) 找到,例如 `alloy`、`echo`、`nova` 等,在 `config.yaml` 中修改 `openai_tts.voice` 即可。
Azure 声音怎么选? 建议在 [在线体验](https://speech.microsoft.com/portal/voicegallery) 中试听选择你想要的声音,在右边的代码中可以找到该声音对应的代号,例如 `zh-CN-XiaoxiaoMultilingualNeural`
Fish TTS 声音怎么选? 前往 [官网](https://fish.audio/zh-CN/) 中试听选择你想要的声音,在 URL 中可以找到该声音对应的代号,例如丁真是 `54a5170264694bfc8e9ad98df7bd89c3`,热门的几种声音已添加在 `config.yaml` 中。如需使用其他声音,请在 `config.yaml` 中修改 `fish_tts.character_id_dict` 字典。
GPT-SoVITS-v2 使用教程 1. 前往 [官方的语雀文档](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO) 查看配置要求并下载整合包。 2. 将 `GPT-SoVITS-v2-xxx` 与 `VideoLingo` 放在同一个目录下。**注意是两文件夹并列。** 3. 选择以下任一方式配置模型: a. 自训练模型: - 训练好模型后, `GPT-SoVITS-v2-xxx\GPT_SoVITS\configs` 下的 `tts_infer.yaml` 已自动填写好你的模型地址,将其复制并重命名为 `你喜欢的英文角色名.yaml` - 在和 `yaml` 文件同个目录下,放入后续使用的参考音频,命名为 `你喜欢的英文角色名_参考音频的文字内容.wav` 或 `.mp3`,例如 `Huanyuv2_你好,这是一条测试音频.wav` - 在 VideoLingo 网页的侧边栏中,将 `GPT-SoVITS 角色` 配置为 `你喜欢的英文角色名`。 b. 使用预训练模型: - 从 [这里](https://vip.123pan.cn/1817874751/8137723) 下载我的模型,解压后覆盖到 `GPT-SoVITS-v2-xxx`。 - 在 VideoLingo 网页的侧边栏中,将 `GPT-SoVITS 角色` 配置为 `Huanyuv2`。 c. 使用其他训练好的模型: - 将 `xxx.ckpt` 模型文件放在 `GPT_weights_v2` 文件夹下,将 `xxx.pth` 模型文件放在 `SoVITS_weights_v2` 文件夹下。 - 参考方法 a,重命名 `tts_infer.yaml` 文件,并修改文件中的 `custom` 部分的 `t2s_weights_path` 和 `vits_weights_path` 指向你的模型,例如: ```yaml # 示例:方法 b 的配置 t2s_weights_path: GPT_weights_v2/Huanyu_v2-e10.ckpt version: v2 vits_weights_path: SoVITS_weights_v2/Huanyu_v2_e10_s150.pth ``` - 参考方法 a,在和 `yaml` 文件同个目录下,放入后续使用的参考音频,命名为 `你喜欢的英文角色名_参考音频的文字内容.wav` 或 `.mp3`,例如 `Huanyuv2_你好,这是一条测试音频.wav`,程序会自动识别并使用。 - ⚠️ 警告:**请使用英文命名 `角色名`** ,否则会出现错误。 `参考音频的文字内容` 可以使用中文。目前仍处于测试版,可能产生报错。 ``` # 期望的目录结构: . ├── VideoLingo │ └── ... └── GPT-SoVITS-v2-xxx ├── GPT_SoVITS │ └── configs │ ├── tts_infer.yaml │ ├── 你喜欢的英文角色名.yaml │ └── 你喜欢的英文角色名_参考音频的文字内容.wav ├── GPT_weights_v2 │ └── [你的GPT模型文件] └── SoVITS_weights_v2 └── [你的SoVITS模型文件] ``` 配置完成后,注意在网页侧边栏选择 `参考音频模式`(具体原理可以参考语雀文档),VideoLingo 在配音步骤时会自动在弹出的命令行中打开 GPT-SoVITS 的推理 API 端口,配音完成后可手动关闭。注意,此方法的稳定性取决于选择的底模。
## 🛠️ 快速上手 VideoLingo 支持 Windows、macOS 和 Linux 系统,可使用 CPU 或 GPU 运行。 > **注意:** 在 Windows 上使用 NVIDIA GPU 加速,请先完成以下步骤: > 1. 安装 [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) 或更高版本(12.8 / 12.9 / 13.x 均可,安装脚本会自动适配) > 2. 安装 [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. 将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加到系统 PATH > 4. 重启电脑 > > ⚠️ **踩坑提示:** 安装脚本通过 `nvidia-smi` 检测驱动的 CUDA 版本,自动选择合适的 PyTorch 轮子(cu129 / cu128 / cu126)。RTX 50 系列(Blackwell)显卡会自动选择包含 sm_100 内核的 cu129 轮子。**不需要手动安装 cu130/cu131 的 PyTorch**——这会导致 ctranslate2 找不到 `cublas64_12.dll` 而报错。 > **注意:** FFmpeg 是必需的,请通过包管理器安装: > - Windows:```choco install ffmpeg```(通过 [Chocolatey](https://chocolatey.org/)) > - macOS:```brew install ffmpeg```(通过 [Homebrew](https://brew.sh/)) > - Linux:```sudo apt install ffmpeg```(Debian/Ubuntu) > > ⚠️ **踩坑提示:** 不要使用 conda-forge 的 ffmpeg(缺少 libmp3lame 编码器)。建议用系统包管理器安装完整版。 开始安装 VideoLingo 之前,请确保安装了 Git 和 Anaconda。 1. 克隆项目: ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. 创建并激活虚拟环境(**必须使用 3.10**): ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo ``` > ⚠️ **踩坑提示:** 请确保用的是 conda 环境里的 pip。如果 Windows 系统的 `site-packages` 不可写(如 `C:\ProgramData\anaconda3\`),pip 会悄悄把包装到用户目录,导致 conda 环境里找不到。遇到此问题可以用管理员权限运行终端。 3. 运行安装脚本: ```bash python install.py ``` > ⚠️ **安装顺序说明:** `install.py` 会按正确顺序安装依赖:先装 PyTorch(锁定 CUDA 版本),再用 `--no-deps` 装 demucs(避免 torchaudio 被降级),最后装其余依赖。**不要手动打乱顺序。** 4. 🎉 输入命令或双击 `OneKeyStart.bat` 启动 Streamlit 应用: ```bash streamlit run st.py ``` 5. 在弹出网页的侧边栏中设置key,开始使用~ ![tutorial](./zh_page.png) 6. (可选)更多设置可以在 `config.yaml` 中手动修改,运行过程请注意命令行输出。如需使用自定义术语,请在处理前将术语添加到 `custom_terms.xlsx` 中,例如 `Biden | 登子 | 美国的瞌睡总统`。 > 需要帮助?我们的 [AI助手](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) 随时解答问题! 
## 🏭 批量模式(beta) 使用说明: [English](/batch/README.md) | [简体中文](/batch/README.zh.md) 这个模式仍处于早期开发阶段,可能有潜在的错误。 ## 🚨 常见报错与踩坑 1. **翻译过程的 'All array must be of the same length' 或 'Key Error'**: - 原因1:弱模型遵循JSON格式能力较弱导致响应解析错误。 - 原因2:对于敏感内容,LLM可能拒绝翻译。 解决方案:检查 `output/gpt_log/error.json` 的 `response` 和 `msg` 字段,删掉 `output/gpt_log` 文件夹后重试。 2. **'Retry Failed', 'SSL', 'Connection', 'Timeout'**: 通常是网络问题。解决方案:中国大陆用户请切换网络节点重试。 3. **local_files_only=True**:网络问题引起的模型下载失败,需要确认网络能 ping 通 `huggingface.co`。 4. **`cublas64_12.dll not found`**: 安装了 CUDA 13.x 后使用了 cu130/cu131 PyTorch 轮子。**解决方案:** 必须使用 cu129、cu128 或 cu126 轮子(`install.py` 已通过 `nvidia-smi` 自动检测处理),因为 ctranslate2 只支持 CUDA 12。重新运行 `python install.py` 即可。 5. **Whisper 模型加载时无报错直接段错误 (Segfault)**: ctranslate2 版本与 cuDNN 版本不匹配。**解决方案:** 确保 `ctranslate2>=4.5.0`(支持 cuDNN 9,PyTorch 2.6+ 自带 cuDNN 9)。 6. **`RuntimeError: Weights only load failed`**: PyTorch ≥2.6 更改了 `torch.load` 的默认行为。**解决方案:** 已在 `whisperX_local.py` 中通过猴补丁修复,如果遇到此问题说明代码未正确更新。 7. **Streamlit 中 WhisperX 转录卡住不动(CPU/GPU 均空闲)**: `librosa.load()` 在 Streamlit 的非主线程中死锁。**解决方案:** 已用 `whisperx.audio.load_audio()`(基于 ffmpeg 子进程)替换。如果遇到此问题说明代码未正确更新。 8. **spacy 模型 `Can't find model 'xx_core_web_md'`(但 pip 显示已安装)**: pip 将模型安装到了用户目录而非 conda 环境。**解决方案:** 用管理员权限运行终端,或手动指定 conda 环境的 python 路径安装: ```bash python -m pip install xx-core-web-md --no-user --force-reinstall --no-deps ``` 9. 
**pip 安装后 torchaudio 版本变成 1.x 或 2.1.x**: demucs 的 `torchaudio<2.2` 约束导致降级。**解决方案:** 不要手动 `pip install demucs`,必须用 `--no-deps` 安装。`install.py` 已正确处理。 ================================================ FILE: docs/pages/docs/tech.en-US.md ================================================ ## Videolingo Video Translation System Technical Documentation Videolingo is a highly integrated video translation system capable of automatically executing a series of complex operations, including video downloading, audio extraction, speech recognition, text processing, translation, subtitle generation, text-to-speech synthesis, and audio-video synthesis. The system leverages AI technologies (ASR, NLP, LLMs, TTS) and provides both a command-line interface for batch processing and an interactive web interface using Streamlit for task management and system configuration. The project has undergone significant refactoring, resulting in a more modular and robust architecture. Core functionalities are now organized into distinct packages and modules, primarily located within the `core` directory and its subdirectories (`asr_backend`, `spacy_utils`, `st_utils`, `tts_backend`, `utils`), as well as a dedicated `batch` directory for batch processing utilities, and a `translations` directory for internationalization. For developers, many components within the `core` directory – especially files named with the `_X_*.py` numbering scheme – represent distinct steps in the processing pipeline and can be executed individually for debugging purposes. Intermediate and final outputs are typically stored in the `output` directory, with mechanisms for cleanup and archival to the `history` directory. The following outlines the core technical modules and workflows: **1. Installation and Setup:** * `install.py`: The primary installation script. 
Automates dependency installation (including PyTorch with GPU/CPU detection), environment configuration (languages, PyPI mirror), FFmpeg checks, Noto font installation (Linux), and launching the Streamlit application. * `setup.py`: A standard Python project setup file, utilizing `setuptools`. Defines project metadata (name, version) and dependencies (read from `requirements.txt`) for packaging and installation via pip. **2. Video Acquisition Module:** * `core/_1_ytdlp.py`: Integrates the `yt-dlp` library to download videos from URLs. Handles `yt-dlp` updates, filename sanitization, resolution selection, and cookie usage for authenticated downloads. Also includes a function to locate a single video file within the output directory. **3. Audio Processing and Speech Recognition (ASR) Module (`core/asr_backend`):** * `core/asr_backend/demucs_vl.py`: Employs the Demucs model (`htdemucs`) to separate audio into vocal and background tracks, improving the quality of subsequent ASR. * `core/asr_backend/audio_preprocess.py`: Contains fundamental functions for preparing audio: volume normalization (`pydub`), video-to-audio conversion (`ffmpeg`), silence detection (`ffmpeg`), audio duration calculation (`ffmpeg`), splitting long audio files into manageable segments, processing ASR results into DataFrames, saving results, and storing detected languages. * `core/asr_backend/whisperX_local.py`: Implements local audio transcription using the WhisperX library. Optimizes performance based on available hardware (GPU/CPU), handles model downloads (with mirror checking), performs transcription and alignment, adjusts timestamps, and manages GPU memory. * `core/asr_backend/whisperX_302.py`: Implements audio transcription using the 302.ai WhisperX API, including caching and timestamp adjustment. 
* `core/asr_backend/elevenlabs_asr.py`: Implements audio transcription using the ElevenLabs Speech to Text API, handling audio slicing, API interaction, format conversion (ElevenLabs to Whisper-like format), and temporary file management. * `core/_2_asr.py`: Orchestrates the ASR process. Extracts audio, optionally performs Demucs vocal separation, splits audio, invokes the configured ASR backend (local WhisperX, 302 API, or Elevenlabs API), merges results, processes transcriptions into a DataFrame, and saves the output. **4. Text Processing and Translation Module (`core`, `core/spacy_utils`):** * **Sentence Splitting (`core/spacy_utils`):** * `core/spacy_utils/load_nlp_model.py`: Loads and initializes the appropriate spaCy NLP model based on the detected language, handling model downloads if necessary. * `core/spacy_utils/split_by_mark.py`: Performs initial sentence splitting using spaCy based on punctuation marks, with special handling for dashes and ellipses. * `core/spacy_utils/split_by_comma.py`: Further refines sentence splitting based on commas, utilizing spaCy to analyze grammatical validity. * `core/spacy_utils/split_by_connector.py`: Splits sentences based on linguistic connectors (conjunctions, relative pronouns) using spaCy, supporting multiple languages. * `core/spacy_utils/split_long_by_root.py`: Splits overly long sentences using spaCy's dependency parsing (identifying sentence subjects) and fallback length-based splitting. * `core/_3_1_split_nlp.py`: Orchestrates the spaCy-based splitting process, calling the various splitting functions (`split_by_mark`, `split_by_comma_main`, `split_sentences_main`, `split_long_by_root_main`). * **Meaning-Based Splitting and Translation:** * `core/_3_2_split_meaning.py`: Intelligently splits long sentences based on semantics using a GPT model, ensuring shorter and more manageable units for translation and subtitling. Leverages prompts defined in `core/prompts.py`. 
* `core/_4_1_summarize.py`: Uses an LLM (GPT) to generate summaries of video scripts and extract relevant terms (optionally augmented with custom terms from `custom_terms.xlsx`). Saves results to a JSON file. Leverages prompts defined in `core/prompts.py`. * `core/translate_lines.py`: Implements the core line-by-line translation logic using a GPT model. Employs a two-step approach (fidelity and expressiveness) for high-quality translation, incorporating context prompting and retry mechanisms. Leverages prompts defined in `core/prompts.py`. * `core/_4_2_translate.py`: Manages the overall translation process. Splits text into chunks, gathers context, calls `core/translate_lines.py` for parallel chunk translation, checks translation quality (similarity), aligns timestamps, trims text to fit audio durations, and saves results to Excel. **5. Subtitle Processing and Synthesis Module (`core`):** * `core/_5_split_sub.py`: Splits long translated subtitles into shorter segments suitable for display, using weighted length calculations and GPT-based alignment with source subtitles. Leverages prompts defined in `core/prompts.py`. * `core/_6_gen_sub.py`: Generates the final SRT subtitle files. Aligns translated text with source timestamps, cleans text, formats timestamps, handles small gaps, and generates various SRT output formats (source, translated, combined) for display and audio dubbing. * `core/_7_sub_into_vid.py`: Merges ("burns") the generated SRT subtitles (source and translated) directly into the video file using `ffmpeg`, with customizable styling and GPU acceleration support. If burning is disabled, creates a placeholder video. **6. Audio Dubbing Module (`core`, `core/tts_backend`):** * `core/_8_1_audio_task.py`: Parses the SRT file, merges short subtitles, cleans the text, trims text based on estimated duration using an LLM, and generates an Excel file (`_8_1_AUDIO_TASK.xlsx`) defining the tasks for the TTS engine. Leverages prompts defined in `core/prompts.py`. 
* `core/_8_2_dub_chunks.py`: Analyzes the audio task file, calculates time gaps and speaking rates, determines optimal cut points for dubbing chunks based on speed and pauses, merges lines where necessary, matches subtitles, and updates the task file. * `core/_9_refer_audio.py`: Extracts specific audio segments from the source vocal track based on timestamps defined in the audio task file, creating reference audio files used by certain TTS engines (e.g., GPT-SoVITS, F5-TTS, FishTTS). * **TTS Backends (`core/tts_backend`):** * `core/tts_backend/azure_tts.py`: Interface to the Azure Text-to-Speech API. * `core/tts_backend/custom_tts.py`: Placeholder/template for integrating custom TTS engines. * `core/tts_backend/edge_tts.py`: Interface to Microsoft Edge TTS using the `edge-tts` command-line tool. * `core/tts_backend/fish_tts.py`: Interface to the 302.ai Fish TTS API. * `core/tts_backend/gpt_sovits_tts.py`: Interface to a local GPT-SoVITS server, including server startup logic. * `core/tts_backend/openai_tts.py`: Interface to the OpenAI Text-to-Speech API. * `core/tts_backend/sf_cosyvoice2.py`: Interface to the SiliconFlow CosyVoice2 TTS API, supporting reference audio. * `core/tts_backend/sf_fishtts.py`: Interface to the SiliconFlow Fish TTS API, supporting preset, custom, and dynamic voice modes with reference audio. * `core/tts_backend/_302_f5tts.py`: Interface to the 302.ai F5-TTS API, which uses reference audio for voice cloning. * `core/tts_backend/estimate_duration.py`: Provides functions to estimate the speaking duration of text based on syllable counts and punctuation pauses for a given language. Used for audio task generation and subtitle trimming. * `core/tts_backend/tts_main.py`: Central TTS dispatcher. 
Cleans input text, selects the appropriate TTS backend based on configuration (`load_key("tts_method")`), calls the corresponding TTS function, handles errors using retries and GPT-based text correction, validates audio duration, and saves the output WAV file. * `core/_10_gen_audio.py`: Generates individual audio segments using the selected TTS backend via `tts_main.py`. Adjusts the speed of the generated audio using a computed factor (`ffmpeg`) to match target durations specified in the task file, and concatenates segments into chunks. Uses `ThreadPoolExecutor` for parallel processing. * `core/_11_merge_audio.py`: Merges the generated and speed-adjusted audio segments (`.wav` files from `output/audio_segments/`) into a single, continuous dubbed audio track (`output/dub.wav`), adding silences according to subtitle timings. Also generates a corresponding SRT file (`output/dub.srt`). * `core/_12_dub_to_vid.py`: The final synthesis step for dubbing. Merges the original video, the generated dubbed audio track (`output/dub.wav`), and the separated background music (`output/background.mp3`, if Demucs was used) using `ffmpeg`. Optionally burns subtitles during this process. Includes audio normalization. **7. Core Utilities and Configuration (`core/utils`):** * `core/prompts.py`: Defines standardized prompt templates used to guide the LLM (GPT) for tasks such as sentence splitting, summarization, translation (fidelity/expressiveness), subtitle alignment, and text optimization/correction for TTS. * `core/utils/ask_gpt.py`: Provides a robust interface (`ask_gpt` function) for interacting with the OpenAI GPT models. Includes caching (file-based), JSON response repair (`json_repair`), response validation, error handling with retries (`@except_handler`), and logging. 
* `core/utils/config_utils.py`: Utility functions (`load_key`, `update_key`) for loading and updating configuration settings from `config.yaml` using `ruamel.yaml` (preserves formatting) and `threading.Lock` for thread-safe access. Includes `get_joiner` for language-specific text concatenation. * `core/utils/decorator.py`: Defines reusable decorators: `except_handler` for adding retry logic and error reporting to functions, and `check_file_exists` for skipping function execution if the output file already exists. Uses `rich` for formatted output. * `core/utils/delete_retry_dubbing.py`: Provides a function (`delete_dubbing_files`) to clean up specific intermediate files and directories associated with the dubbing process (e.g., `dub.wav`, `output_dub.mp4`, `output/audio/segs`). * `core/utils/onekeycleanup.py`: Implements a `cleanup` function to organize and archive files from the `output` directory into a structured `history` directory based on the video name. Includes filename sanitization and robust file moving/deletion logic. * `core/utils/pypi_autochoose.py`: A utility for automatically testing and selecting the fastest PyPI mirror and configuring pip to use it. Uses `rich` for UI. * `core/utils/models.py`: Defines constants representing filepaths of various intermediate and output files used throughout the pipeline. * `core/__init__.py`, `core/asr_backend/__init__.py`, `core/spacy_utils/__init__.py`, `core/st_utils/__init__.py`, `core/tts_backend/__init__.py`: Package initialization files, defining the public interfaces (`__all__`) for their respective packages/subpackages. * `core/__init__.py`: Initializes the main `core` package, exporting key functions and modules from subpackages for easier access. **8. 
Batch Processing Module (`batch`):** * `batch/utils/settings_check.py`: Validates the settings defined in `batch/tasks_setting.xlsx` against the video files in `batch/input`, checking for file existence, valid URLs, and correct configuration values (e.g., dubbing flags). Uses `rich` for output. * `batch/utils/video_processor.py`: Defines the `process_video` function, which orchestrates the processing pipeline for *a single* video in a batch job. Handles input (URL or local file), calls the core processing steps (transcription, translation, subtitling, optional dubbing), manages retries, handles output folders, and invokes `cleanup`. * `batch/utils/batch_processor.py`: The main coordinator for batch processing. Reads tasks from `batch/tasks_setting.xlsx` (using `pandas`), iterates through the tasks, validates settings (`settings_check.py`), manages language configuration changes, calls `video_processor.py` for each video, handles errors and retries (including recovering files from the ERROR folder), and updates the status in the Excel file. Uses `rich` for console output. **9. Streamlit Interface Module (`core/st_utils`, `st.py`):** * `core/st_utils/download_video_section.py`: Implements the Streamlit UI section for selecting the input video, allowing users to download from YouTube (using `core/_1_ytdlp.py`) or upload local files (video or audio, with audio-to-video conversion using `ffmpeg`). * `core/st_utils/sidebar_setting.py`: Creates the configuration sidebar in the Streamlit UI. Allows users to set the display language, LLM parameters (API keys, model, base URL), subtitle settings (source/target languages, Demucs toggle, burning toggle), and dubbing settings (TTS method and related parameters such as voice, API key). Loads/saves settings using `core/utils/config_utils.py` and triggers `st.rerun()` on changes. Includes API key validation. 
* `core/st_utils/imports_and_utils.py`: Contains common imports and utility functions for the Streamlit application, such as functions for creating download buttons for zipped subtitle files and CSS styling for buttons. * `st.py`: The main entry point for the Streamlit web application. Sets up the page configuration, displays the logo, creates the sidebar using `sidebar_setting.py`, manages the main UI sections (video downloading/uploading via `download_video_section.py`, text processing, audio processing), and triggers the core processing functions (`process_text`, `process_audio`) based on user interaction (button clicks). Uses `st.spinner` to indicate progress during long-running operations. **10. Internationalization Module (`translations`):** * `translations/translations.py`: Implements UI translation functionality. Defines supported display languages, loads translated strings from JSON files based on the selected language (`load_key("display_language")`), and provides a `translate(key)` function to retrieve translated text, falling back to the original key if a translation is missing. Videolingo automates the complete process from video acquisition to the final generation of videos with translated subtitles and dubbing. The enhanced modular design allows each step to be more easily run and debugged, provides greater flexibility through multiple backend options (ASR, TTS), and offers improved configuration management and user interfaces for both interactive and batch processing workflows. 
================================================ FILE: docs/pages/docs/tech.zh-CN.md ================================================ ## Videolingo 视频翻译系统技术文档 Videolingo 是一个高度集成的视频翻译系统,能够自动执行一系列复杂的操作,包括视频下载、音频提取、语音识别、文本处理、翻译、字幕生成、文本到语音合成以及音视频合成。该系统利用 AI 技术(ASR、NLP、LLMs、TTS),并提供用于批量处理的命令行界面和使用 Streamlit 的交互式 Web 界面,用于任务管理和系统配置。 该项目已经过重大重构,形成了一个更加模块化和健壮的结构。核心功能现在组织成不同的包和模块,主要位于 `core` 目录及其子目录(`asr_backend`、`spacy_utils`、`st_utils`、`tts_backend`、`utils`)中,以及一个专门的 `batch` 目录用于批量处理实用程序,以及一个 `translations` 目录用于国际化。 对于开发人员来说,`core` 目录中的许多组件(尤其是编号为 `_X_*.py` 的文件)代表处理管道中的不同步骤,并且可以单独执行以进行调试。中间输出和最终输出通常存储在 `output` 目录中,并具有清理和归档到 `history` 目录的机制。 以下概述核心技术模块和工作流程: **1. 安装和设置:** * `install.py`: 主要安装脚本。自动化依赖项安装(包括带有 GPU/CPU 检测的 PyTorch)、环境配置(语言、PyPI 镜像)、FFmpeg 检查、Noto 字体安装 (Linux) 以及启动 Streamlit 应用程序。 * `setup.py`: 标准 Python 项目设置文件,使用 `setuptools`。定义项目元数据(名称、版本)和依赖项(从 `requirements.txt` 读取),用于通过 pip 进行打包和安装。 **2. 视频获取模块:** * `core/_1_ytdlp.py`: 集成 `yt-dlp` 库以从 URL 下载视频。处理 `yt-dlp` 更新、文件名清理、分辨率选择以及用于身份验证下载的 cookie 用法。还包括一个在输出目录中查找单个视频文件的函数。 **3. 音频处理和语音识别 (ASR) 模块 (`core/asr_backend`):** * `core/asr_backend/demucs_vl.py`: 使用 Demucs 模型 (`htdemucs`) 将音频分离为人声和背景音轨,从而提高后续 ASR 的质量。 * `core/asr_backend/audio_preprocess.py`: 包含准备音频的基本功能:音量标准化 (`pydub`)、视频到音频的转换 (`ffmpeg`)、静音检测 (`ffmpeg`)、音频时长计算 (`ffmpeg`)、将长音频文件拆分为可管理的片段、将 ASR 结果处理为 DataFrames、保存结果以及存储检测到的语言。 * `core/asr_backend/whisperX_local.py`: 使用 WhisperX 库实现本地音频转录。根据可用硬件(GPU/CPU)优化性能,处理模型下载(具有镜像检查),执行转录和对齐,调整时间戳,并管理 GPU 内存。 * `core/asr_backend/whisperX_302.py`: 使用 302.ai WhisperX API 实现音频转录,包括缓存和时间戳调整。 * `core/asr_backend/elevenlabs_asr.py`: 使用 ElevenLabs 语音转文本 API 实现音频转录,处理音频切片、API 交互、格式转换(ElevenLabs 到类似 Whisper 的格式)和临时文件管理。 * `core/_2_asr.py`: 编排 ASR 过程。提取音频,可选择执行 Demucs 人声分离,拆分音频,调用配置的 ASR 后端(本地 WhisperX、302 API 或 Elevenlabs API),合并结果,将转录处理为 DataFrame,并保存输出。 **4. 
文本处理和翻译模块 (`core`, `core/spacy_utils`):** * **句子拆分 (`core/spacy_utils`):** * `core/spacy_utils/load_nlp_model.py`: 根据检测到的语言加载和初始化适当的 spaCy NLP 模型,如果需要,处理模型下载。 * `core/spacy_utils/split_by_mark.py`: 使用 spaCy 基于标点符号执行初始句子拆分,并特殊处理破折号和省略号。 * `core/spacy_utils/split_by_comma.py`: 基于逗号进一步细化句子拆分,使用 spaCy 分析语法有效性。 * `core/spacy_utils/split_by_connector.py`: 使用 spaCy 基于语言连接词(连词、关系代词)拆分句子,支持多种语言。 * `core/spacy_utils/split_long_by_root.py`: 使用 spaCy 的依赖关系解析(识别句子主语)和基于回退长度的拆分来拆分过长的句子。 * `core/_3_1_split_nlp.py`: 编排基于 spaCy 的拆分过程,调用各种拆分函数(`split_by_mark`、`split_by_comma_main`、`split_sentences_main`、`split_long_by_root_main`)。 * **基于含义的拆分和翻译:** * `core/_3_2_split_meaning.py`: 使用 GPT 模型根据语义智能地拆分长句子,确保翻译和字幕的单元更短、更易于管理。利用 `core/prompts.py` 中定义的提示。 * `core/_4_1_summarize.py`: 使用 LLM (GPT) 生成视频脚本的摘要并提取相关术语(可以选择使用 `custom_terms.xlsx` 中的自定义术语进行增强)。将结果保存到 JSON 文件。利用 `core/prompts.py` 中定义的提示。 * `core/translate_lines.py`: 使用 GPT 模型实现核心的逐行翻译逻辑。采用两步法(忠实性和表达性)进行高质量翻译,结合上下文提示和重试机制。利用 `core/prompts.py` 中定义的提示。 * `core/_4_2_translate.py`: 管理整体翻译过程。将文本拆分为块,收集上下文,调用 `core/translate_lines.py` 进行并行块翻译,检查翻译质量(相似性),对齐时间戳,修剪文本以适应音频时长,并将结果保存到 Excel。 **5. 字幕处理和合成模块 (`core`):** * `core/_5_split_sub.py`: 将长的翻译字幕拆分为适合显示的较短片段,使用加权长度计算和基于 GPT 的与源字幕的对齐。利用 `core/prompts.py` 中定义的提示。 * `core/_6_gen_sub.py`: 生成最终的 SRT 字幕文件。将翻译后的文本与源时间戳对齐,清理文本,格式化时间戳,处理小间隙,并为显示和音频配音生成各种 SRT 输出格式(源、翻译、组合)。 * `core/_7_sub_into_vid.py`: 使用 `ffmpeg` 将生成的 SRT 字幕(源和翻译)直接合并("烧录")到视频文件中,具有可自定义的样式和 GPU 加速支持。如果禁用烧录,则创建一个占位符视频。 **6. 
音频配音模块 (`core`, `core/tts_backend`):** * `core/_8_1_audio_task.py`: 解析 SRT 文件,合并短字幕,清理文本,使用 LLM 根据估计的时长修剪文本,并生成一个 Excel 文件 (`_8_1_AUDIO_TASK.xlsx`),用于定义 TTS 引擎的任务。利用 `core/prompts.py` 中定义的提示。 * `core/_8_2_dub_chunks.py`: 分析音频任务文件,计算时间间隙和语速,根据速度和停顿确定配音块的最佳切断点,必要时合并行,匹配字幕,并更新任务文件。 * `core/_9_refer_audio.py`: 基于音频任务文件中定义的时间戳,从源人声音轨中提取特定的音频片段,创建某些 TTS 引擎(如 GPT-SoVITS、F5-TTS、FishTTS)使用的参考音频文件。 * **TTS 后端 (`core/tts_backend`):** * `core/tts_backend/azure_tts.py`: Azure 文本转语音 API 的接口。 * `core/tts_backend/custom_tts.py`: 用于集成自定义 TTS 引擎的占位符/模板。 * `core/tts_backend/edge_tts.py`: 使用 `edge-tts` 命令行工具的 Microsoft Edge TTS 的接口。 * `core/tts_backend/fish_tts.py`: 302.ai Fish TTS API 的接口。 * `core/tts_backend/gpt_sovits_tts.py`: 本地 GPT-SoVITS 服务器的接口,包括服务器启动逻辑。 * `core/tts_backend/openai_tts.py`: OpenAI 文本转语音 API 的接口。 * `core/tts_backend/sf_cosyvoice2.py`: SiliconFlow CosyVoice2 TTS API 的接口,支持参考音频。 * `core/tts_backend/sf_fishtts.py`: SiliconFlow Fish TTS API 的接口,支持具有参考音频的预设、自定义和动态语音模式。 * `core/tts_backend/_302_f5tts.py`: 302.ai F5-TTS API 的接口,使用参考音频进行语音克隆。 * `core/tts_backend/estimate_duration.py`: 提供根据特定语言的音节计数和标点符号停顿来估计文本的说话时长的函数。用于音频任务生成和字幕修剪。 * `core/tts_backend/tts_main.py`: 中央 TTS 调度器。清理输入文本,根据配置 (`load_key("tts_method")`) 选择适当的 TTS 后端,调用相应的 TTS 函数,使用重试和基于 GPT 的文本纠正来处理错误,验证音频时长,并保存输出 WAV 文件。 * `core/_10_gen_audio.py`: 使用选定的 TTS 后端通过 `tts_main.py` 生成单独的音频片段。基于计算的因子调整生成的音频速度 (`ffmpeg`) 以适应任务文件中指定的目标时长,并将片段合并为块。使用 `ThreadPoolExecutor` 处理并行处理。 * `core/_11_merge_audio.py`: 将生成的和速度调整的音频片段(来自 `output/audio_segments/` 的 `.wav` 文件)合并为单个连续的配音音轨 (`output/dub.wav`),根据字幕时序添加静音。 还生成相应的 SRT 文件 (`output/dub.srt`)。 * `core/_12_dub_to_vid.py`: 配音的最终合成步骤。使用 `ffmpeg` 合并原始视频、生成的配音音轨 (`output/dub.wav`) 和分离的背景音乐 (`output/background.mp3`,如果使用了 Demucs)。可选择在此过程中烧录字幕。包括音频标准化。 **7. 
核心实用程序和配置 (`core/utils`):** * `core/prompts.py`: 定义标准化的提示模板,用于指导 LLM(GPT)完成诸如句子拆分、摘要、翻译(忠实性/表达性)、字幕对齐以及文本优化/校正以进行 TTS 等任务。 * `core/utils/ask_gpt.py`: 提供一个强大的接口(`ask_gpt` 函数)用于与 OpenAI GPT 模型交互。 包括缓存(基于文件)、JSON 响应修复 (`json_repair`)、响应验证、带重试的错误处理 (`@except_handler`) 和日志记录。 * `core/utils/config_utils.py`: 实用程序函数 (`load_key`, `update_key`),用于使用 `ruamel.yaml`(保留格式)和 `threading.Lock` 以线程安全的方式从 `config.yaml` 加载和更新配置设置。包括 `get_joiner` 用于特定语言的文本连接。 * `core/utils/decorator.py`: 定义可重用的装饰器:`except_handler` 用于向函数添加重试逻辑和错误报告,`check_file_exists` 用于如果输出文件已存在则跳过函数执行。 使用 `rich` 进行格式化的输出。 * `core/utils/delete_retry_dubbing.py`: 提供一个函数 (`delete_dubbing_files`) 来清理与配音过程相关的特定中间文件和目录(例如,`dub.wav`、`output_dub.mp4`、`output/audio/segs`)。 * `core/utils/onekeycleanup.py`: 实现 `cleanup` 函数,用于将文件从 `output` 目录组织和归档到基于视频名称的结构化的 `history` 目录中。包括文件名清理和强大的文件移动/删除逻辑。 * `core/utils/pypi_autochoose.py`: 用于自动测试和选择最快的 PyPI 镜像并配置 pip 以使用它的实用程序。 使用 `rich` 进行 UI。 * `core/utils/models.py`: 定义表示整个管道中使用的各种中间文件和输出文件的文件路径的常量。 * `core/__init__.py`, `core/asr_backend/__init__.py`, `core/spacy_utils/__init__.py`, `core/st_utils/__init__.py`, `core/tts_backend/__init__.py`: 包初始化文件,定义其各自包/子包的公共接口 (`__all__`)。 * `core/__init__.py`: 初始化主 `core` 包,从子包导出关键函数和模块,以便更轻松地访问。 **8. 批量处理模块 (`batch`):** * `batch/utils/settings_check.py`: 根据 `batch/input` 中的视频文件验证 `batch/tasks_setting.xlsx` 中定义的设置,检查文件是否存在、有效的 URL 和正确的配置值(例如,配音标志)。使用 `rich` 进行输出。 * `batch/utils/video_processor.py`: 定义 `process_video` 函数,该函数编排批处理作业中*单个*视频的处理管道。 处理输入(URL 或本地文件),调用核心处理步骤(转录、翻译、字幕、可选配音),并进行重试,管理输出文件夹,并调用 `cleanup`。 * `batch/utils/batch_processor.py`: 批量处理的主协调器。从 `batch/tasks_setting.xlsx` 读取任务(使用 `pandas`),迭代任务,验证设置 (`settings_check.py`),管理语言配置更改,为每个视频调用 `video_processor.py`,处理错误并重试(包括从 ERROR 文件夹恢复文件),并更新 Excel 文件中的状态。 使用 `rich` 进行控制台输出。 **9. 
Streamlit 界面模块 (`core/st_utils`, `st.py`):** * `core/st_utils/download_video_section.py`: 实现用于选择输入视频的 Streamlit UI 部分,允许用户从 YouTube 下载(使用 `core/_1_ytdlp.py`)或上传本地文件(视频或音频,使用 `ffmpeg` 进行音频到视频的转换)。 * `core/st_utils/sidebar_setting.py`: 在 Streamlit UI 中创建配置侧边栏。 允许用户设置显示语言、LLM 参数(API 密钥、模型、基本 URL)、字幕设置(识别/目标语言、Demucs 开关、烧录开关)和配音设置(TTS 方法和相关参数,如语音、API 密钥)。使用 `core/utils/config_utils.py` 加载/保存设置,并在更改时触发 `st.rerun()`。包括 API 密钥验证。 * `core/st_utils/imports_and_utils.py`: 包含 Streamlit 应用程序的通用导入和实用程序函数,例如创建压缩字幕文件的下载按钮的函数以及按钮的 CSS 样式。 * `st.py`: Streamlit Web 应用程序的主要入口点。设置页面配置,显示徽标,使用 `sidebar_setting.py` 创建侧边栏,管理主 UI 部分(通过 `download_video_section.py` 进行视频下载/上传,文本处理,音频处理),并根据用户交互(按钮点击)触发核心处理函数(`process_text`,`process_audio`)。使用 `st.spinner` 指示长时间操作期间的进度。 **10. 国际化模块 (`translations`):** * `translations/translations.py`: 实现 UI 的翻译功能。 定义支持的显示语言,根据选定的语言 (`load_key("display_language")`) 从 JSON 文件加载翻译字符串,并提供 `translate(key)` 函数来检索翻译后的文本,如果缺少翻译,则回退到原始键。 Videolingo实现了从视频获取到最终生成具有翻译字幕和配音的视频的完整流程自动化。 增强的模块化设计使每个步骤都可以更轻松地运行和调试,通过多个后端选项(ASR、TTS)提供更大的灵活性,并为交互式和批量处理工作流程提供改进的配置管理和用户界面。 ================================================ FILE: docs/pages/globals.css ================================================ @tailwind base; @tailwind components; @tailwind utilities; @layer base { :root { --background: 0 0% 100%; --foreground: 222.2 84% 4.9%; --card: 0 0% 100%; --card-foreground: 222.2 84% 4.9%; --popover: 0 0% 100%; --popover-foreground: 222.2 84% 4.9%; --primary: 222.2 47.4% 11.2%; --primary-foreground: 210 40% 98%; --secondary: 210 40% 96.1%; --secondary-foreground: 222.2 47.4% 11.2%; --muted: 210 40% 96.1%; --muted-foreground: 215.4 16.3% 46.9%; --accent: 210 40% 96.1%; --accent-foreground: 222.2 47.4% 11.2%; --destructive: 0 84.2% 60.2%; --destructive-foreground: 210 40% 98%; --border: 214.3 31.8% 91.4%; --input: 214.3 31.8% 91.4%; --ring: 222.2 84% 4.9%; --radius: 0.5rem; --chart-1: 12 76% 61%; --chart-2: 173 58% 39%; --chart-3: 197 37% 24%; --chart-4: 43 74% 66%; --chart-5: 27 87% 
67%; --color-1: 0 100% 63%; --color-2: 270 100% 63%; --color-3: 210 100% 63%; --color-4: 195 100% 63%; --color-5: 90 100% 63%; } .dark { --background: 222.2 84% 4.9%; --foreground: 210 40% 98%; --card: 222.2 84% 4.9%; --card-foreground: 210 40% 98%; --popover: 222.2 84% 4.9%; --popover-foreground: 210 40% 98%; --primary: 210 40% 98%; --primary-foreground: 222.2 47.4% 11.2%; --secondary: 217.2 32.6% 17.5%; --secondary-foreground: 210 40% 98%; --muted: 217.2 32.6% 17.5%; --muted-foreground: 215 20.2% 65.1%; --accent: 217.2 32.6% 17.5%; --accent-foreground: 210 40% 98%; --destructive: 0 62.8% 30.6%; --destructive-foreground: 210 40% 98%; --border: 217.2 32.6% 17.5%; --input: 217.2 32.6% 17.5%; --ring: 212.7 26.8% 83.9%; --chart-1: 220 70% 50%; --chart-2: 160 60% 45%; --chart-3: 30 80% 55%; --chart-4: 280 65% 60%; --chart-5: 340 75% 55%; --color-1: 0 100% 63%; --color-2: 270 100% 63%; --color-3: 210 100% 63%; --color-4: 195 100% 63%; --color-5: 90 100% 63%; } } @layer base { * { @apply border-border; } body { @apply bg-background text-foreground; } } .n-card { @apply bg-[#F8F8F7] border-[#706b5740] rounded-2xl shadow-none; } ================================================ FILE: docs/pages/index.en-US.mdx ================================================ --- title: VideoLingo --- import Landing from '@/components/landing' export const getStaticProps = ({ params }) => { return Promise.all([ fetch(`https://api.github.com/repos/Huanshere/VideoLingo`).then(res => res.json()), fetch(`https://api.github.com/repos/Huanshere/VideoLingo/contributors?per_page=16`).then(res => res.json()) ]).then(([repo, stargazers]) => ({ props: { ssg: { stars: repo.stargazers_count, recentStargazers: stargazers } }, revalidate: 60 })) } export default function Component() { const landingData = { hero: { title: "VideoLingo: Connecting Every Frame Across the World", description: "Netflix-level subtitle cutting, translation, alignment, and even dubbing - one-click fully automated video localization 
AI subtitle team", videoSrc: "/videos/demo.mp4" }, features: { title: "Powerful Features, Unleash Creativity", items: [ { title: 'Intelligent Subtitle Segmentation', description: 'Using NLP and LLM technologies to accurately segment subtitles based on sentence meaning, ensuring each phrase is just right.', icon: 'CheckCircle', }, { title: 'Context-Aware Translation', description: 'GPT summarizes and extracts terminology knowledge base, achieving context-coherent translation, making every sentence natural and fluent.', icon: 'ArrowRight', }, { title: 'Three-Step Translation Process', description: 'Direct translation - Reflection - Paraphrasing, multiple safeguards, rivaling the quality of professional subtitle team translations.', icon: 'CheckCircle', }, { title: 'Precise Subtitle Alignment', description: 'Using WhisperX for word-level timeline subtitle recognition, ensuring every word is accurately synchronized.', icon: 'ArrowRight', }, { title: 'High-Quality Dubbing', description: 'Supports various TTS solutions, including high-quality personalized dubbing with GPT-SoVITS technology, making videos more appealing.', icon: 'CheckCircle', }, { title: 'Developer-Friendly', description: 'Structured file design, convenient for developers to customize and extend functionality. 
Supports multiple deployment methods.', icon: 'ArrowRight', }, ] }, comments: { title: "They're All Using VideoLingo", items: [ { content: "What used to take a whole day now gets done in an hour!", author: "k", title: "Bilibili creator with 300k followers" }, { content: "This dubbing is even more accurate than my own speech, I suddenly have so many fun ideas 🤩", author: "Ah Biao", title: "Xiaohongshu Cantonese creator with 100k followers" }, { content: "I just posted it for fun after work, didn't expect it to blow up so quickly 😂", author: "X", title: "Douyin creator gaining 7k followers daily" } ] }, faq: { title: "Frequently Asked Questions", items: [ { question: "How is the translation quality?", description: "We strictly adhere to Netflix subtitle standards, using the most advanced Claude 3.5 model for multi-step translation." }, { question: "How long does it take to process a video?", answer: "Processing time depends on the length of the video and the selected services. Typically, a 60-minute video takes about 40 minutes to complete translation and dubbing." }, { question: "How is it priced?", answer: "VideoLingo is an open-source project that has already gained 3k+ stars on Github. 
A commercial version with more features is coming soon~" }, ] } } return } ================================================ FILE: docs/pages/index.ja.mdx ================================================ --- title: VideoLingo --- import Landing from '@/components/landing' export const getStaticProps = ({ params }) => { return Promise.all([ fetch(`https://api.github.com/repos/Huanshere/VideoLingo`).then(res => res.json()), fetch(`https://api.github.com/repos/Huanshere/VideoLingo/contributors?per_page=16`).then(res => res.json()) ]).then(([repo, stargazers]) => ({ props: { ssg: { stars: repo.stargazers_count, recentStargazers: stargazers } }, revalidate: 60 })) } export default function Component() { const landingData = { hero: { title: "VideoLingo: 世界の一コマ一コマをつなぐ", description: "Netflixレベルの字幕分割、翻訳、同期、さらに吹き替えまで、ワンクリックで全自動の動画翻訳AIチーム", videoSrc: "/videos/demo.mp4" }, features: { title: "強力な機能で創造性を解き放つ", items: [ { title: 'インテリジェントな字幕分割', description: 'NLPとLLM技術を使用し、文意に基づいて字幕を正確に分割し、各フレーズが適切であることを保証します。', icon: 'CheckCircle', }, { title: 'コンテキスト認識翻訳', description: 'GPTで用語知識ベースを要約・抽出し、文脈に沿った一貫性のある翻訳を実現。自然で流暢な翻訳を提供します。', icon: 'ArrowRight', }, { title: '3ステップ翻訳プロセス', description: '直接翻訳 - 反省 - 意訳の多重保証で、プロの字幕翻訳チームに匹敵する品質を実現。', icon: 'CheckCircle', }, { title: '精密な字幕同期', description: 'WhisperXを使用して単語レベルのタイムライン字幕認識を行い、すべての単語を正確に同期させます。', icon: 'ArrowRight', }, { title: '高品質な吹き替え', description: 'GPT-SoVITS技術を含む複数のTTSソリューションをサポートし、高品質でパーソナライズされた吹き替えで動画をより魅力的に。', icon: 'CheckCircle', }, { title: '開発者フレンドリー', description: '構造化されたファイル設計で、開発者によるカスタマイズと機能拡張が容易。複数のデプロイメント方法をサポート。', icon: 'ArrowRight', }, ] }, comments: { title: "VideoLingoユーザーの声", items: [ { content: "以前は1日かかっていたことが、今では1時間で完了します!", author: "k", title: "ビリビリ動画30万フォロワーの配信者" }, { content: "この吹き替えは私の話し方よりも正確で、たくさんの面白いアイデアが浮かびました🤩", author: "アービョウ", title: "小紅書10万フォロワーの広東語配信者" }, { content: "仕事帰りに単純に遊びで投稿したら、思わぬ大ヒットになりました😂", author: "X", title: "Douyin(TikTok中国版)で1日7000フォロワー増の配信者" } ] }, faq: { title: "よくある質問", items: [ { 
question: "翻訳の品質はどうですか?", description: "Netflixの字幕基準に厳密に従い、最先端のClaude 3.5モデルを使用して多段階の翻訳を行っています。" }, { question: "1つの動画の処理にどのくらい時間がかかりますか?", answer: "処理時間は動画の長さと選択したサービスによって異なります。通常、60分の動画の翻訳と吹き替えには約40分かかります。" }, { question: "料金はどうなっていますか?", answer: "VideoLingoはオープンソースプロジェクトで、すでにGitHubで3k以上のスターを獲得しています。商用版の発表が間もなく予定されており、さらに多くの機能が追加される予定です。" }, ] } } return } ================================================ FILE: docs/pages/index.zh-CN.mdx ================================================ --- title: VideoLingo --- import Landing from '@/components/landing' export const getStaticProps = async ({ params }) => { const [repo, stargazers] = await Promise.all([ fetch(`https://api.github.com/repos/Huanshere/VideoLingo`).then(res => res.json()), fetch(`https://api.github.com/repos/Huanshere/VideoLingo/contributors?per_page=16`).then(res => res.json()) ]); return { props: { ssg: { stars: repo.stargazers_count !== undefined ? repo.stargazers_count : null, recentStargazers: stargazers } }, revalidate: 60 }; } export default function Component() { const landingData = { hero: { title: "VideoLingo: 连接世界的每一帧", description: "Netflix级字幕切割、翻译、对齐、甚至加上配音,一键全自动视频搬运AI字幕组", videoSrc: "/videos/demo.mp4" }, features: { title: "强大功能,释放创意", items: [ { title: '智能字幕分割', description: '使用 NLP 和 LLM 技术,根据句意精确分割字幕,确保每一句话都恰到好处。', icon: 'CheckCircle', }, { title: '上下文感知翻译', description: 'GPT 总结提取术语知识库,实现上下文连贯翻译,让每一句翻译都自然流畅。', icon: 'ArrowRight', }, { title: '三步翻译过程', description: '直接翻译 - 反思 - 意译,多重保障,媲美字幕组精翻效果。', icon: 'CheckCircle', }, { title: '精确字幕对齐', description: '使用 WhisperX 进行单词级时间轴字幕识别,让每一个字都准确同步。', icon: 'ArrowRight', }, { title: '高质量配音', description: '支持多种 TTS 方案,包括 GPT-SoVITS 技术的高质量个性化配音,让视频更具魅力。', icon: 'CheckCircle', }, { title: '开发者友好', description: '结构化文件设计,方便开发者自定义和扩展功能。支持多种部署方式。', icon: 'ArrowRight', }, ] }, comments: { title: "他们都在用 VideoLingo", items: [ { content: "之前要弄一整天,现在一个小时就弄完了!", author: "k", title: "B站30w粉up" }, { content: "这个配音比我说得还准,我一下就有好多好玩的想法了🤩", author: "阿标", title: 
"小红书10w粉粤语up" }, { content: "下班单纯发着玩,没想到一下就爆了😂", author: "X", title: "抖音日涨7k粉up" } ] }, faq: { title: "常见问题", items: [ { question: "翻译的质量如何?", description: "我们严格参照 Netflix 字幕标准,使用最先进的 Claude 3.5 模型进行多步骤翻译。" }, { question: "处理一个视频需要多长时间?", answer: "处理时间取决于视频的长度和所选的服务。通常,一个 60 分钟的视频完成翻译和配音大约需要 40 分钟。" }, { question: "如何收费呢?", answer: "VideoLingo是开源项目,已经在Github上获得3k+ stars,即将推出商业版,会带来更多功能~" }, ] } } return } ================================================ FILE: docs/postcss.config.js ================================================ module.exports = { plugins: { tailwindcss: {}, autoprefixer: {}, }, } ================================================ FILE: docs/public/site.webmanifest ================================================ { "name": "MyWebSite", "short_name": "MySite", "icons": [ { "src": "/web-app-manifest-192x192.png", "sizes": "192x192", "type": "image/png", "purpose": "maskable" }, { "src": "/web-app-manifest-512x512.png", "sizes": "512x512", "type": "image/png", "purpose": "maskable" } ], "theme_color": "#ffffff", "background_color": "#ffffff", "display": "standalone" } ================================================ FILE: docs/tailwind.config.js ================================================ /** @type {import('tailwindcss').Config} */ module.exports = { darkMode: ["class"], content: [ './pages/**/*.{js,jsx,ts,tsx,md,mdx}', './components/**/*.{js,jsx,ts,tsx,md,mdx}', // Or if using `src` directory: './src/**/*.{js,jsx,ts,tsx,md,mdx}' ], prefix: "", theme: { container: { center: 'true', padding: '2rem', screens: { '2xl': '1400px' } }, extend: { colors: { border: 'hsl(var(--border))', input: 'hsl(var(--input))', ring: 'hsl(var(--ring))', background: 'hsl(var(--background))', foreground: 'hsl(var(--foreground))', primary: { DEFAULT: 'hsl(var(--primary))', foreground: 'hsl(var(--primary-foreground))' }, secondary: { DEFAULT: 'hsl(var(--secondary))', foreground: 'hsl(var(--secondary-foreground))' }, destructive: { DEFAULT: 'hsl(var(--destructive))', 
foreground: 'hsl(var(--destructive-foreground))' }, muted: { DEFAULT: 'hsl(var(--muted))', foreground: 'hsl(var(--muted-foreground))' }, accent: { DEFAULT: 'hsl(var(--accent))', foreground: 'hsl(var(--accent-foreground))' }, popover: { DEFAULT: 'hsl(var(--popover))', foreground: 'hsl(var(--popover-foreground))' }, card: { DEFAULT: 'hsl(var(--card))', foreground: 'hsl(var(--card-foreground))' }, 'color-1': 'hsl(var(--color-1))', 'color-2': 'hsl(var(--color-2))', 'color-3': 'hsl(var(--color-3))', 'color-4': 'hsl(var(--color-4))', 'color-5': 'hsl(var(--color-5))', 'color-1': 'hsl(var(--color-1))', 'color-2': 'hsl(var(--color-2))', 'color-3': 'hsl(var(--color-3))', 'color-4': 'hsl(var(--color-4))', 'color-5': 'hsl(var(--color-5))' }, borderRadius: { lg: 'var(--radius)', md: 'calc(var(--radius) - 2px)', sm: 'calc(var(--radius) - 4px)' }, keyframes: { 'accordion-down': { from: { height: '0' }, to: { height: 'var(--radix-accordion-content-height)' } }, 'accordion-up': { from: { height: 'var(--radix-accordion-content-height)' }, to: { height: '0' } }, rainbow: { '0%': { 'background-position': '0%' }, '100%': { 'background-position': '200%' }, '0%': { 'background-position': '0%' }, '100%': { 'background-position': '200%' } } }, animation: { 'accordion-down': 'accordion-down 0.2s ease-out', 'accordion-up': 'accordion-up 0.2s ease-out', rainbow: 'rainbow var(--speed, 2s) infinite linear' } } }, plugins: [require("tailwindcss-animate")], } ================================================ FILE: docs/theme.config.jsx ================================================ import { useRouter } from 'next/router' import { useConfig } from 'nextra-theme-docs' const title = 'VideoLingo' export default { logo: {title}, project: { link: 'https://github.com/Huanshere/VideoLingo', }, footer: { text: {new Date().getFullYear()} © {title}., }, i18n: [ { locale: 'en-US', text: 'English' }, { locale: 'zh-CN', text: '中文' }, { locale: 'ja', text: '日本語' }, ], head: () => { const { asPath, 
defaultLocale, locale } = useRouter() const { frontMatter } = useConfig() const url = 'https://videolingo.io' + (defaultLocale === locale ? asPath : `/${locale}${asPath}`) return ( <> ) }, useNextSeoProps() { const { asPath } = useRouter() if (asPath !== '/') { return { titleTemplate: `%s | ${title}`, } } }, } ================================================ FILE: docs/tsconfig.json ================================================ { "compilerOptions": { "target": "es5", "lib": ["dom", "dom.iterable", "esnext"], "allowJs": true, "skipLibCheck": true, "strict": false, "forceConsistentCasingInFileNames": true, "noEmit": true, "incremental": true, "esModuleInterop": true, "module": "esnext", "moduleResolution": "node", "resolveJsonModule": true, "isolatedModules": true, "jsx": "preserve", "baseUrl": ".", "paths": { "@/*": ["./*"] }, "plugins": [ { "name": "next" } ], "strictNullChecks": true }, "include": [ "next-env.d.ts", "types.d.ts", "**/*.ts", "**/*.tsx", "tailwind.config.js", "pages/_meta.js", ".next/types/**/*.ts" ], "exclude": ["node_modules", "components/magicui", "components/ui"] } ================================================ FILE: install.py ================================================ import os, sys import platform import subprocess sys.path.append(os.path.dirname(os.path.abspath(__file__))) ascii_logo = """ __ ___ _ _ _ \ \ / (_) __| | ___ ___ | | (_)_ __ __ _ ___ \ \ / /| |/ _` |/ _ \/ _ \| | | | '_ \ / _` |/ _ \ \ V / | | (_| | __/ (_) | |___| | | | | (_| | (_) | \_/ |_|\__,_|\___|\___/|_____|_|_| |_|\__, |\___/ |___/ """ def install_package(*packages): subprocess.check_call([sys.executable, "-m", "pip", "install", *packages]) def check_nvidia_gpu(): install_package("nvidia-ml-py") import pynvml from translations.translations import translate as t initialized = False try: pynvml.nvmlInit() initialized = True device_count = pynvml.nvmlDeviceGetCount() if device_count > 0: print(t("Detected NVIDIA GPU(s)")) for i in range(device_count): handle = 
pynvml.nvmlDeviceGetHandleByIndex(i) name = pynvml.nvmlDeviceGetName(handle) print(f"GPU {i}: {name}") return True else: print(t("No NVIDIA GPU detected")) return False except pynvml.NVMLError: print(t("No NVIDIA GPU detected or NVIDIA drivers not properly installed")) return False finally: if initialized: pynvml.nvmlShutdown() def check_ffmpeg(): from rich.console import Console from rich.panel import Panel from translations.translations import translate as t console = Console() try: # Check if ffmpeg is installed subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) console.print(Panel(t("✅ FFmpeg is already installed"), style="green")) except (subprocess.CalledProcessError, FileNotFoundError): system = platform.system() install_cmd = "" if system == "Windows": install_cmd = "choco install ffmpeg" extra_note = t("Install Chocolatey first (https://chocolatey.org/)") elif system == "Darwin": install_cmd = "brew install ffmpeg" extra_note = t("Install Homebrew first (https://brew.sh/)") elif system == "Linux": install_cmd = "sudo apt install ffmpeg # Ubuntu/Debian\nsudo yum install ffmpeg # CentOS/RHEL" extra_note = t("Use your distribution's package manager") console.print(Panel.fit( t("❌ FFmpeg not found\n\n") + f"{t('🛠️ Install using:')}\n[bold cyan]{install_cmd}[/bold cyan]\n\n" + f"{t('💡 Note:')}\n{extra_note}\n\n" + f"{t('🔄 After installing FFmpeg, please run this installer again:')}\n[bold cyan]python install.py[/bold cyan]", style="red" )) raise SystemExit(t("FFmpeg is required. 
Please install it and run the installer again.")) # Warn if ffmpeg lacks libmp3lame (common with conda-forge builds) try: result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10) if 'libmp3lame' not in result.stdout: console.print(Panel.fit( "⚠️ Your ffmpeg does not include [bold]libmp3lame[/bold] (MP3 encoder).\n" "This is common with conda-forge ffmpeg builds.\n\n" "VideoLingo will fall back to WAV encoding automatically, but for\n" "smaller intermediate files, consider installing a full ffmpeg:\n\n" "[bold cyan]" + ( "winget install Gyan.FFmpeg" if platform.system() == "Windows" else "brew install ffmpeg" if platform.system() == "Darwin" else "sudo apt install ffmpeg" ) + "[/bold cyan]", style="yellow" )) except Exception: pass def _detect_cuda_version_from_smi(): """Detect CUDA version from nvidia-smi output (driver's CUDA capability).""" import re try: result = subprocess.run( ["nvidia-smi"], capture_output=True, text=True, timeout=10 ) m = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", result.stdout) if m: return (int(m.group(1)), int(m.group(2))) except Exception: pass return None def _detect_cuda_index(): """Detect the CUDA version and return the best PyTorch wheel index URL. Falls back to cu126 when detection fails. For RTX 50 series (Blackwell architecture, compute capability 10.0+), we need PyTorch wheels compiled with CUDA 12.8+ that include sm_100 kernels. We prefer nvidia-smi (driver CUDA version) over nvcc (toolkit version) because: - Driver version determines what CUDA features the GPU can run at runtime - Toolkit version is for compilation, not runtime compatibility - Blackwell GPUs need cu129+ wheels even if user has older CUDA toolkit installed """ cuda_version = _detect_cuda_version_from_smi() # Map CUDA major.minor to PyTorch wheel index. # For CUDA 13.x (RTX 50 series / Blackwell), use cu129 which includes sm_100 kernels. 
INDEX = "https://download.pytorch.org/whl" CU_TAGS = [ ((13, 0), "cu129"), # CUDA 13.x (Blackwell / RTX 50 series) ((12, 9), "cu129"), # CUDA 12.9+ ((12, 8), "cu128"), # CUDA 12.8+ ((12, 6), "cu126"), # CUDA 12.6+ ] if cuda_version: for min_ver, tag in CU_TAGS: if cuda_version >= min_ver: return f"{INDEX}/{tag}" # Default: cu126 is the broadest CUDA 12 index for PyTorch 2.8 return f"{INDEX}/cu126" def main(): install_package("requests", "rich", "ruamel.yaml", "InquirerPy") from rich.console import Console from rich.panel import Panel from rich.box import DOUBLE from InquirerPy import inquirer from translations.translations import translate as t from translations.translations import DISPLAY_LANGUAGES from core.utils.config_utils import load_key, update_key from core.utils.decorator import except_handler console = Console() width = max(len(line) for line in ascii_logo.splitlines()) + 4 welcome_panel = Panel( ascii_logo, width=width, box=DOUBLE, title="[bold green]🌏[/bold green]", border_style="bright_blue" ) console.print(welcome_panel) # Language selection current_language = load_key("display_language") # Find the display name for current language code current_display = next((k for k, v in DISPLAY_LANGUAGES.items() if v == current_language), "🇬🇧 English") selected_language = DISPLAY_LANGUAGES[inquirer.select( message="Select language / 选择语言 / 選擇語言 / 言語を選択 / Seleccionar idioma / Sélectionner la langue / Выберите язык:", choices=list(DISPLAY_LANGUAGES.keys()), default=current_display ).execute()] update_key("display_language", selected_language) console.print(Panel.fit(t("🚀 Starting Installation"), style="bold magenta")) # Configure mirrors # add a check to ask user if they want to configure mirrors if inquirer.confirm( message=t("Do you need to auto-configure PyPI mirrors? 
def main():
    """Run the interactive VideoLingo installer end to end.

    Steps, in order: install the installer's own dependencies, ask for the
    display language, optionally configure a PyPI mirror, install PyTorch
    (CUDA wheels when an NVIDIA GPU is detected, default wheels otherwise),
    install Noto fonts on Linux, install the project, verify FFmpeg and
    finally start the Streamlit app.

    Raises:
        subprocess.CalledProcessError: If a pip invocation made directly by
            this function fails.
        SystemExit: Propagated from ``check_ffmpeg()`` when FFmpeg is missing.
    """
    install_package("requests", "rich", "ruamel.yaml", "InquirerPy")
    # These imports only resolve after the install_package() call above.
    from rich.console import Console
    from rich.panel import Panel
    from rich.box import DOUBLE
    from InquirerPy import inquirer
    from translations.translations import translate as t
    from translations.translations import DISPLAY_LANGUAGES
    from core.utils.config_utils import load_key, update_key
    from core.utils.decorator import except_handler

    console = Console()

    width = max(len(line) for line in ascii_logo.splitlines()) + 4
    welcome_panel = Panel(
        ascii_logo,
        width=width,
        box=DOUBLE,
        title="[bold green]🌏[/bold green]",
        border_style="bright_blue"
    )
    console.print(welcome_panel)

    # Language selection
    current_language = load_key("display_language")
    # Find the display name for current language code
    current_display = next((k for k, v in DISPLAY_LANGUAGES.items() if v == current_language), "🇬🇧 English")
    selected_language = DISPLAY_LANGUAGES[inquirer.select(
        message="Select language / 选择语言 / 選擇語言 / 言語を選択 / Seleccionar idioma / Sélectionner la langue / Выберите язык:",
        choices=list(DISPLAY_LANGUAGES.keys()),
        default=current_display
    ).execute()]
    update_key("display_language", selected_language)

    console.print(Panel.fit(t("🚀 Starting Installation"), style="bold magenta"))

    # Configure mirrors (only when the user asks for it)
    if inquirer.confirm(
        message=t("Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)"),
        default=True
    ).execute():
        from core.utils.pypi_autochoose import main as choose_mirror
        choose_mirror()

    # Detect system and GPU; the NVML probe is skipped entirely on macOS.
    has_gpu = platform.system() != 'Darwin' and check_nvidia_gpu()
    if has_gpu:
        console.print(Panel(t("🎮 NVIDIA GPU detected, installing CUDA version of PyTorch..."), style="cyan"))
        cuda_index = _detect_cuda_index()
        console.print(f"[cyan]📦 Using PyTorch index:[/cyan] {cuda_index}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchaudio==2.8.0", "--index-url", cuda_index])
    else:
        system_name = "🍎 MacOS" if platform.system() == 'Darwin' else "💻 No NVIDIA GPU"
        # NOTE(review): the message is interpolated *before* being passed to
        # t(); whether the translation lookup can match it depends on how
        # translate() resolves keys - confirm against translations.py.
        console.print(Panel(t(f"{system_name} detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription."), style="cyan"))
        subprocess.check_call([sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchaudio==2.8.0"])

    @except_handler("Failed to install project")
    def install_requirements():
        # Install demucs separately with --no-deps to avoid its outdated
        # torchaudio<2.2 constraint conflicting with the torchaudio version
        # pinned above (2.8.0), which whisperx needs.
        console.print(Panel(t("Installing demucs (--no-deps to avoid torchaudio conflict)..."), style="cyan"))
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-deps", "demucs[dev]@git+https://github.com/adefossez/demucs"])
        # demucs --no-deps skips its own dependencies; install the ones it
        # actually needs at runtime that aren't already pulled in elsewhere.
        console.print(Panel(t("Installing demucs runtime dependencies..."), style="cyan"))
        subprocess.check_call([sys.executable, "-m", "pip", "install", "dora-search", "openunmix", "lameenc"])
        console.print(Panel(t("Installing project in editable mode using `pip install -e .`"), style="cyan"))
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-e", "."],
            env={**os.environ, "PIP_NO_CACHE_DIR": "0", "PYTHONIOENCODING": "utf-8"}
        )

    @except_handler("Failed to install Noto fonts")
    def install_noto_font():
        # Detect Linux distribution type from its marker file.
        if os.path.exists('/etc/debian_version'):
            # Debian/Ubuntu systems
            cmd = ['sudo', 'apt-get', 'install', '-y', 'fonts-noto']
            pkg_manager = "apt-get"
        elif os.path.exists('/etc/redhat-release'):
            # RHEL/CentOS/Fedora systems
            cmd = ['sudo', 'yum', 'install', '-y', 'google-noto*']
            pkg_manager = "yum"
        else:
            console.print("Warning: Unrecognized Linux distribution, please install Noto fonts manually", style="yellow")
            return
        subprocess.run(cmd, check=True)
        console.print(f"✅ Successfully installed Noto fonts using {pkg_manager}", style="green")

    if platform.system() == 'Linux':
        install_noto_font()

    install_requirements()
    check_ffmpeg()

    # First panel with installation complete and startup command
    panel1_text = (
        t("Installation completed") + "\n\n" +
        t("Now I will run this command to start the application:") + "\n" +
        "[bold]streamlit run st.py[/bold]\n" +
        t("Note: First startup may take up to 1 minute")
    )
    console.print(Panel(panel1_text, style="bold green"))

    # Second panel with troubleshooting tips
    panel2_text = (
        t("If the application fails to start:") + "\n" +
        "1. " + t("Check your network connection") + "\n" +
        "2. " + t("Re-run the installer: [bold]python install.py[/bold]")
    )
    console.print(Panel(panel2_text, style="yellow"))

    # Start the application with the same interpreter that just installed
    # the packages, instead of whichever `streamlit` happens to be first on
    # PATH (consistent with the `sys.executable -m pip` calls above).
    subprocess.Popen([sys.executable, "-m", "streamlit", "run", "st.py"])


if __name__ == "__main__":
    main()
def log(msg):
    """Append ``msg`` to the startup log file, prefixed with the current time."""
    line = f"[{datetime.now():%H:%M:%S}] {msg}"
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def check_package(name, import_name=None):
    """Try to import a package and report its version.

    Args:
        name: Distribution name, used as the import name when
            ``import_name`` is not given.
        import_name: Module name to import, if it differs from ``name``.

    Returns:
        The module's ``__version__`` attribute, ``"ok"`` when the module has
        no such attribute, or ``None`` when the import fails.
    """
    import_name = import_name or name
    try:
        mod = __import__(import_name)
    except ImportError:
        return None
    except Exception as exc:
        # A broken install can fail with something other than ImportError
        # (e.g. OSError while loading native libraries). A pre-flight check
        # should report that, not die with a traceback.
        log(f"[WARN] importing {import_name} failed: {exc!r}")
        return None
    return getattr(mod, "__version__", "ok")


def main():
    """Run pre-flight checks, then launch the Streamlit app.

    Errors (missing required packages, missing ffmpeg) abort with exit
    status 1. Warnings (no CUDA, missing torch/whisperx, busy port) are
    printed but do not block the launch. Everything is also written to the
    startup log.
    """
    errors = []
    warnings = []

    # Python
    log(f"Python: {sys.version.split()[0]} ({sys.executable})")

    # Packages
    for pkg, imp in [("streamlit", None), ("json_repair", "json_repair")]:
        if not check_package(pkg, imp):
            errors.append(f"{pkg} not installed. Run: python install.py")

    # torch + CUDA
    torch_ver = check_package("torch")
    if torch_ver:
        import torch
        if torch.cuda.is_available():
            log(f"torch: {torch_ver}, cuda: {torch.version.cuda}, gpu: {torch.cuda.get_device_name(0)}")
        else:
            warnings.append("torch has no CUDA support. GPU disabled. Reinstall: python install.py")
            log(f"torch: {torch_ver} (CPU only)")
    else:
        # Previously a missing torch produced no message at all.
        warnings.append("torch not importable. Reinstall: python install.py")

    if not check_package("whisperx"):
        warnings.append("whisperx not installed. ASR will fail.")

    # ffmpeg - suggest the package manager that matches the current OS.
    if not shutil.which("ffmpeg"):
        hints = {"win32": "choco install ffmpeg", "darwin": "brew install ffmpeg"}
        hint = hints.get(sys.platform, "sudo apt install ffmpeg")
        errors.append(f"ffmpeg not found in PATH. Install: {hint}")

    # Port: a successful connect means something is already listening.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        if s.connect_ex(("127.0.0.1", 8501)) == 0:
            warnings.append("Port 8501 in use. Close other app or use --server.port 8502")

    # Log everything
    for w in warnings:
        log(f"[WARN] {w}")
    for e in errors:
        log(f"[ERROR] {e}")

    # Show problems if any, otherwise stay quiet
    if errors:
        print()
        for e in errors:
            print(f"  [ERROR] {e}")
        print(f"\n  Fix errors above. Log: {LOG_FILE}\n")
        sys.exit(1)

    if warnings:
        print()
        for w in warnings:
            print(f"  [WARN] {w}")
        print()

    # Launch
    log("Launching Streamlit...")
    os.environ["PYTHONWARNINGS"] = "ignore"
    try:
        proc = subprocess.run(
            [sys.executable, "-m", "streamlit", "run", "st.py", "--logger.level", "error"],
            cwd=str(SCRIPT_DIR),
        )
        if proc.returncode != 0:
            log(f"Streamlit exited with code {proc.returncode}")
            print(f"\n  Streamlit crashed (code {proc.returncode}). See: {LOG_FILE}\n")
            sys.exit(proc.returncode)
    except KeyboardInterrupt:
        log("Stopped by user")


if __name__ == "__main__":
    main()
def text_processing_section():
    """Render section "b": subtitle translation and generation.

    Shows the list of pipeline steps. While ``SUB_VIDEO`` does not exist it
    offers a button that runs ``process_text()`` and reruns the app. Once
    the file exists it shows the result, a subtitle download button and an
    "archive" button that calls ``cleanup()``.

    Returns:
        ``True`` on the branch where ``SUB_VIDEO`` already exists; ``None``
        otherwise.
    """
    st.header(t("b. Translate and Generate Subtitles"))
    with st.container(border=True):
        # NOTE(review): rendered with unsafe_allow_html=True, yet no markup
        # is visible in this literal - it may have been stripped by the
        # tool that produced this dump. Confirm against the real file.
        st.markdown(f"""

{t("This stage includes the following steps:")}

1. {t("WhisperX word-level transcription")}
2. {t("Sentence segmentation using NLP and LLM")}
3. {t("Summarization and multi-step translation")}
4. {t("Cutting and aligning long subtitles")}
5. {t("Generating timeline and subtitles")}
6. {t("Merging subtitles into the video")} """, unsafe_allow_html=True)
        if not os.path.exists(SUB_VIDEO):
            if st.button(t("Start Processing Subtitles"), key="text_processing_button"):
                process_text()
                # Rerun so the "finished" branch below is rendered.
                st.rerun()
        else:
            # The video preview is only shown when subtitles were burned in.
            if load_key("burn_subtitles"):
                st.video(SUB_VIDEO)
            download_subtitle_zip_button(text=t("Download All Srt Files"))
            if st.button(t("Archive to 'history'"), key="cleanup_in_text_processing"):
                cleanup()
                st.rerun()
            return True
def process_text():
    """Run the subtitle pipeline (steps 2 through 7), one spinner per stage.

    Order matters: transcription, sentence splitting, summary and
    translation, subtitle splitting and timestamp alignment, then merging
    subtitles into the video.
    """
    with st.spinner(t("Using Whisper for transcription...")):
        _2_asr.transcribe()
    with st.spinner(t("Splitting long sentences...")):
        _3_1_split_nlp.split_by_spacy()
        _3_2_split_meaning.split_sentences_by_meaning()
    with st.spinner(t("Summarizing and translating...")):
        _4_1_summarize.get_summary()
        if load_key("pause_before_translate"):
            # Blocks on the terminal that started the app (stdin), not on
            # the browser UI, until the user presses ENTER.
            input(t("⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue..."))
        _4_2_translate.translate_all()
    with st.spinner(t("Processing and aligning subtitles...")):
        _5_split_sub.split_for_sub_main()
        _6_gen_sub.align_timestamp_main()
    with st.spinner(t("Merging subtitles to video...")):
        _7_sub_into_vid.merge_subtitles_to_video()
    st.success(t("Subtitle processing complete! 🎉"))
    st.balloons()


def audio_processing_section():
    """Render section "c": dubbing.

    Shows the list of dubbing steps. While ``DUB_VIDEO`` does not exist it
    offers a button that runs ``process_audio()`` and reruns the app. Once
    the file exists it shows the result plus buttons to delete the dubbing
    files or archive the run.
    """
    st.header(t("c. Dubbing"))
    with st.container(border=True):
        # NOTE(review): rendered with unsafe_allow_html=True, yet no markup
        # is visible in this literal - it may have been stripped by the
        # tool that produced this dump. Confirm against the real file.
        st.markdown(f"""

{t("This stage includes the following steps:")}

1. {t("Generate audio tasks and chunks")}
2. {t("Extract reference audio")}
3. {t("Generate and merge audio files")}
4. {t("Merge final audio into video")} """, unsafe_allow_html=True)
        if not os.path.exists(DUB_VIDEO):
            if st.button(t("Start Audio Processing"), key="audio_processing_button"):
                process_audio()
                # Rerun so the "finished" branch below is rendered.
                st.rerun()
        else:
            st.success(t("Audio processing is complete! You can check the audio files in the `output` folder."))
            if load_key("burn_subtitles"):
                st.video(DUB_VIDEO)
            if st.button(t("Delete dubbing files"), key="delete_dubbing_files"):
                delete_dubbing_files()
                st.rerun()
            if st.button(t("Archive to 'history'"), key="cleanup_in_audio_processing"):
                cleanup()
                st.rerun()
def process_audio():
    """Run the dubbing pipeline (steps 8 through 12), one spinner per stage.

    Order matters: audio tasks and chunks, reference audio extraction,
    audio generation, full-audio merge, then merging the dub into the video.
    """
    with st.spinner(t("Generate audio tasks")):
        _8_1_audio_task.gen_audio_task_main()
        _8_2_dub_chunks.gen_dub_chunks()
    with st.spinner(t("Extract refer audio")):
        _9_refer_audio.extract_refer_audio_main()
    with st.spinner(t("Generate all audio")):
        _10_gen_audio.gen_audio()
    with st.spinner(t("Merge full audio")):
        _11_merge_audio.merge_full_audio()
    with st.spinner(t("Merge dubbing to the video")):
        _12_dub_to_vid.merge_video_audio()
    st.success(t("Audio processing complete! 🎇"))
    st.balloons()


def main():
    """Build the Streamlit page: logo, welcome text, sidebar and sections."""
    # Two equal columns; only the first is used, so the logo takes half width.
    logo_col, _ = st.columns([1,1])
    with logo_col:
        st.image("docs/logo.png", width="stretch")
    st.markdown(button_style, unsafe_allow_html=True)
    welcome_text = t("Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!")
    # NOTE(review): the literal below is reproduced exactly as it appears in
    # this dump; its surrounding markup looks stripped (a single-quoted
    # f-string cannot span lines). Restore it from the real file.
    st.markdown(f"

{welcome_text}

", unsafe_allow_html=True)
    # add settings
    with st.sidebar:
        page_setting()
        st.markdown(give_star_button, unsafe_allow_html=True)
    download_video_section()
    text_processing_section()
    audio_processing_section()


if __name__ == "__main__":
    main()
VideoLingo Logo # Conectando el Mundo, Cuadro por Cuadro Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Descripción General ([¡Prueba VL Gratis!](https://videolingo.io)) VideoLingo es una herramienta todo en uno para traducción, localización y doblaje de videos, diseñada para generar subtítulos de calidad Netflix. Elimina las traducciones mecánicas y los subtítulos de múltiples líneas mientras agrega doblaje de alta calidad, permitiendo compartir conocimiento globalmente a través de las barreras del idioma. Características principales: - 🎥 Descarga de videos de YouTube mediante yt-dlp - **🎙️ Reconocimiento de subtítulos a nivel de palabra y baja ilusión con WhisperX** - **📝 Segmentación de subtítulos impulsada por NLP e IA** - **📚 Terminología personalizada + generada por IA para una traducción coherente** - **🔄 Proceso de 3 pasos Traducción-Reflexión-Adaptación para calidad cinematográfica** - **✅ Solo subtítulos de una línea, estándar Netflix** - **🗣️ Doblaje con GPT-SoVITS, Azure, OpenAI y más** - 🚀 Inicio y procesamiento con un clic en Streamlit - 🌍 Soporte multilingüe en la interfaz de Streamlit - 📝 Registro detallado con reanudación de progreso Diferencia con proyectos similares: **Solo subtítulos de una línea, calidad superior de traducción, experiencia de doblaje perfecta** ## 🎥 Demo
### Subtítulos Duales --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Clonación de Voz Cosy2 --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS con mi voz --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Soporte de Idiomas **Soporte de idiomas de entrada (más por venir):** 🇺🇸 Inglés 🤩 | 🇷🇺 Ruso 😊 | 🇫🇷 Francés 🤩 | 🇩🇪 Alemán 🤩 | 🇮🇹 Italiano 🤩 | 🇪🇸 Español 🤩 | 🇯🇵 Japonés 😐 | 🇨🇳 Chino* 😊 > *El chino utiliza un modelo whisper mejorado con puntuación por ahora... **La traducción admite todos los idiomas, mientras que el idioma del doblaje depende del método TTS elegido.** ## Instalación ¿Tienes algún problema? Chatea con nuestro agente de IA en línea gratuito [**aquí**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) para ayudarte. > **Nota:** Para usuarios de Windows con GPU NVIDIA, sigue estos pasos antes de la instalación: > 1. Instala [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. Instala [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Agrega `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` a tu PATH del sistema > 4. Reinicia tu computadora > **Nota:** Se requiere FFmpeg. Por favor, instálalo a través de gestores de paquetes: > - Windows: ```choco install ffmpeg``` (vía [Chocolatey](https://chocolatey.org/)) > - macOS: ```brew install ffmpeg``` (vía [Homebrew](https://brew.sh/)) > - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu) 1. Clona el repositorio ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Instala las dependencias (requiere `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. Inicia la aplicación ```bash streamlit run st.py ``` ### Docker Alternativamente, puedes usar Docker (requiere CUDA 12.4 y versión del controlador NVIDIA >550), consulta la [documentación de Docker](/docs/pages/docs/docker.en-US.md): ```bash docker build -t videolingo . 
docker run -d -p 8501:8501 --gpus all videolingo ``` ## APIs VideoLingo admite formato de API similar a OpenAI y varias interfaces TTS: - LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (ordenados por rendimiento, ten cuidado con gemini-2.5-flash...) - WhisperX: Ejecuta whisperX localmente o usa la API de 302.ai - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(¡Puedes modificar tu propio TTS en custom_tts.py!) > **Nota:** VideoLingo funciona con **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - una clave API para todos los servicios (LLM, WhisperX, TTS). ¡O ejecútalo localmente con Ollama y Edge-TTS gratis, sin necesidad de API! Para instrucciones detalladas de instalación, configuración de API y modo por lotes, consulta la documentación: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## Limitaciones Actuales 1. El rendimiento de transcripción de WhisperX puede verse afectado por el ruido de fondo del video, ya que utiliza el modelo wav2vec para la alineación. Para videos con música de fondo fuerte, activa la Mejora de Separación de Voz. Además, los subtítulos que terminan con números o caracteres especiales pueden truncarse temprano debido a la incapacidad de wav2vec para mapear caracteres numéricos (por ejemplo, "1") a su forma hablada ("uno"). 2. El uso de modelos más débiles puede provocar errores durante los procesos intermedios debido a los estrictos requisitos de formato JSON para las respuestas. Si ocurre este error, elimina la carpeta `output` y vuelve a intentarlo con un LLM diferente, de lo contrario, la ejecución repetida leerá la respuesta errónea anterior causando el mismo error. 3. La función de doblaje puede no ser 100% perfecta debido a las diferencias en las velocidades de habla y entonación entre idiomas, así como al impacto del paso de traducción. 
Sin embargo, este proyecto ha implementado un extenso procesamiento de ingeniería para las velocidades de habla para garantizar los mejores resultados posibles de doblaje. 4. **El reconocimiento de transcripción de video multilingüe solo mantendrá el idioma principal**. Esto se debe a que whisperX utiliza un modelo especializado para un solo idioma al alinear forzosamente los subtítulos a nivel de palabra, y eliminará los idiomas no reconocidos. 5. **No se pueden doblar múltiples personajes por separado**, ya que la capacidad de distinción de hablantes de whisperX no es suficientemente confiable. ## 📄 Licencia Este proyecto está licenciado bajo la Licencia Apache 2.0. Un agradecimiento especial a los siguientes proyectos de código abierto por sus contribuciones: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 Contáctame - Envía [Issues](https://github.com/Huanshere/VideoLingo/issues) o [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) en GitHub - Envíame un DM en Twitter: [@Huanshere](https://twitter.com/Huanshere) - Envíame un correo a: team@videolingo.io ## ⭐ Historial de Estrellas [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

Si encuentras útil VideoLingo, ¡por favor dame una ⭐️!

================================================ FILE: translations/README.fr.md ================================================
VideoLingo Logo # Connecter le Monde, Image par Image Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Aperçu ([Essayez VL maintenant !](https://videolingo.io)) VideoLingo est un outil tout-en-un de traduction, de localisation et de doublage vidéo visant à générer des sous-titres de qualité Netflix. Il élimine les traductions automatiques rigides et les sous-titres multi-lignes tout en ajoutant un doublage de haute qualité, permettant le partage des connaissances à l'échelle mondiale au-delà des barrières linguistiques. Fonctionnalités principales : - 🎥 Téléchargement de vidéos YouTube via yt-dlp - **🎙️ Reconnaissance de sous-titres au niveau des mots et à faible taux d'hallucination avec WhisperX** - **📝 Segmentation des sous-titres basée sur le NLP et l'IA** - **📚 Terminologie personnalisée + générée par IA pour une traduction cohérente** - **🔄 Processus en 3 étapes : Traduction-Réflexion-Adaptation pour une qualité cinématographique** - **✅ Sous-titres uniquement sur une ligne, aux normes Netflix** - **🗣️ Doublage avec GPT-SoVITS, Azure, OpenAI et plus** - 🚀 Démarrage et traitement en un clic dans Streamlit - 🌍 Support multi-langues dans l'interface utilisateur Streamlit - 📝 Journalisation détaillée avec reprise de la progression Différence par rapport aux projets similaires : **Sous-titres sur une seule ligne uniquement, qualité de traduction supérieure, expérience de doublage transparente** ## 🎥 Démo
### Sous-titres Doubles --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Clonage Vocal Cosy2 --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS avec ma voix --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Support des langues **Support des langues d'entrée (d'autres à venir) :** 🇺🇸 Anglais 🤩 | 🇷🇺 Russe 😊 | 🇫🇷 Français 🤩 | 🇩🇪 Allemand 🤩 | 🇮🇹 Italien 🤩 | 🇪🇸 Espagnol 🤩 | 🇯🇵 Japonais 😐 | 🇨🇳 Chinois* 😊 > *Le chinois utilise un modèle whisper séparé amélioré par la ponctuation, pour l'instant... **La traduction prend en charge toutes les langues, tandis que la langue de doublage dépend de la méthode TTS choisie.** ## Installation Vous rencontrez un problème ? Discutez avec notre agent IA gratuit en ligne [**ici**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) pour vous aider. > **Note :** Pour les utilisateurs Windows avec un GPU NVIDIA, suivez ces étapes avant l'installation : > 1. Installez [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. Installez [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Ajoutez `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` à votre PATH système > 4. Redémarrez votre ordinateur > **Note :** FFmpeg est requis. Veuillez l'installer via les gestionnaires de paquets : > - Windows : ```choco install ffmpeg``` (via [Chocolatey](https://chocolatey.org/)) > - macOS : ```brew install ffmpeg``` (via [Homebrew](https://brew.sh/)) > - Linux : ```sudo apt install ffmpeg``` (Debian/Ubuntu) 1. Clonez le dépôt ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Installez les dépendances (nécessite `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. Démarrer l'application ```bash streamlit run st.py ``` ### Docker Alternativement, vous pouvez utiliser Docker (nécessite CUDA 12.4 et NVIDIA Driver version >550), voir [Documentation Docker](/docs/pages/docs/docker.en-US.md) : ```bash docker build -t videolingo . 
docker run -d -p 8501:8501 --gpus all videolingo ``` ## APIs VideoLingo prend en charge le format d'API OpenAI et diverses interfaces TTS : - LLM : `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (triés par performance, soyez prudent avec gemini-2.5-flash...) - WhisperX : Exécutez whisperX localement ou utilisez l'API 302.ai - TTS : `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(Vous pouvez modifier votre propre TTS dans custom_tts.py !) > **Note :** VideoLingo fonctionne avec **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - une seule clé API pour tous les services (LLM, WhisperX, TTS). Ou exécutez localement avec Ollama et Edge-TTS gratuitement, sans API nécessaire ! Pour des instructions détaillées sur l'installation, la configuration de l'API et le mode batch, veuillez consulter la documentation : [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## Limitations actuelles 1. Les performances de transcription de WhisperX peuvent être affectées par le bruit de fond de la vidéo, car il utilise le modèle wav2vec pour l'alignement. Pour les vidéos avec une musique de fond forte, veuillez activer l'amélioration de la séparation vocale. De plus, les sous-titres se terminant par des chiffres ou des caractères spéciaux peuvent être tronqués prématurément en raison de l'incapacité de wav2vec à mapper les caractères numériques (par exemple, "1") à leur forme parlée ("un"). 2. L'utilisation de modèles plus faibles peut entraîner des erreurs lors des processus intermédiaires en raison des exigences strictes de format JSON pour les réponses. Si cette erreur se produit, veuillez supprimer le dossier `output` et réessayer avec un LLM différent, sinon l'exécution répétée lira la réponse erronée précédente causant la même erreur. 3. 
La fonction de doublage peut ne pas être parfaite à 100% en raison des différences de débit et d'intonation entre les langues, ainsi que de l'impact de l'étape de traduction. Cependant, ce projet a mis en œuvre un traitement d'ingénierie extensif pour les débits de parole afin d'assurer les meilleurs résultats de doublage possibles. 4. **La reconnaissance de transcription vidéo multilingue ne conservera que la langue principale**. C'est parce que whisperX utilise un modèle spécialisé pour une seule langue lors de l'alignement forcé des sous-titres au niveau des mots, et supprimera les langues non reconnues. 5. **Impossible de doubler séparément plusieurs personnages**, car la capacité de distinction des locuteurs de whisperX n'est pas suffisamment fiable. ## 📄 Licence Ce projet est sous licence Apache 2.0. Remerciements spéciaux aux projets open source suivants pour leurs contributions : [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 Contactez-moi - Soumettez des [Issues](https://github.com/Huanshere/VideoLingo/issues) ou des [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) sur GitHub - Envoyez-moi un DM sur Twitter : [@Huanshere](https://twitter.com/Huanshere) - Envoyez-moi un email à : team@videolingo.io ## ⭐ Historique des étoiles [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

Si vous trouvez VideoLingo utile, donnez-moi une ⭐️ !

================================================ FILE: translations/README.ja.md ================================================
VideoLingo Logo # フレームごとに世界をつなぐ Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 概要 ([VLを試す!](https://videolingo.io)) VideoLingoは、Netflixクオリティの字幕を生成することを目的とした、オールインワンの動画翻訳、ローカライゼーション、吹き替えツールです。機械的な翻訳や複数行の字幕を排除し、高品質な吹き替えを追加することで、言語の壁を越えた世界的な知識共有を可能にします。 主な機能: - 🎥 yt-dlpによるYouTube動画のダウンロード - **🎙️ WhisperXによる単語レベルの低誤認識字幕認識** - **📝 NLPとAIを活用した字幕セグメンテーション** - **📚 一貫性のある翻訳のためのカスタム+AI生成用語** - **🔄 映画品質のための3ステップ(翻訳-反映-適応)プロセス** - **✅ Netflixスタンダードの1行字幕のみ** - **🗣️ GPT-SoVITS、Azure、OpenAIなどによる吹き替え** - 🚀 Streamlitでのワンクリック起動と処理 - 🌍 Streamlit UIの多言語サポート - 📝 進捗再開機能付きの詳細なログ記録 類似プロジェクトとの違い:**1行字幕のみ、優れた翻訳品質、シームレスな吹き替え体験** ## 🎥 デモ
### デュアル字幕 --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Cosy2 ボイスクローン --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS 吹き替え --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### 言語サポート **入力言語サポート(今後追加予定):** 🇺🇸 英語 🤩 | 🇷🇺 ロシア語 😊 | 🇫🇷 フランス語 🤩 | 🇩🇪 ドイツ語 🤩 | 🇮🇹 イタリア語 🤩 | 🇪🇸 スペイン語 🤩 | 🇯🇵 日本語 😐 | 🇨🇳 中国語* 😊 > *中国語は現在、句読点強化されたwhisperモデルを使用しています。 **翻訳はすべての言語に対応していますが、吹き替えの言語は選択したTTS方式によって異なります。** ## インストール 問題がありましたか?無料のオンラインAIエージェントと[**こちら**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh)でチャットして支援を受けられます。 > **注意:** NVIDIA GPUを搭載したWindowsユーザーは、インストール前に以下の手順を実行してください: > 1. [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)をインストール > 2. [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)をインストール > 3. `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6`をシステムPATHに追加 > 4. コンピュータを再起動 > **注意:** FFmpegが必要です。パッケージマネージャーを使用してインストールしてください: > - Windows: ```choco install ffmpeg``` ([Chocolatey](https://chocolatey.org/)経由) > - macOS: ```brew install ffmpeg``` ([Homebrew](https://brew.sh/)経由) > - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu) 1. リポジトリをクローン ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. 依存関係のインストール(`python=3.10`が必要) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. アプリケーションの起動 ```bash streamlit run st.py ``` ### Docker または、Docker(CUDA 12.4とNVIDIAドライバーバージョン>550が必要)を使用することもできます。[Dockerドキュメント](/docs/pages/docs/docker.en-US.md)を参照してください: ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## API VideoLingoはOpenAIライクなAPI形式と様々なTTSインターフェースをサポートしています: - LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (パフォーマンス順、gemini-2.5-flashには注意...) - WhisperX: ローカルでwhisperXを実行するか302.ai APIを使用 - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(custom_tts.pyで独自のTTSを修正可能!) 
> **注意:** VideoLingoは**[302.ai](https://gpt302.saaslink.net/C2oHR9)**と連携しています - すべてのサービス(LLM、WhisperX、TTS)に1つのAPIキーで対応。またはOllamaとEdge-TTSを使用してローカルで無料で実行可能で、APIは不要です! 詳細なインストール方法、API設定、バッチモードの説明については、ドキュメントを参照してください:[English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## 現在の制限事項 1. WhisperX文字起こしのパフォーマンスは、アライメントにwav2vecモデルを使用しているため、動画の背景ノイズの影響を受ける可能性があります。大きな背景音楽がある動画の場合は、音声分離強化を有効にしてください。また、wav2vecが数字文字(例:「1」)を発話形式(「one」)にマッピングできないため、数字や特殊文字で終わる字幕は早期に切り捨てられる可能性があります。 2. より弱いモデルを使用すると、レスポンスに厳密なJSON形式が要求されるため、中間プロセスでエラーが発生する可能性があります。このエラーが発生した場合は、`output`フォルダを削除して別のLLMで再試行してください。そうしないと、繰り返し実行時に前回の誤ったレスポンスを読み込んで同じエラーが発生します。 3. 吹き替え機能は、言語間の発話速度やイントネーションの違い、および翻訳ステップの影響により、100%完璧ではない可能性があります。ただし、このプロジェクトでは発話速度に関する広範なエンジニアリング処理を実装し、可能な限り最高の吹き替え結果を確保しています。 4. **多言語ビデオの文字起こし認識は主要言語のみを保持します**。これは、whisperXが単語レベルの字幕を強制的にアライメントする際に単一言語用の特殊モデルを使用し、認識されない言語を削除するためです。 5. **複数のキャラクターを個別に吹き替えることはできません**。これは、whisperXの話者区別機能が十分に信頼できないためです。 ## 📄 ライセンス このプロジェクトはApache 2.0ライセンスの下で提供されています。以下のオープンソースプロジェクトの貢献に特別な感謝を表します: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 お問い合わせ - GitHubで[Issues](https://github.com/Huanshere/VideoLingo/issues)や[Pull Requests](https://github.com/Huanshere/VideoLingo/pulls)を提出 - Twitter: [@Huanshere](https://twitter.com/Huanshere)でDM - メール: team@videolingo.io ## ⭐ スター履歴 [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

VideoLingoが役立つと感じた場合は、⭐️をお願いします!

================================================ FILE: translations/README.ru.md ================================================
VideoLingo Logo # Объединяя Мир, Кадр за Кадром Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Обзор ([Попробуйте VL бесплатно!](https://videolingo.io)) VideoLingo - это универсальный инструмент для перевода, локализации и дубляжа видео, направленный на создание субтитров качества Netflix. Он устраняет механические переводы и многострочные субтитры, добавляя высококачественный дубляж, что позволяет делиться знаниями по всему миру, преодолевая языковые барьеры. Ключевые особенности: - 🎥 Загрузка видео с YouTube через yt-dlp - **🎙️ Пословное распознавание субтитров с низким уровнем искажений с помощью WhisperX** - **📝 Сегментация субтитров на основе NLP и ИИ** - **📚 Пользовательская + ИИ-генерируемая терминология для согласованного перевода** - **🔄 3-этапный процесс Перевод-Осмысление-Адаптация для кинематографического качества** - **✅ Только однострочные субтитры стандарта Netflix** - **🗣️ Дубляж с помощью GPT-SoVITS, Azure, OpenAI и других** - 🚀 Запуск и обработка в один клик в Streamlit - 🌍 Многоязычная поддержка в интерфейсе Streamlit - 📝 Подробное логирование с возможностью возобновления прогресса Отличие от похожих проектов: **Только однострочные субтитры, превосходное качество перевода, безупречный опыт дубляжа** ## 🎥 Демонстрация
### Двойные Субтитры --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Клонирование Голоса Cosy2 --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS с моим голосом --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Поддержка языков **Поддержка входных языков (будет добавлено больше):** 🇺🇸 Английский 🤩 | 🇷🇺 Русский 😊 | 🇫🇷 Французский 🤩 | 🇩🇪 Немецкий 🤩 | 🇮🇹 Итальянский 🤩 | 🇪🇸 Испанский 🤩 | 🇯🇵 Японский 😐 | 🇨🇳 Китайский* 😊 > *Китайский пока использует отдельную модель whisper с улучшенной пунктуацией... **Перевод поддерживает все языки, в то время как язык дубляжа зависит от выбранного метода TTS.** ## Установка Возникли проблемы? Общайтесь с нашим бесплатным онлайн ИИ-агентом [**здесь**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh), который поможет вам. > **Примечание:** Для пользователей Windows с GPU NVIDIA выполните следующие шаги перед установкой: > 1. Установите [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. Установите [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. Добавьте `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` в системный PATH > 4. Перезагрузите компьютер > **Примечание:** Требуется FFmpeg. Установите его через менеджеры пакетов: > - Windows: ```choco install ffmpeg``` (через [Chocolatey](https://chocolatey.org/)) > - macOS: ```brew install ffmpeg``` (через [Homebrew](https://brew.sh/)) > - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu) 1. Клонируйте репозиторий ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. Установите зависимости (требуется `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. Запустите приложение ```bash streamlit run st.py ``` ### Docker Альтернативно, вы можете использовать Docker (требуется CUDA 12.4 и версия драйвера NVIDIA >550), см. [документацию Docker](/docs/pages/docs/docker.en-US.md): ```bash docker build -t videolingo . 
docker run -d -p 8501:8501 --gpus all videolingo ``` ## API VideoLingo поддерживает формат API, подобный OpenAI, и различные интерфейсы TTS: - LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (отсортировано по производительности, будьте осторожны с gemini-2.5-flash...) - WhisperX: Запускайте whisperX локально или используйте API 302.ai - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(Вы можете модифицировать свой собственный TTS в custom_tts.py!) > **Примечание:** VideoLingo работает с **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - один API-ключ для всех сервисов (LLM, WhisperX, TTS). Или запускайте локально с Ollama и Edge-TTS бесплатно, без необходимости в API! Для подробных инструкций по установке, настройке API и пакетному режиму обратитесь к документации: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## Текущие ограничения 1. Производительность транскрипции WhisperX может быть затронута фоновым шумом видео, так как для выравнивания используется модель wav2vec. Для видео с громкой фоновой музыкой включите Улучшение разделения голоса. Кроме того, субтитры, заканчивающиеся цифрами или специальными символами, могут быть обрезаны раньше из-за неспособности wav2vec сопоставлять цифровые символы (например, "1") с их произносимой формой ("один"). 2. Использование более слабых моделей может привести к ошибкам во время промежуточных процессов из-за строгих требований к формату JSON для ответов. Если возникает эта ошибка, удалите пап ================================================ FILE: translations/README.zh-TW.md ================================================
VideoLingo Logo # 連結世界,逐格前行 Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 概述 ([立即體驗 VL!](https://videolingo.io)) VideoLingo 是一個全方位的影片翻譯、本地化和配音工具,旨在生成 Netflix 品質的字幕。它消除了機器翻譯的生硬感和多行字幕,同時提供高品質配音,實現跨越語言障礙的全球知識共享。 主要功能: - 🎥 通過 yt-dlp 下載 YouTube 影片 - **🎙️ 使用 WhisperX 進行詞級別和低幻覺字幕識別** - **📝 基於 NLP 和 AI 的字幕分段** - **📚 自定義 + AI 生成術語庫確保翻譯一致性** - **🔄 三步驟翻譯-反思-調適實現影院級品質** - **✅ Netflix 標準,僅單行字幕** - **🗣️ 使用 GPT-SoVITS、Azure、OpenAI 等進行配音** - 🚀 在 Streamlit 中一鍵啟動和處理 - 🌍 Streamlit UI 多語言支持 - 📝 詳細日誌記錄和進度恢復 與類似項目的區別:**僅單行字幕、更優質的翻譯、無縫配音體驗** ## 🎥 演示
### 雙語字幕 --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Cosy2 聲音克隆 --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS 配音 --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### 語言支持 **輸入語言支持(更多語言即將推出):** 🇺🇸 英語 🤩 | 🇷🇺 俄語 😊 | 🇫🇷 法語 🤩 | 🇩🇪 德語 🤩 | 🇮🇹 義大利語 🤩 | 🇪🇸 西班牙語 🤩 | 🇯🇵 日語 😐 | 🇨🇳 中文* 😊 > *中文目前使用單獨的標點增強版 whisper 模型... **翻譯支持所有語言,配音語言則取決於所選的 TTS 方法。** ## 安裝 遇到任何問題?在[**這裡**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh)與我們的免費在線 AI 助手聊天以獲取幫助。 > **注意:** Windows 用戶如使用 NVIDIA GPU,請在安裝前執行以下步驟: > 1. 安裝 [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. 安裝 [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. 將 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加到系統 PATH > 4. 重啟電腦 > **注意:** 需要安裝 FFmpeg。請通過包管理器安裝: > - Windows:```choco install ffmpeg```(通過 [Chocolatey](https://chocolatey.org/)) > - macOS:```brew install ffmpeg```(通過 [Homebrew](https://brew.sh/)) > - Linux:```sudo apt install ffmpeg```(Debian/Ubuntu) 1. 克隆倉庫 ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. 安裝依賴(需要 `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. 啟動應用 ```bash streamlit run st.py ``` ### Docker 或者,您可以使用 Docker(需要 CUDA 12.4 和 NVIDIA 驅動版本 >550),參見 [Docker 文檔](/docs/pages/docs/docker.en-US.md): ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## APIs VideoLingo 支持 OpenAI 格式的 API 和各種 TTS 接口: - LLM:`claude-3-5-sonnet`、`gpt-4.1`、`deepseek-v3`、`gemini-2.0-flash`、...(按性能排序,使用 gemini-2.5-flash 時請謹慎...) - WhisperX:本地運行 whisperX 或使用 302.ai API - TTS:`azure-tts`、`openai-tts`、`siliconflow-fishtts`、**`fish-tts`**、`GPT-SoVITS`、`edge-tts`、`*custom-tts`(您可以在 custom_tts.py 中修改自己的 TTS!) > **注意:** VideoLingo 與 **[302.ai](https://gpt302.saaslink.net/C2oHR9)** 合作 - 一個 API 密鑰即可使用所有服務(LLM、WhisperX、TTS)。或者使用 Ollama 和 Edge-TTS 在本地免費運行,無需 API! 詳細安裝、API 配置和批處理模式說明,請參閱文檔:[English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md) ## 當前限制 1. 
WhisperX 轉錄性能可能受到視頻背景噪音影響,因為它使用 wav2vec 模型進行對齊。對於有大量背景音樂的視頻,請啟用語音分離增強。此外,由於 wav2vec 無法將數字字符(如"1")映射到其口語形式("one"),以數字或特殊字符結尾的字幕可能會提前截斷。 2. 使用較弱的模型可能會由於對響應的嚴格 JSON 格式要求而在中間過程中出錯。如果出現此錯誤,請刪除 `output` 文件夾並使用不同的 LLM 重試,否則重複執行將讀取先前的錯誤響應導致相同錯誤。 3. 由於語言之間的語速和語調差異,以及翻譯步驟的影響,配音功能可能無法 100% 完美。但是,本項目已經對語速進行了大量工程處理,以確保最佳的配音效果。 4. **多語言視頻轉錄識別將只保留主要語言**。這是因為 whisperX 在強制對齊詞級字幕時使用單一語言的專用模型,並會刪除無法識別的語言。 5. **無法分別為多個角色配音**,因為 whisperX 的說話人區分能力尚不夠可靠。 ## 📄 許可證 本項目採用 Apache 2.0 許可證。特別感謝以下開源項目的貢獻: [whisperX](https://github.com/m-bain/whisperX)、[yt-dlp](https://github.com/yt-dlp/yt-dlp)、[json_repair](https://github.com/mangiucugna/json_repair)、[BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 聯繫我 - 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) - 在 Twitter 上私信我:[@Huanshere](https://twitter.com/Huanshere) - 發送郵件至:team@videolingo.io ## ⭐ Star 歷史 [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ---

如果您覺得 VideoLingo 有幫助,請給我一個 ⭐️!

================================================ FILE: translations/README.zh.md ================================================
VideoLingo Logo # 连接世界每一帧 Huanshere%2FVideoLingo | Trendshift [**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md) **QQ群:875297969**
## 🌟 简介([在线体验!](https://videolingo.io)) VideoLingo 是一站式视频翻译本地化配音工具,能够一键生成 Netflix 级别的高质量字幕,告别生硬机翻,告别多行字幕,还能加上高质量的克隆配音,让全世界的知识能够跨越语言的障碍共享。 主要特点和功能: - 🎥 使用 yt-dlp 从 Youtube 链接下载视频 - **🎙️ 使用 WhisperX 进行单词级和低幻觉字幕识别** - **📝 使用 NLP 和 AI 进行字幕分割** - **📚 自定义 + AI 生成术语库,保证翻译连贯性** - **🔄 三步直译、反思、意译,实现影视级翻译质量** - **✅ 按照 Netflix 标准检查单行长度,绝无双行字幕** - **🗣️ 支持 GPT-SoVITS、Azure、OpenAI 等多种配音方案** - 🚀 一键启动,在 streamlit 中一键出片 - 🌍 多语言支持就绪的 streamlit UI - 📝 详细记录每步操作日志,支持随时中断和恢复进度 与同类项目相比的优势:**绝无多行字幕,最佳的翻译质量,无缝的配音体验** ## 🎥 演示
### 双语字幕 --- https://github.com/user-attachments/assets/a5c3d8d1-2b29-4ba9-b0d0-25896829d951 ### Cosy2 声音克隆 --- https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a ### GPT-SoVITS 配音 --- https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### 语言支持 **输入语言支持:** 🇺🇸 英语 🤩 | 🇷🇺 俄语 😊 | 🇫🇷 法语 🤩 | 🇩🇪 德语 🤩 | 🇮🇹 意大利语 🤩 | 🇪🇸 西班牙语 🤩 | 🇯🇵 日语 😐 | 🇨🇳 中文* 😊 > *中文使用单独的标点增强后的 whisper 模型 **翻译语言支持所有语言,配音语言取决于选取的TTS。** ## 安装 遇到问题?在[**这里**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh)与我们的免费在线AI助手交流获取帮助。 > **注意:** 在 Windows 上使用 NVIDIA GPU 加速需要先完成以下步骤: > 1. 安装 [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) > 2. 安装 [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe) > 3. 将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加到系统环境变量 PATH 中 > 4. 重启电脑 > **注意:** FFmpeg 是必需的,请通过包管理器安装: > - Windows:```choco install ffmpeg```(通过 [Chocolatey](https://chocolatey.org/)) > - macOS:```brew install ffmpeg```(通过 [Homebrew](https://brew.sh/)) > - Linux:```sudo apt install ffmpeg```(Debian/Ubuntu) 1. 克隆仓库 ```bash git clone https://github.com/Huanshere/VideoLingo.git cd VideoLingo ``` 2. 安装依赖(需要 `python=3.10`) ```bash conda create -n videolingo python=3.10.0 -y conda activate videolingo python install.py ``` 3. 启动应用 ```bash streamlit run st.py ``` ### Docker 还可以选择使用 Docker(要求 CUDA 12.4 和 NVIDIA Driver 版本 >550),详见[Docker文档](/docs/pages/docs/docker.zh-CN.md): ```bash docker build -t videolingo . docker run -d -p 8501:8501 --gpus all videolingo ``` ## API 本项目支持 OpenAI-Like 格式的 api 和多种配音接口: - LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ...(按效果排序,使用 gemini-2.5-flash 时需谨慎...) - WhisperX: 本地运行 WhisperX 或使用 302.ai API - TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(你可以在 custom_tts.py 中自定义 TTS!) > **注意:** VideoLingo 现已与 **[302.ai](https://gpt302.saaslink.net/C2oHR9)** 集成,**一个 API KEY** 即可同时支持 LLM、WhisperX 和 TTS!同时也支持完全本地部署,使用 Ollama 作为 LLM 和 Edge-TTS 作为配音,无需云端 API! 详细的安装、API 配置、批量说明可以参见文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md) ## 当前限制 1. 
WhisperX 转录效果可能受到视频背景声影响,因为使用了 wav2vec 模型进行对齐。对于背景音乐较大的视频,请开启人声分离增强。另外,如果字幕以数字或特殊符号结尾,可能会导致提前截断,这是因为 wav2vec 无法将数字字符(如"1")映射到其发音形式("one")。 2. 使用较弱模型时容易在中间过程报错,这是因为对响应的 json 格式要求较为严格。如果出现此错误,请删除 `output` 文件夹后更换 llm 重试,否则重复执行会读取上次错误的响应导致同样错误。 3. 配音功能由于不同语言的语速和语调差异,还受到翻译步骤的影响,可能不能 100% 完美,但本项目做了非常多的语速上的工程处理,尽可能保证配音效果。 4. **多语言视频转录识别仅仅只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,会因为不认识另一种语言而删去。 5. **无法多角色分别配音**,whisperX 的说话人区分效果不够好用。 ## 📄 许可证 本项目采用 Apache 2.0 许可证,衷心感谢以下开源项目的贡献: [whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE) ## 📬 联系 - 加入 QQ 群寻求解答:875297969 - 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) - 关注我的 Twitter:[@Huanshere](https://twitter.com/Huanshere) - 联系邮箱:team@videolingo.io ## ⭐ Star History [![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline) ================================================ FILE: translations/en.json ================================================ { "a. Download or Upload Video": "a. 
Download or Upload Video", "Delete and Reselect": "Delete and Reselect", "Enter YouTube link:": "Enter YouTube link:", "Resolution": "Resolution", "Download Video": "Download Video", "Or upload video": "Or upload video", "Youtube Settings": "Youtube Settings", "Cookies Path": "Cookies Path", "LLM Configuration": "LLM Configuration", "API_KEY": "API_KEY", "BASE_URL": "BASE_URL", "MODEL": "MODEL", "Openai format, will add /v1/chat/completions automatically": "Openai format, will add /v1/chat/completions automatically", "click to check API validity": "click to check API validity", "API Key is valid": "API Key is valid", "API Key is invalid": "API Key is invalid", "Recog Lang": "Recog Lang", "Subtitles Settings": "Subtitles Settings", "Target Lang": "Target Lang", "Input any language in natural language, as long as llm can understand": "Input any language in natural language, as long as llm can understand", "Vocal separation enhance": "Vocal separation enhance", "Burn-in Subtitles": "Burn-in Subtitles", "Whether to burn subtitles into the video, will increase processing time": "Whether to burn subtitles into the video, will increase processing time", "Video Resolution": "Video Resolution", "Recommended for videos with loud background noise, but will increase processing time": "Recommended for videos with loud background noise, but will increase processing time", "Dubbing Settings": "Dubbing Settings", "TTS Method": "TTS Method", "SiliconFlow API Key": "SiliconFlow API Key", "Mode Selection": "Mode Selection", "Preset": "Preset", "Refer_stable": "Refer_stable", "Refer_dynamic": "Refer_dynamic", "OpenAI Voice": "OpenAI Voice", "Fish TTS Character": "Fish TTS Character", "Azure Voice": "Azure Voice", "Please refer to Github homepage for GPT_SoVITS configuration": "Please refer to Github homepage for GPT_SoVITS configuration", "SoVITS Character": "SoVITS Character", "Refer Mode": "Refer Mode", "Mode 1: Use provided reference audio only": "Mode 1: Use provided reference 
audio only", "Mode 2: Use first audio from video as reference": "Mode 2: Use first audio from video as reference", "Mode 3: Use each audio from video as reference": "Mode 3: Use each audio from video as reference", "Configure reference audio mode for GPT-SoVITS": "Configure reference audio mode for GPT-SoVITS", "Edge TTS Voice": "Edge TTS Voice", "=====NOTE=====": "BELOW IS in st.py", "b. Translate and Generate Subtitles": "b. Translate and Generate Subtitles", "This stage includes the following steps:": "This stage includes the following steps:", "WhisperX word-level transcription": "WhisperX word-level transcription", "Sentence segmentation using NLP and LLM": "Sentence segmentation using NLP and LLM", "Summarization and multi-step translation": "Summarization and multi-step translation", "Cutting and aligning long subtitles": "Cutting and aligning long subtitles", "Generating timeline and subtitles": "Generating timeline and subtitles", "Merging subtitles into the video": "Merging subtitles into the video", "Start Processing Subtitles": "Start Processing Subtitles", "Download All Srt Files": "Download All Srt Files", "Archive to 'history'": "Archive to 'history'", "Using Whisper for transcription...": "Using Whisper for transcription...", "Splitting long sentences...": "Splitting long sentences...", "Summarizing and translating...": "Summarizing and translating...", "Processing and aligning subtitles...": "Processing and aligning subtitles...", "Merging subtitles to video...": "Merging subtitles to video...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...", "Subtitle processing complete! 🎉": "Subtitle processing complete! 🎉", "c. Dubbing": "c. 
Dubbing", "Generate audio tasks and chunks": "Generate audio tasks and chunks", "Extract reference audio": "Extract reference audio", "Generate and merge audio files": "Generate and merge audio files", "Merge final audio into video": "Merge final audio into video", "Start Audio Processing": "Start Audio Processing", "Audio processing is complete! You can check the audio files in the `output` folder.": "Audio processing is complete! You can check the audio files in the `output` folder.", "Delete dubbing files": "Delete dubbing files", "Generate audio tasks": "Generate audio tasks", "Extract refer audio": "Extract refer audio", "Generate all audio": "Generate all audio", "Merge full audio": "Merge full audio", "Merge dubbing to the video": "Merge dubbing to the video", "Audio processing complete! 🎇": "Audio processing complete! 🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!", "WhisperX Runtime": "WhisperX Runtime", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key", "WhisperX 302ai API": "WhisperX 302ai API", "=====NOTE2=====": "BELOW IS in install.py", "🚀 Starting Installation": "🚀 Starting Installation", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Do you need to auto-configure PyPI mirrors? 
(Recommended if you have difficulty accessing pypi.org)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.", "❌ Failed to install requirements:": "❌ Failed to install requirements:", "✅ FFmpeg is already installed": "✅ FFmpeg is already installed", "❌ FFmpeg not found\n\n": "❌ FFmpeg not found\n\n", "🛠️ Install using:": "🛠️ Install using:", "💡 Note:": "💡 Note:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 After installing FFmpeg, please run this installer again:", "Install Chocolatey first (https://chocolatey.org/)": "Install Chocolatey first (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "Install Homebrew first (https://brew.sh/)", "Use your distribution's package manager": "Use your distribution's package manager", "FFmpeg is required.
Please install it and run the installer again.", "Installing requirements using `pip install -r requirements.txt`": "Installing requirements using `pip install -r requirements.txt`", "Installation completed": "Installation completed", "Now I will run this command to start the application:": "Now I will run this command to start the application:", "Note: First startup may take up to 1 minute": "Note: First startup may take up to 1 minute", "If the application fails to start:": "If the application fails to start:", "Check your network connection": "Check your network connection", "Re-run the installer: [bold]python install.py[/bold]": "Re-run the installer: [bold]python install.py[/bold]", "Detected NVIDIA GPU(s)": "Detected NVIDIA GPU(s)", "No NVIDIA GPU detected": "No NVIDIA GPU detected", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "No NVIDIA GPU detected or NVIDIA drivers not properly installed", "LLM JSON Format Support": "LLM JSON Format Support", "Enable if your LLM supports JSON mode output": "Enable if your LLM supports JSON mode output" } ================================================ FILE: translations/es.json ================================================ { "a. Download or Upload Video": "a. 
Descargar o subir video", "Delete and Reselect": "Eliminar y volver a seleccionar", "Enter YouTube link:": "Ingrese el enlace de YouTube:", "Resolution": "Resolución", "Download Video": "Descargar video", "Or upload video": "O subir video", "Youtube Settings": "Configuración de Youtube", "Cookies Path": "Ruta del archivo de Cookies", "LLM Configuration": "Configuración de LLM", "API_KEY": "Clave API", "BASE_URL": "URL base", "MODEL": "Modelo", "Openai format, will add /v1/chat/completions automatically": "Formato OpenAI, se agregará /v1/chat/completions automáticamente", "click to check API validity": "haga clic para verificar la validez de la API", "API Key is valid": "La clave API es válida", "API Key is invalid": "La clave API no es válida", "Recog Lang": "Idioma de reconocimiento", "Subtitles Settings": "Configuración de subtítulos", "Target Lang": "Idioma objetivo", "Input any language in natural language, as long as llm can understand": "Ingrese cualquier idioma en lenguaje natural, siempre que LLM pueda entenderlo", "Vocal separation enhance": "Mejora de separación vocal", "Burn-in Subtitles": "Incrustar subtítulos", "Whether to burn subtitles into the video, will increase processing time": "Si se deben incrustar los subtítulos en el video, aumentará el tiempo de procesamiento", "Video Resolution": "Resolución de video", "Recommended for videos with loud background noise, but will increase processing time": "Recomendado para videos con ruido de fondo fuerte, pero aumentará el tiempo de procesamiento", "Dubbing Settings": "Configuración de doblaje", "TTS Method": "Método TTS", "SiliconFlow API Key": "Clave API de SiliconFlow", "Mode Selection": "Selección de modo", "Preset": "Preestablecido", "Refer_stable": "Referencia estable", "Refer_dynamic": "Referencia dinámica", "OpenAI Voice": "Voz de OpenAI", "Fish TTS Character": "Personaje Fish TTS", "Azure Voice": "Voz de Azure", "Please refer to Github homepage for GPT_SoVITS configuration": "Consulte la página 
principal de Github para la configuración de GPT_SoVITS", "SoVITS Character": "Personaje SoVITS", "Refer Mode": "Modo de referencia", "Mode 1: Use provided reference audio only": "Modo 1: Usar solo el audio de referencia proporcionado", "Mode 2: Use first audio from video as reference": "Modo 2: Usar el primer audio del video como referencia", "Mode 3: Use each audio from video as reference": "Modo 3: Usar cada audio del video como referencia", "Configure reference audio mode for GPT-SoVITS": "Configurar modo de audio de referencia para GPT-SoVITS", "Edge TTS Voice": "Voz Edge TTS", "=====NOTE=====": "Lo siguiente es el contenido de st.py", "b. Translate and Generate Subtitles": "b. Traducir y generar subtítulos", "This stage includes the following steps:": "Esta etapa incluye los siguientes pasos:", "WhisperX word-level transcription": "Transcripción a nivel de palabra WhisperX", "Sentence segmentation using NLP and LLM": "Segmentación de oraciones usando NLP y LLM", "Summarization and multi-step translation": "Resumen y traducción en múltiples pasos", "Cutting and aligning long subtitles": "Cortar y alinear subtítulos largos", "Generating timeline and subtitles": "Generar línea de tiempo y subtítulos", "Merging subtitles into the video": "Fusionar subtítulos en el video", "Start Processing Subtitles": "Comenzar procesamiento de subtítulos", "Download All Srt Files": "Descargar todos los archivos Srt", "Archive to 'history'": "Archivar en 'history'", "Using Whisper for transcription...": "Usando Whisper para transcripción...", "Splitting long sentences...": "Dividiendo oraciones largas...", "Summarizing and translating...": "Resumiendo y traduciendo...", "Processing and aligning subtitles...": "Procesando y alineando subtítulos...", "Merging subtitles to video...": "Fusionando subtítulos al video...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSA_ANTES_DE_TRADUCIR. 
Vaya a `output/log/terminology.json` para editar la terminología. Luego presione ENTER para continuar...", "Subtitle processing complete! 🎉": "¡Procesamiento de subtítulos completado! 🎉", "c. Dubbing": "c. Doblaje", "Generate audio tasks and chunks": "Generar tareas y fragmentos de audio", "Extract reference audio": "Extraer audio de referencia", "Generate and merge audio files": "Generar y fusionar archivos de audio", "Merge final audio into video": "Fusionar audio final en el video", "Start Audio Processing": "Comenzar procesamiento de audio", "Audio processing is complete! You can check the audio files in the `output` folder.": "¡El procesamiento de audio está completo! Puede verificar los archivos de audio en la carpeta `output`.", "Delete dubbing files": "Eliminar archivos de doblaje", "Generate audio tasks": "Generar tareas de audio", "Extract refer audio": "Extraer audio de referencia", "Generate all audio": "Generar todo el audio", "Merge full audio": "Fusionar audio completo", "Merge dubbing to the video": "Fusionar doblaje al video", "Audio processing complete! 🎇": "¡Procesamiento de audio completado! 🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Hola, bienvenido a VideoLingo. Si encuentra algún problema, no dude en obtener respuestas instantáneas con nuestro Agente de preguntas y respuestas gratuito aquí. 
¡También puede probar gratis nuestro sitio web SaaS en videolingo.io!", "WhisperX Runtime": "Entorno de WhisperX", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "El entorno local requiere GPU >8GB, el entorno en la nube requiere clave API 302ai, el entorno elevenlabs requiere clave API ElevenLabs", "WhisperX 302ai API": "API 302ai de WhisperX", "=====NOTE2=====": "A continuación está en install.py", "🚀 Starting Installation": "🚀 Iniciando instalación", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "¿Necesita configurar automáticamente los espejos PyPI? (Recomendado si tiene dificultades para acceder a pypi.org)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 GPU NVIDIA detectada, instalando versión CUDA de PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS detectado, instalando versión CPU de PyTorch... Nota: puede ser lento durante la transcripción de whisperX.", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 No se detectó GPU NVIDIA, instalando versión CPU de PyTorch... 
Nota: puede ser lento durante la transcripción de whisperX.", "❌ Failed to install requirements:": "❌ Error al instalar los requisitos:", "✅ FFmpeg is already installed": "✅ FFmpeg ya está instalado", "❌ FFmpeg not found\n\n": "❌ FFmpeg no encontrado\n\n", "🛠️ Install using:": "🛠️ Instalar usando:", "💡 Note:": "💡 Nota:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 Después de instalar FFmpeg, ejecute este instalador nuevamente:", "Install Chocolatey first (https://chocolatey.org/)": "Instale Chocolatey primero (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "Instale Homebrew primero (https://brew.sh/)", "Use your distribution's package manager": "Use el gestor de paquetes de su distribución", "FFmpeg is required. Please install it and run the installer again.": "Se requiere FFmpeg. Por favor instálelo y ejecute el instalador nuevamente.", "Installation completed": "Instalación completada", "Now I will run this command to start the application:": "Ahora ejecutaré este comando para iniciar la aplicación:", "Note: First startup may take up to 1 minute": "Nota: El primer inicio puede tardar hasta 1 minuto", "If the application fails to start:": "Si la aplicación no se inicia:", "Check your network connection": "Compruebe su conexión de red", "Re-run the installer: [bold]python install.py[/bold]": "Vuelva a ejecutar el instalador: [bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "Instalando dependencias usando `pip install -r requirements.txt`", "Detected NVIDIA GPU(s)": "GPU(s) NVIDIA detectada(s)", "No NVIDIA GPU detected": "No se detectó GPU NVIDIA", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "No se detectó GPU NVIDIA o los controladores NVIDIA no están instalados correctamente", "LLM JSON Format Support": "Soporte de formato JSON para LLM", "Enable if your LLM supports JSON mode output": "Activar si su LLM admite salida en modo JSON" } 
================================================ FILE: translations/fr.json ================================================ { "a. Download or Upload Video": "a. Télécharger ou importer une vidéo", "Delete and Reselect": "Supprimer et resélectionner", "Enter YouTube link:": "Entrez le lien YouTube :", "Resolution": "Résolution", "Download Video": "Télécharger la vidéo", "Or upload video": "Ou importer une vidéo", "Youtube Settings": "Paramètres Youtube", "Cookies Path": "Chemin du fichier Cookies", "LLM Configuration": "Configuration LLM", "API_KEY": "Clé API", "BASE_URL": "URL de base", "MODEL": "Modèle", "Openai format, will add /v1/chat/completions automatically": "Format OpenAI, /v1/chat/completions sera ajouté automatiquement", "click to check API validity": "Cliquez pour vérifier la validité de l'API", "API Key is valid": "La clé API est valide", "API Key is invalid": "La clé API n'est pas valide", "Recog Lang": "Langue de reconnaissance", "Subtitles Settings": "Paramètres des sous-titres", "Target Lang": "Langue cible", "Input any language in natural language, as long as llm can understand": "Saisissez n'importe quelle langue en langage naturel, tant que le LLM peut la comprendre", "Vocal separation enhance": "Amélioration de la séparation vocale", "Burn-in Subtitles": "Incruster les sous-titres", "Whether to burn subtitles into the video, will increase processing time": "Pour incruster les sous-titres dans la vidéo, cela augmentera le temps de traitement", "Video Resolution": "Résolution vidéo", "Recommended for videos with loud background noise, but will increase processing time": "Recommandé pour les vidéos avec beaucoup de bruit de fond, mais augmente le temps de traitement", "Dubbing Settings": "Paramètres de doublage", "TTS Method": "Méthode TTS", "SiliconFlow API Key": "Clé API SiliconFlow", "Mode Selection": "Sélection du mode", "Preset": "Préréglage", "Refer_stable": "Référence stable", "Refer_dynamic": "Référence dynamique", "OpenAI Voice": "Voix 
OpenAI", "Fish TTS Character": "Personnage Fish TTS", "Azure Voice": "Voix Azure", "Please refer to Github homepage for GPT_SoVITS configuration": "Veuillez consulter la page Github pour la configuration GPT_SoVITS", "SoVITS Character": "Personnage SoVITS", "Refer Mode": "Mode de référence", "Mode 1: Use provided reference audio only": "Mode 1 : Utiliser uniquement l'audio de référence fourni", "Mode 2: Use first audio from video as reference": "Mode 2 : Utiliser le premier audio de la vidéo comme référence", "Mode 3: Use each audio from video as reference": "Mode 3 : Utiliser chaque audio de la vidéo comme référence", "Configure reference audio mode for GPT-SoVITS": "Configurer le mode audio de référence pour GPT-SoVITS", "Edge TTS Voice": "Voix Edge TTS", "=====NOTE=====": "Ce qui suit est dans st.py", "=====NOTE2=====": "Ce qui suit est dans install.py", "🚀 Starting Installation": "🚀 Démarrage de l'installation", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Voulez-vous configurer automatiquement les miroirs PyPI ? (Recommandé si vous avez des difficultés à accéder à pypi.org)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 GPU NVIDIA détecté, installation de la version CUDA de PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS détecté, installation de la version CPU de PyTorch... Note : la transcription whisperX peut être lente.", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 Aucun GPU NVIDIA détecté, installation de la version CPU de PyTorch... 
Note : la transcription whisperX peut être lente.", "❌ Failed to install requirements:": "❌ Échec de l'installation des prérequis :", "✅ FFmpeg is already installed": "✅ FFmpeg est déjà installé", "❌ FFmpeg not found\n\n": "❌ FFmpeg non trouvé\n\n", "🛠️ Install using:": "🛠️ Installer avec :", "💡 Note:": "💡 Note :", "🔄 After installing FFmpeg, please run this installer again:": "🔄 Après l'installation de FFmpeg, veuillez relancer cet installateur :", "Install Chocolatey first (https://chocolatey.org/)": "Installez d'abord Chocolatey (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "Installez d'abord Homebrew (https://brew.sh/)", "Use your distribution's package manager": "Utilisez le gestionnaire de paquets de votre distribution", "FFmpeg is required. Please install it and run the installer again.": "FFmpeg est requis. Veuillez l'installer et relancer l'installateur.", "Installation completed": "Installation terminée", "Now I will run this command to start the application:": "Je vais maintenant exécuter cette commande pour démarrer l'application :", "Note: First startup may take up to 1 minute": "Note : Le premier démarrage peut prendre jusqu'à 1 minute", "If the application fails to start:": "Si l'application ne démarre pas :", "Check your network connection": "Vérifiez votre connexion réseau", "Re-run the installer: [bold]python install.py[/bold]": "Relancez l'installateur : [bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "Installation des dépendances avec `pip install -r requirements.txt`", "b. Translate and Generate Subtitles": "b. 
Traduire et générer les sous-titres", "This stage includes the following steps:": "Cette étape comprend les étapes suivantes :", "WhisperX word-level transcription": "Transcription au niveau des mots WhisperX", "Sentence segmentation using NLP and LLM": "Segmentation des phrases utilisant NLP et LLM", "Summarization and multi-step translation": "Résumé et traduction en plusieurs étapes", "Cutting and aligning long subtitles": "Découpage et alignement des longs sous-titres", "Generating timeline and subtitles": "Génération de la chronologie et des sous-titres", "Merging subtitles into the video": "Fusion des sous-titres dans la vidéo", "Start Processing Subtitles": "Démarrer le traitement des sous-titres", "Download All Srt Files": "Télécharger tous les fichiers Srt", "Archive to 'history'": "Archiver dans 'history'", "Using Whisper for transcription...": "Utilisation de Whisper pour la transcription...", "Splitting long sentences...": "Division des longues phrases...", "Summarizing and translating...": "Résumé et traduction en cours...", "Processing and aligning subtitles...": "Traitement et alignement des sous-titres...", "Merging subtitles to video...": "Fusion des sous-titres dans la vidéo...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSE_AVANT_TRADUCTION. Allez dans `output/log/terminology.json` pour éditer la terminologie. Puis appuyez sur ENTRÉE pour continuer...", "Subtitle processing complete! 🎉": "Traitement des sous-titres terminé ! 🎉", "c. Dubbing": "c. Doublage", "Generate audio tasks and chunks": "Générer les tâches audio et les segments", "Extract reference audio": "Extraire l'audio de référence", "Generate and merge audio files": "Générer et fusionner les fichiers audio", "Merge final audio into video": "Fusionner l'audio final dans la vidéo", "Start Audio Processing": "Démarrer le traitement audio", "Audio processing is complete! 
You can check the audio files in the `output` folder.": "Le traitement audio est terminé ! Vous pouvez vérifier les fichiers audio dans le dossier `output`.", "Delete dubbing files": "Supprimer les fichiers de doublage", "Generate audio tasks": "Générer les tâches audio", "Extract refer audio": "Extraire l'audio de référence", "Generate all audio": "Générer tout l'audio", "Merge full audio": "Fusionner l'audio complet", "Merge dubbing to the video": "Fusionner le doublage dans la vidéo", "Audio processing complete! 🎇": "Traitement audio terminé ! 🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Bonjour, bienvenue sur VideoLingo. Si vous rencontrez des problèmes, n'hésitez pas à obtenir des réponses instantanées avec notre Agent QA gratuit ici ! Vous pouvez également essayer gratuitement notre site web SaaS sur videolingo.io !", "WhisperX Runtime": "Environnement WhisperX", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "L'environnement local nécessite un GPU >8GB, l'environnement cloud nécessite une clé API 302ai, l'environnement elevenlabs nécessite une clé API ElevenLabs", "WhisperX 302ai API": "API 302ai WhisperX", "Detected NVIDIA GPU(s)": "GPU(s) NVIDIA détecté(s)", "No NVIDIA GPU detected": "Aucun GPU NVIDIA détecté", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "Aucun GPU NVIDIA détecté ou pilotes NVIDIA mal installés", "LLM JSON Format Support": "Support du format JSON pour LLM", "Enable if your LLM supports JSON mode output": "Activer si votre LLM prend en charge la sortie en mode JSON" } ================================================ FILE: translations/ja.json ================================================ { "a. Download or Upload Video": "a. 
動画のダウンロードまたはアップロード", "Delete and Reselect": "削除して再選択", "Enter YouTube link:": "YouTubeリンクを入力:", "Resolution": "解像度", "Download Video": "動画をダウンロード", "Or upload video": "または動画をアップロード", "Youtube Settings": "Youtube 設定", "Cookies Path": "Cookieファイルのパス", "LLM Configuration": "LLM設定", "API_KEY": "APIキー", "BASE_URL": "ベースURL", "MODEL": "モデル", "Openai format, will add /v1/chat/completions automatically": "OpenAI形式、/v1/chat/completionsが自動的に追加されます", "click to check API validity": "クリックしてAPIの有効性を確認", "API Key is valid": "APIキーは有効です", "API Key is invalid": "APIキーが無効です", "Recog Lang": "認識言語", "Subtitles Settings": "字幕設定", "Target Lang": "目標言語", "Input any language in natural language, as long as llm can understand": "LLMが理解できる限り、自然言語で任意の言語を入力してください", "Vocal separation enhance": "音声分離強化", "Burn-in Subtitles": "字幕を焼き付け", "Whether to burn subtitles into the video, will increase processing time": "字幕を動画に焼き付けるかどうか、処理時間が増加します", "Video Resolution": "動画解像度", "Recommended for videos with loud background noise, but will increase processing time": "背景ノイズの大きい動画に推奨されますが、処理時間が増加します", "Dubbing Settings": "吹き替え設定", "TTS Method": "TTS方式", "SiliconFlow API Key": "SiliconFlow APIキー", "Mode Selection": "モード選択", "Preset": "プリセット", "Refer_stable": "安定参照", "Refer_dynamic": "動的参照", "OpenAI Voice": "OpenAI音声", "Fish TTS Character": "Fish TTSキャラクター", "Azure Voice": "Azure音声", "Please refer to Github homepage for GPT_SoVITS configuration": "GPT_SoVITSの設定についてはGithubホームページを参照してください", "SoVITS Character": "SoVITSキャラクター", "Refer Mode": "参照モード", "Mode 1: Use provided reference audio only": "モード1:提供された参照音声のみを使用", "Mode 2: Use first audio from video as reference": "モード2:動画の最初の音声を参照として使用", "Mode 3: Use each audio from video as reference": "モード3:動画の各音声を参照として使用", "Configure reference audio mode for GPT-SoVITS": "GPT-SoVITSの参照音声モードを設定", "Edge TTS Voice": "Edge TTS音声", "=====NOTE=====": "以下はst.pyの内容です", "b. Translate and Generate Subtitles": "b. 
翻訳と字幕生成", "This stage includes the following steps:": "このステージには以下の手順が含まれます:", "WhisperX word-level transcription": "WhisperX単語レベル文字起こし", "Sentence segmentation using NLP and LLM": "NLPとLLMを使用した文章分割", "Summarization and multi-step translation": "要約と多段階翻訳", "Cutting and aligning long subtitles": "長い字幕の切断と整列", "Generating timeline and subtitles": "タイムラインと字幕の生成", "Merging subtitles into the video": "字幕を動画に統合", "Start Processing Subtitles": "字幕処理を開始", "Download All Srt Files": "すべてのSrtファイルをダウンロード", "Archive to 'history'": "'history'にアーカイブ", "Using Whisper for transcription...": "Whisperで文字起こしを実行中...", "Splitting long sentences...": "長文を分割中...", "Summarizing and translating...": "要約と翻訳中...", "Processing and aligning subtitles...": "字幕の処理と整列中...", "Merging subtitles to video...": "字幕を動画に統合中...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻訳前に一時停止。`output/log/terminology.json`で用語を編集してください。その後、Enterキーを押して続行...", "Subtitle processing complete! 🎉": "字幕処理完了!🎉", "c. Dubbing": "c. 吹き替え", "Generate audio tasks and chunks": "音声タスクとチャンクの生成", "Extract reference audio": "参照音声の抽出", "Generate and merge audio files": "音声ファイルの生成と統合", "Merge final audio into video": "最終音声を動画に統合", "Start Audio Processing": "音声処理を開始", "Audio processing is complete! You can check the audio files in the `output` folder.": "音声処理が完了しました!`output`フォルダで音声ファイルを確認できます。", "Delete dubbing files": "吹き替えファイルを削除", "Generate audio tasks": "音声タスクを生成", "Extract refer audio": "参照音声を抽出", "Generate all audio": "すべての音声を生成", "Merge full audio": "完全な音声を統合", "Merge dubbing to the video": "吹き替えを動画に統合", "Audio processing complete! 🎇": "音声処理完了!🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! 
You can also try out our SaaS website at videolingo.io for free!": "VideoLingoへようこそ。問題が発生した場合は、無料のQAエージェントこちらで即座に回答を得ることができます!また、SaaSウェブサイトvideolingo.ioを無料でお試しいただけます!", "WhisperX Runtime": "WhisperX ランタイム", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "ローカルランタイムは8GB以上のGPUが必要、クラウドランタイムは302ai APIキーが必要です、elevenlabsランタイムはElevenLabs APIキーが必要です", "WhisperX 302ai API": "WhisperX 302ai API", "=====NOTE2=====": "以下はinstall.pyの内容です", "🚀 Starting Installation": "🚀 インストールを開始", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "PyPIミラーを自動設定しますか?(pypi.orgへのアクセスが困難な場合は推奨)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 NVIDIA GPUを検出、PyTorchのCUDAバージョンをインストール中...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOSを検出、PyTorchのCPUバージョンをインストール中... 注:whisperX文字起こし時に遅くなる可能性があります。", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 NVIDIA GPUが検出されません、PyTorchのCPUバージョンをインストール中... 注:whisperX文字起こし時に遅くなる可能性があります。", "❌ Failed to install requirements:": "❌ 要件のインストールに失敗:", "✅ FFmpeg is already installed": "✅ FFmpegはすでにインストールされています", "❌ FFmpeg not found\n\n": "❌ FFmpegが見つかりません\n\n", "🛠️ Install using:": "🛠️ インストール方法:", "💡 Note:": "💡 注意:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 FFmpegをインストールした後、このインストーラーを再度実行してください:", "Install Chocolatey first (https://chocolatey.org/)": "最初にChocolateyをインストールしてください (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "最初にHomebrewをインストールしてください (https://brew.sh/)", "Use your distribution's package manager": "お使いのディストリビューションのパッケージマネージャーを使用してください", "FFmpeg is required. 
Please install it and run the installer again.": "FFmpegが必要です。インストールして、インストーラーを再度実行してください。", "Installation completed": "インストール完了", "Now I will run this command to start the application:": "次のコマンドでアプリケーションを起動します:", "Note: First startup may take up to 1 minute": "注:初回起動には最大1分かかる場合があります", "If the application fails to start:": "アプリケーションが起動しない場合:", "Check your network connection": "ネットワーク接続を確認してください", "Re-run the installer: [bold]python install.py[/bold]": "インストーラーを再実行: [bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "依存関係を `pip install -r requirements.txt` でインストール中", "Detected NVIDIA GPU(s)": "NVIDIA GPUを検出しました", "No NVIDIA GPU detected": "NVIDIA GPUが検出されません", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "NVIDIA GPUが検出されないか、NVIDIAドライバーが正しくインストールされていません", "LLM JSON Format Support": "LLM JSON形式サポート", "Enable if your LLM supports JSON mode output": "LLMがJSON出力モードをサポートしている場合に有効化" } ================================================ FILE: translations/ru.json ================================================ { "a. Download or Upload Video": "a. 
Скачать или загрузить видео", "Delete and Reselect": "Удалить и выбрать заново", "Enter YouTube link:": "Введите ссылку YouTube:", "Resolution": "Разрешение", "Download Video": "Скачать видео", "Or upload video": "Или загрузить видео", "Youtube Settings": "Настройки Youtube", "Cookies Path": "Путь к файлу Cookies", "LLM Configuration": "Настройка LLM", "API_KEY": "API-ключ", "BASE_URL": "Базовый URL", "MODEL": "Модель", "Openai format, will add /v1/chat/completions automatically": "Формат OpenAI, /v1/chat/completions добавится автоматически", "click to check API validity": "нажмите для проверки API", "API Key is valid": "API-ключ действителен", "API Key is invalid": "API-ключ недействителен", "Recog Lang": "Язык распознавания", "Subtitles Settings": "Настройки субтитров", "Target Lang": "Целевой язык", "Input any language in natural language, as long as llm can understand": "Введите любой язык на естественном языке, главное чтобы LLM мог понять", "Vocal separation enhance": "Улучшение отделения голоса", "Burn-in Subtitles": "Встроить субтитры", "Whether to burn subtitles into the video, will increase processing time": "Встраивать ли субтитры в видео, это увеличит время обработки", "Video Resolution": "Разрешение видео", "Recommended for videos with loud background noise, but will increase processing time": "Рекомендуется для видео с громким фоновым шумом, но увеличит время обработки", "Dubbing Settings": "Настройки дубляжа", "TTS Method": "Метод TTS", "SiliconFlow API Key": "API-ключ SiliconFlow", "Mode Selection": "Выбор режима", "Preset": "Пресет", "Refer_stable": "Стабильная ссылка", "Refer_dynamic": "Динамическая ссылка", "OpenAI Voice": "Голос OpenAI", "Fish TTS Character": "Персонаж Fish TTS", "Azure Voice": "Голос Azure", "Please refer to Github homepage for GPT_SoVITS configuration": "Обратитесь к домашней странице Github для настройки GPT_SoVITS", "SoVITS Character": "Персонаж SoVITS", "Refer Mode": "Режим ссылки", "Mode 1: Use provided reference audio 
only": "Режим 1: Использовать только предоставленное эталонное аудио", "Mode 2: Use first audio from video as reference": "Режим 2: Использовать первое аудио из видео как эталон", "Mode 3: Use each audio from video as reference": "Режим 3: Использовать каждое аудио из видео как эталон", "Configure reference audio mode for GPT-SoVITS": "Настройка режима эталонного аудио для GPT-SoVITS", "Edge TTS Voice": "Голос Edge TTS", "=====NOTE=====": "Содержимое st.py ниже", "=====NOTE2=====": "Ниже содержится в install.py", "🚀 Starting Installation": "🚀 Начало установки", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Нужно ли автоматически настроить зеркала PyPI? (Рекомендуется при проблемах с доступом к pypi.org)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 Обнаружен GPU NVIDIA, установка CUDA версии PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 Обнаружена MacOS, установка CPU версии PyTorch... Примечание: транскрипция whisperX может быть медленной.", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 GPU NVIDIA не обнаружен, установка CPU версии PyTorch... 
Примечание: транскрипция whisperX может быть медленной.", "❌ Failed to install requirements:": "❌ Не удалось установить зависимости:", "✅ FFmpeg is already installed": "✅ FFmpeg уже установлен", "❌ FFmpeg not found\n\n": "❌ FFmpeg не найден\n\n", "🛠️ Install using:": "🛠️ Установить используя:", "💡 Note:": "💡 Примечание:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 После установки FFmpeg, пожалуйста, запустите установщик снова:", "Install Chocolatey first (https://chocolatey.org/)": "Сначала установите Chocolatey (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "Сначала установите Homebrew (https://brew.sh/)", "Use your distribution's package manager": "Используйте менеджер пакетов вашего дистрибутива", "FFmpeg is required. Please install it and run the installer again.": "Требуется FFmpeg. Пожалуйста, установите его и запустите установщик снова.", "Installation completed": "Установка завершена", "Now I will run this command to start the application:": "Сейчас я запущу эту команду для запуска приложения:", "Note: First startup may take up to 1 minute": "Примечание: Первый запуск может занять до 1 минуты", "If the application fails to start:": "Если приложение не запускается:", "Check your network connection": "Проверьте подключение к сети", "Re-run the installer: [bold]python install.py[/bold]": "Перезапустите установщик: [bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "Установка зависимостей с помощью `pip install -r requirements.txt`", "b. Translate and Generate Subtitles": "b. 
Перевести и создать субтитры", "This stage includes the following steps:": "Этот этап включает следующие шаги:", "WhisperX word-level transcription": "Пословная транскрипция WhisperX", "Sentence segmentation using NLP and LLM": "Сегментация предложений с помощью NLP и LLM", "Summarization and multi-step translation": "Обобщение и многоэтапный перевод", "Cutting and aligning long subtitles": "Разделение и выравнивание длинных субтитров", "Generating timeline and subtitles": "Создание таймлайна и субтитров", "Merging subtitles into the video": "Объединение субтитров с видео", "Start Processing Subtitles": "Начать обработку субтитров", "Download All Srt Files": "Скачать все Srt файлы", "Archive to 'history'": "Архивировать в 'history'", "Using Whisper for transcription...": "Используется Whisper для транскрипции...", "Splitting long sentences...": "Разделение длинных предложений...", "Summarizing and translating...": "Обобщение и перевод...", "Processing and aligning subtitles...": "Обработка и выравнивание субтитров...", "Merging subtitles to video...": "Объединение субтитров с видео...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ ПАУЗА_ПЕРЕД_ПЕРЕВОДОМ. Перейдите в `output/log/terminology.json` для редактирования терминологии. Затем нажмите ENTER для продолжения...", "Subtitle processing complete! 🎉": "Обработка субтитров завершена! 🎉", "c. Dubbing": "c. Дубляж", "Generate audio tasks and chunks": "Создание аудио задач и фрагментов", "Extract reference audio": "Извлечение эталонного аудио", "Generate and merge audio files": "Создание и объединение аудио файлов", "Merge final audio into video": "Объединение финального аудио с видео", "Start Audio Processing": "Начать обработку аудио", "Audio processing is complete! You can check the audio files in the `output` folder.": "Обработка аудио завершена! 
import json

# Human-readable language label -> language code. Each code is the stem of a
# JSON file that load_translations() opens (e.g. "ru" -> translations/ru.json).
DISPLAY_LANGUAGES = {
    "🇬🇧 English": "en",
    "🇨🇳 简体中文": "zh-CN",
    "🇭🇰 繁体中文": "zh-HK",
    "🇯🇵 日本語": "ja",
    "🇪🇸 Español": "es",
    "🇷🇺 Русский": "ru",
    "🇫🇷 Français": "fr",
}


def load_translations(language="en"):
    """Load and return the parsed translation table for *language*.

    Args:
        language: Language code used as the file stem, e.g. ``"zh-CN"``.

    Returns:
        The decoded content of ``translations/<language>.json``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # NOTE: the path is relative, so it is resolved against the current
    # working directory of the process, not against this module's location.
    with open(f'translations/{language}.json', 'r', encoding='utf-8') as file:
        return json.load(file)


def translate(key):
    """Return the translation of *key* in the configured display language.

    The language code is read from the ``display_language`` config entry on
    every call, and the matching JSON table is re-read from disk each time.

    Args:
        key: The source string to look up.

    Returns:
        The translated string, or *key* itself when no translation exists
        (a warning is printed) or when the lookup fails for any reason.
    """
    # Resolved at call time, so importing this module does not itself require
    # core.utils.config_utils to be importable.
    from core.utils.config_utils import load_key

    try:
        display_language = load_key("display_language")
        translations = load_translations(display_language)
        translation = translations.get(key)
        if translation is None:
            print(f"Warning: Translation not found for key '{key}' in language '{display_language}'")
            return key
        return translation
    except Exception:
        # Best-effort fallback: a missing config entry, a missing or malformed
        # translation file, or a console that cannot print the warning must
        # never break the caller. `Exception` (rather than a bare `except:`)
        # lets KeyboardInterrupt and SystemExit propagate.
        return key
configuration": "请参考Github主页了解GPT_SoVITS配置", "SoVITS Character": "SoVITS角色", "Refer Mode": "参考模式", "Mode 1: Use provided reference audio only": "模式1:仅使用提供的参考音频", "Mode 2: Use first audio from video as reference": "模式2:使用视频中的第一段音频作为参考", "Mode 3: Use each audio from video as reference": "模式3:使用视频中的每段音频作为参考", "Configure reference audio mode for GPT-SoVITS": "配置GPT-SoVITS的参考音频模式", "Edge TTS Voice": "Edge TTS语音", "=====NOTE=====": "以下是st.py中的内容", "b. Translate and Generate Subtitles": "b. 翻译并生成字幕", "This stage includes the following steps:": "此阶段包含以下步骤:", "WhisperX word-level transcription": "WhisperX词级转录", "Sentence segmentation using NLP and LLM": "使用NLP和LLM进行句子分段", "Summarization and multi-step translation": "摘要和多步翻译", "Cutting and aligning long subtitles": "切割和对齐长字幕", "Generating timeline and subtitles": "生成时间轴和字幕", "Merging subtitles into the video": "将字幕合并到视频中", "Start Processing Subtitles": "开始处理字幕", "Download All Srt Files": "下载所有Srt文件", "Archive to 'history'": "归档到'history'", "Using Whisper for transcription...": "正在使用Whisper进行转录...", "Splitting long sentences...": "正在分割长句...", "Summarizing and translating...": "正在总结和翻译...", "Processing and aligning subtitles...": "正在处理和对齐字幕...", "Merging subtitles to video...": "正在将字幕合并到视频...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻译前暂停。请前往`output/log/terminology.json`编辑术语。然后按回车键继续...", "Subtitle processing complete! 🎉": "字幕处理完成! 🎉", "c. Dubbing": "c. 配音", "Generate audio tasks and chunks": "生成音频任务和分块", "Extract reference audio": "提取参考音频", "Generate and merge audio files": "生成和合并音频文件", "Merge final audio into video": "将最终音频合并到视频中", "Start Audio Processing": "开始音频处理", "Audio processing is complete! 
You can check the audio files in the `output` folder.": "音频处理完成!您可以在`output`文件夹中查看音频文件。", "Delete dubbing files": "删除配音文件", "Generate audio tasks": "生成音频任务", "Extract refer audio": "提取参考音频", "Generate all audio": "生成所有音频", "Merge full audio": "合并完整音频", "Merge dubbing to the video": "将配音合并到视频中", "Audio processing complete! 🎇": "音频处理完成! 🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "欢迎来到VideoLingo。如果遇到任何问题,随时可以通过我们的免费问答助手 here 获取即时解答!还可以免费试用我们的SaaS网站 videolingo.io!", "WhisperX Runtime": "WhisperX 运行环境", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "本地运行需要>8GB显存GPU,云端运行需要302ai API密钥,elevenlabs运行需要ElevenLabs API密钥", "WhisperX 302ai API": "WhisperX 302ai API密钥", "=====NOTE2=====": "以下是install.py中的内容", "🚀 Starting Installation": "🚀 开始安装", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "是否需要自动配置PyPI镜像?(如果访问pypi.org困难,建议使用)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 检测到NVIDIA GPU,正在安装CUDA版本的PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 检测到MacOS,正在安装CPU版本的PyTorch... 注意:在whisperX转录过程中可能会较慢。", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 未检测到NVIDIA GPU,正在安装CPU版本的PyTorch... 
注意:在whisperX转录过程中可能会较慢。", "❌ Failed to install requirements:": "❌ 安装依赖失败:", "✅ FFmpeg is already installed": "✅ FFmpeg已安装", "❌ FFmpeg not found\n\n": "❌ 未找到FFmpeg\n\n", "🛠️ Install using:": "🛠️ 使用以下命令安装:", "💡 Note:": "💡 注意:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 安装FFmpeg后,请重新运行此安装程序:", "Install Chocolatey first (https://chocolatey.org/)": "请先安装Chocolatey (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "请先安装Homebrew (https://brew.sh/)", "Use your distribution's package manager": "使用您的发行版包管理器", "FFmpeg is required. Please install it and run the installer again.": "需要安装FFmpeg。请安装后重新运行安装程序。", "Installation completed": "安装完成", "Now I will run this command to start the application:": "现在我将运行以下命令启动应用:", "Note: First startup may take up to 1 minute": "注意:首次启动可能需要最多1分钟", "If the application fails to start:": "如果应用启动失败:", "Check your network connection": "检查网络连接", "Re-run the installer: [bold]python install.py[/bold]": "重新运行安装程序:[bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "正在使用 `pip install -r requirements.txt` 安装依赖", "Detected NVIDIA GPU(s)": "检测到NVIDIA GPU", "No NVIDIA GPU detected": "未检测到NVIDIA GPU", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "未检测到NVIDIA GPU或NVIDIA驱动未正确安装", "LLM JSON Format Support": "LLM JSON格式支持", "Enable if your LLM supports JSON mode output": "如果选用的LLM支持JSON模式输出,请启用" } ================================================ FILE: translations/zh-HK.json ================================================ { "a. Download or Upload Video": "a. 
下載或上傳影片", "Delete and Reselect": "刪除並重新選擇", "Enter YouTube link:": "輸入YouTube連結:", "Resolution": "解析度", "Download Video": "下載影片", "Or upload video": "或上傳影片", "Youtube Settings": "Youtube設定", "Cookies Path": "Cookies文件路徑", "LLM Configuration": "LLM設定", "API_KEY": "API金鑰", "BASE_URL": "BASE_URL", "MODEL": "模型", "Openai format, will add /v1/chat/completions automatically": "OpenAI格式,將自動添加/v1/chat/completions", "click to check API validity": "點擊檢查API有效性", "API Key is valid": "API金鑰有效", "API Key is invalid": "API金鑰無效", "Recog Lang": "識別語言", "Subtitles Settings": "字幕設定", "Target Lang": "目標語言", "Input any language in natural language, as long as llm can understand": "用自然語言輸入任何語言,只要LLM能理解即可", "Vocal separation enhance": "人聲分離增強", "Burn-in Subtitles": "燒錄字幕", "Whether to burn subtitles into the video, will increase processing time": "是否將字幕燒錄到影片中,會增加處理時間", "Video Resolution": "影片解析度", "Recommended for videos with loud background noise, but will increase processing time": "建議用於背景噪音較大的影片,但會增加處理時間", "Dubbing Settings": "配音設定", "TTS Method": "TTS方法", "SiliconFlow API Key": "SiliconFlow API金鑰", "Mode Selection": "模式選擇", "Preset": "預設", "Refer_stable": "穩定參考", "Refer_dynamic": "動態參考", "OpenAI Voice": "OpenAI語音", "Fish TTS Character": "Fish TTS角色", "Azure Voice": "Azure語音", "Please refer to Github homepage for GPT_SoVITS configuration": "請參考Github主頁了解GPT_SoVITS設定", "SoVITS Character": "SoVITS角色", "Refer Mode": "參考模式", "Mode 1: Use provided reference audio only": "模式1:僅使用提供的參考音頻", "Mode 2: Use first audio from video as reference": "模式2:使用影片中的第一段音頻作為參考", "Mode 3: Use each audio from video as reference": "模式3:使用影片中的每段音頻作為參考", "Configure reference audio mode for GPT-SoVITS": "配置GPT-SoVITS的參考音頻模式", "Edge TTS Voice": "Edge TTS語音", "=====NOTE=====": "以下是st.py中的內容", "b. Translate and Generate Subtitles": "b. 
翻譯並生成字幕", "This stage includes the following steps:": "此階段包含以下步驟:", "WhisperX word-level transcription": "WhisperX詞級轉錄", "Sentence segmentation using NLP and LLM": "使用NLP和LLM進行句子分段", "Summarization and multi-step translation": "摘要和多步翻譯", "Cutting and aligning long subtitles": "切割和對齊長字幕", "Generating timeline and subtitles": "生成時間軸和字幕", "Merging subtitles into the video": "將字幕合併到影片中", "Start Processing Subtitles": "開始處理字幕", "Download All Srt Files": "下載所有Srt檔案", "Archive to 'history'": "歸檔到'history'", "Using Whisper for transcription...": "正在使用Whisper進行轉錄...", "Splitting long sentences...": "正在分割長句...", "Summarizing and translating...": "正在總結和翻譯...", "Processing and aligning subtitles...": "正在處理和對齊字幕...", "Merging subtitles to video...": "正在將字幕合併到影片中...", "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻譯前暫停。請前往`output/log/terminology.json`編輯術語。然後按Enter鍵繼續...", "Subtitle processing complete! 🎉": "字幕處理完成! 🎉", "c. Dubbing": "c. 配音", "Generate audio tasks and chunks": "生成音頻任務和分塊", "Extract reference audio": "提取參考音頻", "Generate and merge audio files": "生成和合併音頻檔案", "Merge final audio into video": "將最終音頻合併到影片中", "Start Audio Processing": "開始音頻處理", "Audio processing is complete! You can check the audio files in the `output` folder.": "音頻處理完成!您可以在`output`資料夾中查看音頻檔案。", "Delete dubbing files": "刪除配音檔案", "Generate audio tasks": "生成音頻任務", "Extract refer audio": "提取參考音頻", "Generate all audio": "生成所有音頻", "Merge full audio": "合併完整音頻", "Merge dubbing to the video": "將配音合併到影片中", "Audio processing complete! 🎇": "音頻處理完成! 🎇", "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! 
You can also try out our SaaS website at videolingo.io for free!": "歡迎來到VideoLingo。如果遇到任何問題,隨時可以透過我們的免費問答助手 here 獲取即時解答!還可以免費試用我們的SaaS網站 videolingo.io!", "WhisperX Runtime": "WhisperX 運行環境", "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "本地運行需要>8GB顯存GPU,雲端運行需要302ai API金鑰,elevenlabs運行需要ElevenLabs API金鑰", "WhisperX 302ai API": "WhisperX 302ai API金鑰", "=====NOTE2=====": "以下是install.py中的內容", "🚀 Starting Installation": "🚀 開始安裝", "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "是否需要自動配置PyPI鏡像?(如果訪問pypi.org困難,建議使用)", "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 檢測到NVIDIA GPU,正在安裝CUDA版本的PyTorch...", "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 檢測到MacOS,正在安裝CPU版本的PyTorch... 注意:在whisperX轉錄過程中可能會較慢。", "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 未檢測到NVIDIA GPU,正在安裝CPU版本的PyTorch... 注意:在whisperX轉錄過程中可能會較慢。", "❌ Failed to install requirements:": "❌ 安裝依賴失敗:", "✅ FFmpeg is already installed": "✅ FFmpeg已安裝", "❌ FFmpeg not found\n\n": "❌ 未找到FFmpeg\n\n", "🛠️ Install using:": "🛠️ 使用以下命令安裝:", "💡 Note:": "💡 注意:", "🔄 After installing FFmpeg, please run this installer again:": "🔄 安裝FFmpeg後,請重新運行此安裝程序:", "Install Chocolatey first (https://chocolatey.org/)": "請先安裝Chocolatey (https://chocolatey.org/)", "Install Homebrew first (https://brew.sh/)": "請先安裝Homebrew (https://brew.sh/)", "Use your distribution's package manager": "使用您的發行版套件管理器", "FFmpeg is required. 
Please install it and run the installer again.": "需要安裝FFmpeg。請安裝後重新運行安裝程序。", "Installation completed": "安裝完成", "Now I will run this command to start the application:": "現在我將運行以下命令啟動應用:", "Note: First startup may take up to 1 minute": "注意:首次啟動可能需要最多1分鐘", "If the application fails to start:": "如果應用啟動失敗:", "Check your network connection": "檢查網絡連接", "Re-run the installer: [bold]python install.py[/bold]": "重新運行安裝程序:[bold]python install.py[/bold]", "Installing requirements using `pip install -r requirements.txt`": "正在使用 `pip install -r requirements.txt` 安裝依賴", "Detected NVIDIA GPU(s)": "檢測到NVIDIA GPU", "No NVIDIA GPU detected": "未檢測到NVIDIA GPU", "No NVIDIA GPU detected or NVIDIA drivers not properly installed": "未檢測到NVIDIA GPU或NVIDIA驅動未正確安裝", "LLM JSON Format Support": "LLM JSON格式支持", "Enable if your LLM supports JSON mode output": "如果選用的LLM支持JSON模式輸出,請啟用" }