Repository: Huanshere/VideoLingo
Branch: main
Commit: 29e240dcd2a1
Files: 132
Total size: 541.1 KB
Directory structure:
gitextract_wyfw8khg/
├── .cursorrules
├── .gitignore
├── .streamlit/
│ └── config.toml
├── Dockerfile
├── LICENSE
├── OneKeyStart.bat
├── README.md
├── VideoLingo_colab.ipynb
├── batch/
│ ├── OneKeyBatch.bat
│ ├── README.md
│ ├── README.zh.md
│ └── utils/
│ ├── batch_processor.py
│ ├── settings_check.py
│ └── video_processor.py
├── config.yaml
├── core/
│ ├── _10_gen_audio.py
│ ├── _11_merge_audio.py
│ ├── _12_dub_to_vid.py
│ ├── _1_ytdlp.py
│ ├── _2_asr.py
│ ├── _3_1_split_nlp.py
│ ├── _3_2_split_meaning.py
│ ├── _4_1_summarize.py
│ ├── _4_2_translate.py
│ ├── _5_split_sub.py
│ ├── _6_gen_sub.py
│ ├── _7_sub_into_vid.py
│ ├── _8_1_audio_task.py
│ ├── _8_2_dub_chunks.py
│ ├── _9_refer_audio.py
│ ├── __init__.py
│ ├── asr_backend/
│ │ ├── __init__.py
│ │ ├── audio_preprocess.py
│ │ ├── demucs_vl.py
│ │ ├── elevenlabs_asr.py
│ │ ├── whisperX_302.py
│ │ └── whisperX_local.py
│ ├── prompts.py
│ ├── spacy_utils/
│ │ ├── __init__.py
│ │ ├── load_nlp_model.py
│ │ ├── split_by_comma.py
│ │ ├── split_by_connector.py
│ │ ├── split_by_mark.py
│ │ └── split_long_by_root.py
│ ├── st_utils/
│ │ ├── __init__.py
│ │ ├── download_video_section.py
│ │ ├── imports_and_utils.py
│ │ └── sidebar_setting.py
│ ├── translate_lines.py
│ ├── tts_backend/
│ │ ├── _302_f5tts.py
│ │ ├── azure_tts.py
│ │ ├── custom_tts.py
│ │ ├── edge_tts.py
│ │ ├── estimate_duration.py
│ │ ├── fish_tts.py
│ │ ├── gpt_sovits_tts.py
│ │ ├── openai_tts.py
│ │ ├── sf_cosyvoice2.py
│ │ ├── sf_fishtts.py
│ │ └── tts_main.py
│ └── utils/
│ ├── __init__.py
│ ├── ask_gpt.py
│ ├── config_utils.py
│ ├── decorator.py
│ ├── delete_retry_dubbing.py
│ ├── models.py
│ ├── onekeycleanup.py
│ └── pypi_autochoose.py
├── custom_terms.xlsx
├── docs/
│ ├── .gitignore
│ ├── components/
│ │ ├── landing/
│ │ │ ├── comments.tsx
│ │ │ ├── faq.tsx
│ │ │ ├── features.tsx
│ │ │ ├── github-stats.tsx
│ │ │ ├── hero.tsx
│ │ │ └── index.tsx
│ │ └── ui/
│ │ ├── accordion.tsx
│ │ ├── badge.tsx
│ │ ├── button.tsx
│ │ ├── card.tsx
│ │ ├── hero-video-dialog.tsx
│ │ ├── rainbow-button.tsx
│ │ └── tooltip.tsx
│ ├── components.json
│ ├── lib/
│ │ └── utils.ts
│ ├── middleware.js
│ ├── next-env.d.ts
│ ├── next.config.js
│ ├── package.json
│ ├── pages/
│ │ ├── _app.mdx
│ │ ├── _meta.en-US.json
│ │ ├── _meta.ja.json
│ │ ├── _meta.zh-CN.json
│ │ ├── docs/
│ │ │ ├── _meta.en-US.json
│ │ │ ├── _meta.ja.json
│ │ │ ├── _meta.zh-CN.json
│ │ │ ├── docker.en-US.md
│ │ │ ├── docker.zh-CN.md
│ │ │ ├── introduction.en-US.md
│ │ │ ├── introduction.zh-CN.md
│ │ │ ├── start.en-US.md
│ │ │ ├── start.zh-CN.md
│ │ │ ├── tech.en-US.md
│ │ │ └── tech.zh-CN.md
│ │ ├── globals.css
│ │ ├── index.en-US.mdx
│ │ ├── index.ja.mdx
│ │ └── index.zh-CN.mdx
│ ├── postcss.config.js
│ ├── public/
│ │ └── site.webmanifest
│ ├── tailwind.config.js
│ ├── theme.config.jsx
│ └── tsconfig.json
├── install.py
├── launch.py
├── requirements.txt
├── setup.py
├── st.py
└── translations/
├── README.es.md
├── README.fr.md
├── README.ja.md
├── README.ru.md
├── README.zh-TW.md
├── README.zh.md
├── en.json
├── es.json
├── fr.json
├── ja.json
├── ru.json
├── translations.py
├── zh-CN.json
└── zh-HK.json
================================================
FILE CONTENTS
================================================
================================================
FILE: .cursorrules
================================================
2. 使用
# ------------
# comment
# ------------
进行大块的注释
3. 避免使用复杂的函数内注释,以及函数变量中不要有类型定义
4. 使用英文注释和print
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Project working data: history and output
# (a leading "/" anchors the pattern to the repository root, so only the
# top-level output/ and history/ directories are matched)
/output/
/history/
_model_cache/
batch/input/
batch/output/
batch/tasks_setting.xlsx
# Large files: ffmpeg / ffprobe executables placed at the repository root
/ffmpeg.exe
/ffmpeg
/ffprobe.exe
/ffprobe
.DS_Store
AllinOne.ipynb
config.backup.yaml
# Runtime directories
runtime/
dev/
installer_files/
logs/
================================================
FILE: .streamlit/config.toml
================================================
[server]
# Raises the upload size limit of the Streamlit server.
# NOTE(review): Streamlit documents maxUploadSize in megabytes, which would make
# 4096 roughly 4 GB per uploaded file — confirm against the Streamlit version in use.
maxUploadSize = 4096
================================================
FILE: Dockerfile
================================================
ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

# An ARG declared before FROM is only visible to the FROM line itself.
# Re-declare it (without a value) so the default above is in scope for the
# ldconfig step below even if the base image does not export CUDA_VERSION.
# NOTE(review): if the base image sets ENV CUDA_VERSION, that value still wins.
ARG CUDA_VERSION

# Keep apt from prompting during the build
ENV DEBIAN_FRONTEND=noninteractive
ARG PYTHON_VERSION=3.10

# Switch apt to the Aliyun mirror, install system tools plus Python from the
# deadsnakes PPA, make it the default python3, and bootstrap pip.
# The apt cache is removed in this same RUN: a cleanup executed in a later
# layer cannot shrink the image because the files remain in this one.
RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && apt-get install -y --no-install-recommends \
        software-properties-common git curl sudo ffmpeg fonts-noto wget \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Workaround for CUDA compatibility issues: register the compat libraries of
# the matching major.minor CUDA directory (e.g. /usr/local/cuda-12.4/compat/)
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Set working directory and fetch the sources.
# Only the working tree is needed, so clone shallowly and drop .git in the
# same layer; removing it in a separate RUN would leave it in the image.
WORKDIR /app
RUN git clone --depth 1 https://github.com/Huanshere/VideoLingo.git . \
    && rm -rf .git

# Install PyTorch and torchaudio from the PyTorch wheel index
RUN pip install --no-cache-dir torch==2.0.0 torchaudio==2.0.0 --index-url https://download.pytorch.org/whl/cu118

# Use the Tsinghua PyPI mirror for all following pip installs
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Install project dependencies; the requirements.txt from the build context
# replaces the one that came with the clone
COPY requirements.txt .
RUN pip install --no-cache-dir -e .

# Set CUDA-related environment variables
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Set CUDA architecture list (overridable with --build-arg)
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}

# Port published by `docker run -p 8501:8501` in the README
EXPOSE 8501
CMD ["streamlit", "run", "st.py"]
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: OneKeyStart.bat
================================================
@echo off
rem One-click launcher: prepares the console and environment, then runs launch.py.

rem Switch the console code page to 65001 (UTF-8); all output of chcp is discarded.
chcp 65001 >nul 2>&1
rem Activate the "videolingo" conda environment. Errors are hidden and not
rem checked, so the script continues even when activation fails.
call conda activate videolingo 2>nul
rem Passed to the Python process through the environment.
rem NOTE(review): Python documents PYTHONWARNINGS=ignore as suppressing warnings - confirm intent.
set PYTHONWARNINGS=ignore
rem %~dp0 expands to the drive and directory of this script, so launch.py is
rem found next to it regardless of the current working directory.
python "%~dp0launch.py"
rem A non-zero exit code from launch.py prints a hint pointing at the logs folder.
if %errorlevel% neq 0 (
echo.
echo Pre-flight checks or Streamlit failed. See logs\ for details.
echo.
)
rem Keep the window open until a key is pressed so messages stay readable.
pause
================================================
FILE: README.md
================================================
# Connect the World, Frame by Frame
[**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Overview ([Try VL Now!](https://videolingo.io))
VideoLingo is an all-in-one video translation, localization, and dubbing tool aimed at generating Netflix-quality subtitles. It eliminates stiff machine translations and multi-line subtitles while adding high-quality dubbing, enabling global knowledge sharing across language barriers.
Key features:
- 🎥 YouTube video download via yt-dlp
- **🎙️ Word-level, low-hallucination subtitle recognition with WhisperX**
- **📝 NLP and AI-powered subtitle segmentation**
- **📚 Custom + AI-generated terminology for coherent translation**
- **🔄 3-step Translate-Reflect-Adaptation for cinematic quality**
- **✅ Netflix-standard, Single-line subtitles Only**
- **🗣️ Dubbing with GPT-SoVITS, Azure, OpenAI, and more**
- 🚀 One-click startup and processing in Streamlit
- 🌍 Multi-language support in Streamlit UI
- 📝 Detailed logging with progress resumption
Difference from similar projects: **Single-line subtitles only, superior translation quality, seamless dubbing experience**
## 🎥 Demo
### GPT-SoVITS with my voice
---
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Language Support
**Input Language Support (more to come):**
🇺🇸 English 🤩 | 🇷🇺 Russian 😊 | 🇫🇷 French 🤩 | 🇩🇪 German 🤩 | 🇮🇹 Italian 🤩 | 🇪🇸 Spanish 🤩 | 🇯🇵 Japanese 😐 | 🇨🇳 Chinese* 😊
> *Chinese uses a separate punctuation-enhanced whisper model, for now...
**Translation supports all languages, while dubbing language depends on the chosen TTS method.**
## Installation
Running into a problem? Chat with our free online AI agent [**here**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) for help.
> **Note:** For Windows users with NVIDIA GPU, follow these steps before installation:
> 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
> 2. Install [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. Add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to your system PATH
> 4. Restart your computer
> **Note:** FFmpeg is required. Please install it via package managers:
> - Windows: ```choco install ffmpeg``` (via [Chocolatey](https://chocolatey.org/))
> - macOS: ```brew install ffmpeg``` (via [Homebrew](https://brew.sh/))
> - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu)
1. Clone the repository
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. Install dependencies (requires `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. Start the application
```bash
streamlit run st.py
```
### Docker
Alternatively, you can use Docker (requires CUDA 12.4 and NVIDIA Driver version >550), see [Docker docs](/docs/pages/docs/docker.en-US.md):
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## APIs
VideoLingo supports OpenAI-Like API format and various TTS interfaces:
- LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (sorted by performance, be cautious with gemini-2.5-flash...)
- WhisperX: Run whisperX (large-v3) locally or use 302.ai API
- TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts` (you can plug in your own TTS by editing custom_tts.py!)
> **Note:** VideoLingo works with **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - one API key for all services (LLM, WhisperX, TTS). Or run locally with Ollama and Edge-TTS for free, no API needed!
For detailed installation, API configuration, and batch mode instructions, please refer to the documentation: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md)
## Current Limitations
1. WhisperX transcription performance may be affected by video background noise, as it uses a wav2vec model for alignment. For videos with loud background music, please enable Voice Separation Enhancement. Additionally, subtitles ending with numbers or special characters may be truncated early due to wav2vec's inability to map numeric characters (e.g., "1") to their spoken form ("one").
2. Using weaker models can lead to errors during processing due to the strict JSON format required for responses (tried my best to prompt the LLM😊). If this error occurs, please delete the `output` folder and retry with a different LLM; otherwise repeated execution will read the previous erroneous response and fail with the same error.
3. The dubbing feature may not be 100% perfect due to differences in speech rates and intonation between languages, as well as the impact of the translation step. However, this project has implemented extensive engineering processing for speech rates to ensure the best possible dubbing results.
4. **Multilingual video transcription recognition will only retain the main language**. This is because whisperX uses a specialized model for a single language when forcibly aligning word-level subtitles, and will delete unrecognized languages.
5. **For now, cannot dub multiple characters separately**, as whisperX's speaker distinction capability is not sufficiently reliable.
## 📄 License
This project is licensed under the Apache 2.0 License. Special thanks to the following open source projects for their contributions:
[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE)
## 📬 Contact Me
- Submit [Issues](https://github.com/Huanshere/VideoLingo/issues) or [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) on GitHub
- DM me on Twitter: [@Huanshere](https://twitter.com/Huanshere)
- Email me at: team@videolingo.io
## ⭐ Star History
[![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline)
---
If you find VideoLingo helpful, please give me a ⭐️!
================================================
FILE: VideoLingo_colab.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "RkeSbYF2HoM_"
},
"source": [
"# Welcome to VideoLingo! 🎉🚀\n",
"#### This colab file allows you to quickly experience the full functionality in just 5 minutes! ⏱️✨ Before you begin, you may need to prepare some keys. 🔑🗝️ Please read https://videolingo.io/docs/start to get ready. 📚👍\n",
"#### *Please use a T4 GPU to execute this colab for optimal performance."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h0jE67Gc-1nO"
},
"source": [
"## 1. Clone VideoLingo repo 📥"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NC3i2T7D51oS",
"outputId": "19821917-8ee4-4123-a099-fd35405fcdfe"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'VideoLingo'...\n",
"remote: Enumerating objects: 2578, done.\u001b[K\n",
"remote: Counting objects: 100% (595/595), done.\u001b[K\n",
"remote: Compressing objects: 100% (221/221), done.\u001b[K\n",
"remote: Total 2578 (delta 408), reused 378 (delta 374), pack-reused 1983 (from 1)\u001b[K\n",
"Receiving objects: 100% (2578/2578), 10.44 MiB | 12.60 MiB/s, done.\n",
"Resolving deltas: 100% (1644/1644), done.\n"
]
}
],
"source": [
"!git clone https://github.com/Huanshere/VideoLingo.git\n",
"%cd VideoLingo"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "A5sIHzRs8JI1"
},
"source": [
"## 2. Installation 🚀\n",
"this takes around 4 mins"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tSymHAEg6Vzr",
"outputId": "8c5059e4-37d5-4540-cba5-9c7aa46fe226"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (13.8.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich) (0.1.2)\n",
"\u001b[1;35m╭──────────────────────────╮\u001b[0m\n",
"\u001b[1;35m│\u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mStarting installation...\u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m│\u001b[0m\n",
"\u001b[1;35m╰──────────────────────────╯\u001b[0m\n",
"config.py file has been created. Please fill in the API key and base URL in the config.py file.\n",
"\u001b[36m╭──────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n",
"\u001b[36m│\u001b[0m\u001b[36m \u001b[0m\u001b[36mInstalling requests...\u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m│\u001b[0m\n",
"\u001b[36m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (2.32.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests) (2024.8.30)\n",
"\u001b[3m Whisper Model Selection \u001b[0m\n",
"┏━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mOption\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mModel \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mDescription \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
"│\u001b[36m \u001b[0m\u001b[36m1 \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mwhisperX 💻 \u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32mlocal model (can also use online model api)\u001b[0m\u001b[32m \u001b[0m│\n",
"│\u001b[36m \u001b[0m\u001b[36m2 \u001b[0m\u001b[36m \u001b[0m│\u001b[35m \u001b[0m\u001b[35mwhisperXapi ☁️\u001b[0m\u001b[35m \u001b[0m│\u001b[32m \u001b[0m\u001b[32monline model through api only \u001b[0m\u001b[32m \u001b[0m│\n",
"└────────┴───────────────┴─────────────────────────────────────────────┘\n",
"If you're unsure about the differences between models, please see \n",
"\u001b[4;94mhttps://github.com/Huanshere/VideoLingo/\u001b[0m\n",
"\u001b[36m╭──────────────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n",
"\u001b[36m│\u001b[0m\u001b[36m \u001b[0m\u001b[36mInstalling PyTorch with CUDA support...\u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m│\u001b[0m\n",
"\u001b[36m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
"Looking in indexes: https://download.pytorch.org/whl/cu118\n",
"Collecting torch==2.0.0\n",
" Downloading https://download.pytorch.org/whl/cu118/torch-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 GB\u001b[0m \u001b[31m626.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting torchaudio==2.0.0\n",
" Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.0.0%2Bcu118-cp310-cp310-linux_x86_64.whl (4.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.4/4.4 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.16.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (1.13.3)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch==2.0.0) (3.1.4)\n",
"Collecting triton==2.0.0 (from torch==2.0.0)\n",
" Downloading https://download.pytorch.org/whl/triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.3/63.3 MB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch==2.0.0) (3.30.3)\n",
"Collecting lit (from triton==2.0.0->torch==2.0.0)\n",
" Downloading https://download.pytorch.org/whl/lit-15.0.7.tar.gz (132 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.3/132.3 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch==2.0.0) (2.1.5)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch==2.0.0) (1.3.0)\n",
"Building wheels for collected packages: lit\n",
" Building wheel for lit (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for lit: filename=lit-15.0.7-py3-none-any.whl size=89990 sha256=f9f0ca0ce885e2ddbc5583433ea047a853b4034c8e2712e685d93fbed24f9204\n",
" Stored in directory: /root/.cache/pip/wheels/27/2c/b6/3ed2983b1b44fe0dea1bb35234b09f2c22fb8ebb308679c922\n",
"Successfully built lit\n",
"Installing collected packages: lit, triton, torch, torchaudio\n",
" Attempting uninstall: torch\n",
" Found existing installation: torch 2.4.1+cu121\n",
" Uninstalling torch-2.4.1+cu121:\n",
" Successfully uninstalled torch-2.4.1+cu121\n",
" Attempting uninstall: torchaudio\n",
" Found existing installation: torchaudio 2.4.1+cu121\n",
" Uninstalling torchaudio-2.4.1+cu121:\n",
" Successfully uninstalled torchaudio-2.4.1+cu121\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"torchvision 0.19.1+cu121 requires torch==2.4.1, but you have torch 2.0.0+cu118 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed lit-15.0.7 torch-2.0.0+cu118 torchaudio-2.0.0+cu118 triton-2.0.0\n",
"Installing whisperX...\n",
"Obtaining file:///content/VideoLingo/third_party/whisperX\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: torch>=2 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.0.0+cu118)\n",
"Requirement already satisfied: torchaudio>=2 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.0.0+cu118)\n",
"Collecting faster-whisper==1.0.0 (from whisperx==3.1.1)\n",
" Downloading faster_whisper-1.0.0-py3-none-any.whl.metadata (14 kB)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (4.44.2)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (2.2.2)\n",
"Requirement already satisfied: setuptools>=65 in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (71.0.4)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from whisperx==3.1.1) (3.8.1)\n",
"Collecting pyannote.audio==3.1.1 (from whisperx==3.1.1)\n",
" Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl.metadata (9.3 kB)\n",
"Collecting av==11.* (from faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)\n",
"Collecting ctranslate2<5,>=4.0 (from faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading ctranslate2-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)\n",
"Requirement already satisfied: huggingface-hub>=0.13 in /usr/local/lib/python3.10/dist-packages (from faster-whisper==1.0.0->whisperx==3.1.1) (0.24.7)\n",
"Collecting tokenizers<0.16,>=0.13 (from faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
"Collecting onnxruntime<2,>=1.14 (from faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)\n",
"Collecting asteroid-filterbanks>=0.4 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)\n",
"Requirement already satisfied: einops>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (0.8.0)\n",
"Collecting lightning>=2.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)\n",
"Collecting omegaconf<3.0,>=2.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)\n",
"Collecting pyannote.core>=5.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)\n",
"Collecting pyannote.database>=5.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pyannote.database-5.1.0-py3-none-any.whl.metadata (1.2 kB)\n",
"Collecting pyannote.metrics>=3.2 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)\n",
"Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)\n",
"Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl.metadata (17 kB)\n",
"Requirement already satisfied: rich>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (13.8.1)\n",
"Collecting semver>=3.0.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading semver-3.0.2-py3-none-any.whl.metadata (5.0 kB)\n",
"Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.1)\n",
"Collecting speechbrain>=0.5.14 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading speechbrain-1.0.1-py3-none-any.whl.metadata (24 kB)\n",
"Collecting tensorboardX>=2.6 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)\n",
"Collecting torch-audiomentations>=0.11.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading torch_audiomentations-0.11.1-py3-none-any.whl.metadata (14 kB)\n",
"Collecting torchmetrics>=0.11.0 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading torchmetrics-1.4.2-py3-none-any.whl.metadata (19 kB)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.16.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (4.12.2)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (1.13.3)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.3)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (3.1.4)\n",
"Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=2->whisperx==3.1.1) (2.0.0)\n",
"Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=2->whisperx==3.1.1) (3.30.3)\n",
"Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=2->whisperx==3.1.1) (15.0.7)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (8.1.7)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (2024.9.11)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk->whisperx==3.1.1) (4.66.5)\n",
"Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->whisperx==3.1.1) (2024.2)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (24.1)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (6.0.2)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (2.32.3)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers->whisperx==3.1.1) (0.4.5)\n",
"INFO: pip is looking at multiple versions of transformers to determine which version is compatible with other requirements. This could take a while.\n",
"Collecting transformers (from whisperx==3.1.1)\n",
" Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.44.1-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.43.2-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m552.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hINFO: pip is still looking at multiple versions of transformers to determine which version is compatible with other requirements. This could take a while.\n",
" Downloading transformers-4.43.1-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.43.0-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.7/43.7 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.42.2-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hINFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n",
" Downloading transformers-4.42.1-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.42.0-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.6/137.6 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.13->faster-whisper==1.0.0->whisperx==3.1.1) (2024.6.1)\n",
"Collecting lightning-utilities<2.0,>=0.10.0 (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)\n",
"INFO: pip is looking at multiple versions of lightning to determine which version is compatible with other requirements. This could take a while.\n",
"Collecting lightning>=2.0.1 (from pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading lightning-2.3.3-py3-none-any.whl.metadata (35 kB)\n",
"Collecting pytorch-lightning (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)\n",
"Collecting antlr4-python3-runtime==4.9.* (from omegaconf<3.0,>=2.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)\n",
"Requirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1) (24.3.25)\n",
"Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1) (3.20.3)\n",
"Requirement already satisfied: sortedcontainers>=2.0.4 in /usr/local/lib/python3.10/dist-packages (from pyannote.core>=5.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (2.4.0)\n",
"Requirement already satisfied: scipy>=1.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.core>=5.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.13.1)\n",
"Requirement already satisfied: typer>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.database>=5.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.5)\n",
"Requirement already satisfied: scikit-learn>=0.17.1 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.5.2)\n",
"Collecting docopt>=0.6.2 (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading docopt-0.6.2.tar.gz (25 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (0.9.0)\n",
"Requirement already satisfied: matplotlib>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.7.1)\n",
"Collecting optuna>=3.1 (from pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->whisperx==3.1.1) (1.16.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (2.18.0)\n",
"Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.17.1)\n",
"Collecting hyperpyyaml (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1) (0.2.0)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=2->whisperx==3.1.1) (1.3.0)\n",
"Collecting julius<0.3,>=0.2.3 (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading julius-0.2.7.tar.gz (59 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.6/59.6 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: librosa>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.10.2.post1)\n",
"Collecting torch-pitch-shift>=1.2.2 (from torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading torch_pitch_shift-1.2.5-py3-none-any.whl.metadata (2.5 kB)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=2->whisperx==3.1.1) (2.1.5)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers->whisperx==3.1.1) (2024.8.30)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.22)\n",
"Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (3.10.6)\n",
"Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (3.0.1)\n",
"Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (4.4.2)\n",
"Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.60.0)\n",
"Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.8.2)\n",
"Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.5.0.post1)\n",
"Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.4)\n",
"Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (1.0.8)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=12.0.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.1.2)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.3.0)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (4.54.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (1.4.7)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (10.4.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.0.0->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.1.4)\n",
"Collecting alembic>=1.5.0 (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)\n",
"Collecting colorlog (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n",
"Requirement already satisfied: sqlalchemy>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.0.35)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.17.1->pyannote.metrics>=3.2->pyannote.audio==3.1.1->whisperx==3.1.1) (3.5.0)\n",
"Collecting primePy>=1.3 (from torch-pitch-shift>=1.2.2->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading primePy-1.3-py3-none-any.whl.metadata (4.8 kB)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer>=0.12.1->pyannote.database>=5.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.5.4)\n",
"Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper==1.0.0->whisperx==3.1.1)\n",
" Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)\n",
"Collecting ruamel.yaml>=0.17.28 (from hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)\n",
"INFO: pip is looking at multiple versions of pytorch-lightning to determine which version is compatible with other requirements. This could take a while.\n",
"Collecting pytorch-lightning (from lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading pytorch_lightning-2.3.3-py3-none-any.whl.metadata (21 kB)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (2.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (24.2.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.4.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (6.1.0)\n",
"Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (1.12.1)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<2026.0,>=2022.5.0->lightning>=2.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (4.0.3)\n",
"Collecting Mako (from alembic>=1.5.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)\n",
"Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (0.43.0)\n",
"Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa>=0.6.0->torch-audiomentations>=0.11.0->pyannote.audio==3.1.1->whisperx==3.1.1) (4.3.6)\n",
"Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain>=0.5.14->pyannote.audio==3.1.1->whisperx==3.1.1)\n",
" Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (2.2 kB)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from sqlalchemy>=1.3.0->optuna>=3.1->pyannote.pipeline>=3.0.1->pyannote.audio==3.1.1->whisperx==3.1.1) (3.1.1)\n",
"Downloading faster_whisper-1.0.0-py3-none-any.whl (1.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyannote.audio-3.1.1-py2.py3-none-any.whl (208 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m208.7/208.7 kB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m32.9/32.9 MB\u001b[0m \u001b[31m54.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading transformers-4.39.3-py3-none-any.whl (8.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m68.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)\n",
"Downloading ctranslate2-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.2/37.2 MB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading lightning-2.3.3-py3-none-any.whl (808 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m808.5/808.5 kB\u001b[0m \u001b[31m39.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.2/13.2 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyannote.core-5.0.0-py3-none-any.whl (58 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyannote.database-5.1.0-py3-none-any.whl (48 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.1/48.1 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyannote.metrics-3.2.1-py3-none-any.whl (51 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.4/51.4 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyannote.pipeline-3.0.1-py3-none-any.whl (31 kB)\n",
"Downloading pytorch_metric_learning-2.6.0-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.3/119.3 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading semver-3.0.2-py3-none-any.whl (17 kB)\n",
"Downloading speechbrain-1.0.1-py3-none-any.whl (807 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m807.2/807.2 kB\u001b[0m \u001b[31m47.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.7/101.7 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading torch_audiomentations-0.11.1-py3-none-any.whl (50 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.1/50.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading torchmetrics-1.4.2-py3-none-any.whl (869 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m869.2/869.2 kB\u001b[0m \u001b[31m51.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading lightning_utilities-0.11.7-py3-none-any.whl (26 kB)\n",
"Downloading optuna-4.0.0-py3-none-any.whl (362 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m362.8/362.8 kB\u001b[0m \u001b[31m27.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading torch_pitch_shift-1.2.5-py3-none-any.whl (5.0 kB)\n",
"Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading HyperPyYAML-1.2.2-py3-none-any.whl (16 kB)\n",
"Downloading pytorch_lightning-2.3.3-py3-none-any.whl (812 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m812.3/812.3 kB\u001b[0m \u001b[31m48.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.2/233.2 kB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading primePy-1.3-py3-none-any.whl (4.0 kB)\n",
"Downloading ruamel.yaml-0.18.6-py3-none-any.whl (117 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.8/117.8 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n",
"Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.7/526.7 kB\u001b[0m \u001b[31m33.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading Mako-1.3.5-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: antlr4-python3-runtime, docopt, julius\n",
" Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=0d45790bcba89ef25b40e28a352826b1e3b8e0a996f3c4c71c77a2e039838c51\n",
" Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n",
" Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13704 sha256=0a394444811d2361dc868cfb7f23c797a47cad9a1ee5485aa86c20551b33a0fe\n",
" Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac\n",
" Building wheel for julius (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for julius: filename=julius-0.2.7-py3-none-any.whl size=21869 sha256=c4d0f47e1e8d2846ed38f7ba03d944e5ed1068278a16c671c7b946053b7f7446\n",
" Stored in directory: /root/.cache/pip/wheels/b9/b2/05/f883527ffcb7f2ead5438a2c23439aa0c881eaa9a4c80256f4\n",
"Successfully built antlr4-python3-runtime docopt julius\n",
"Installing collected packages: primePy, docopt, antlr4-python3-runtime, tensorboardX, semver, ruamel.yaml.clib, omegaconf, Mako, lightning-utilities, humanfriendly, ctranslate2, colorlog, av, ruamel.yaml, pyannote.core, coloredlogs, alembic, tokenizers, optuna, onnxruntime, hyperpyyaml, transformers, pyannote.database, faster-whisper, pyannote.pipeline, pyannote.metrics, torchmetrics, torch-pitch-shift, pytorch-lightning, julius, torch-audiomentations, speechbrain, pytorch-metric-learning, lightning, asteroid-filterbanks, pyannote.audio, whisperx\n",
" Attempting uninstall: tokenizers\n",
" Found existing installation: tokenizers 0.19.1\n",
" Uninstalling tokenizers-0.19.1:\n",
" Successfully uninstalled tokenizers-0.19.1\n",
" Attempting uninstall: transformers\n",
" Found existing installation: transformers 4.44.2\n",
" Uninstalling transformers-4.44.2:\n",
" Successfully uninstalled transformers-4.44.2\n",
" Running setup.py develop for whisperx\n",
"Successfully installed Mako-1.3.5 alembic-1.13.3 antlr4-python3-runtime-4.9.3 asteroid-filterbanks-0.4.0 av-11.0.0 coloredlogs-15.0.1 colorlog-6.8.2 ctranslate2-4.4.0 docopt-0.6.2 faster-whisper-1.0.0 humanfriendly-10.0 hyperpyyaml-1.2.2 julius-0.2.7 lightning-2.3.3 lightning-utilities-0.11.7 omegaconf-2.3.0 onnxruntime-1.19.2 optuna-4.0.0 primePy-1.3 pyannote.audio-3.1.1 pyannote.core-5.0.0 pyannote.database-5.1.0 pyannote.metrics-3.2.1 pyannote.pipeline-3.0.1 pytorch-lightning-2.3.3 pytorch-metric-learning-2.6.0 ruamel.yaml-0.18.6 ruamel.yaml.clib-0.2.8 semver-3.0.2 speechbrain-1.0.1 tensorboardX-2.6.2.2 tokenizers-0.15.2 torch-audiomentations-0.11.1 torch-pitch-shift-1.2.5 torchmetrics-1.4.2 transformers-4.39.3 whisperx-3.1.1\n",
"Converting requirements.txt to GBK encoding...\n",
"Conversion completed.\n",
"Installing dependencies from requirements.txt...\n",
"Collecting azure-cognitiveservices-speech==1.40.0 (from -r requirements.txt (line 1))\n",
" Downloading azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)\n",
"Requirement already satisfied: librosa==0.10.2.post1 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 2)) (0.10.2.post1)\n",
"Requirement already satisfied: moviepy==1.0.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 3)) (1.0.3)\n",
"Requirement already satisfied: numpy==1.26.4 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 4)) (1.26.4)\n",
"Collecting openai==1.47.0 (from -r requirements.txt (line 5))\n",
" Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)\n",
"Requirement already satisfied: opencv-python==4.10.0.84 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 6)) (4.10.0.84)\n",
"Requirement already satisfied: openpyxl==3.1.5 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 7)) (3.1.5)\n",
"Collecting pandas==2.2.3 (from -r requirements.txt (line 8))\n",
" Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.9/89.9 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pydub==0.25.1 (from -r requirements.txt (line 9))\n",
" Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
"Requirement already satisfied: PyYAML==6.0.2 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 10)) (6.0.2)\n",
"Collecting replicate==0.33.0 (from -r requirements.txt (line 11))\n",
" Downloading replicate-0.33.0-py3-none-any.whl.metadata (25 kB)\n",
"Requirement already satisfied: requests==2.32.3 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 12)) (2.32.3)\n",
"Collecting resampy==0.4.3 (from -r requirements.txt (line 13))\n",
" Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)\n",
"Requirement already satisfied: spacy==3.7.6 in /usr/local/lib/python3.10/dist-packages (from -r requirements.txt (line 14)) (3.7.6)\n",
"Collecting streamlit==1.38.0 (from -r requirements.txt (line 15))\n",
" Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)\n",
"Collecting yt-dlp==2024.8.6 (from -r requirements.txt (line 16))\n",
" Downloading yt_dlp-2024.8.6-py3-none-any.whl.metadata (170 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m170.1/170.1 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting json-repair (from -r requirements.txt (line 17))\n",
" Downloading json_repair-0.29.7-py3-none-any.whl.metadata (10 kB)\n",
"Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (3.0.1)\n",
"Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.13.1)\n",
"Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.5.2)\n",
"Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.4.2)\n",
"Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.4.2)\n",
"Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.60.0)\n",
"Requirement already satisfied: soundfile>=0.12.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.12.1)\n",
"Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.8.2)\n",
"Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.5.0.post1)\n",
"Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.12.2)\n",
"Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.4)\n",
"Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.0.8)\n",
"Requirement already satisfied: tqdm<5.0,>=4.11.2 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (4.66.5)\n",
"Requirement already satisfied: proglog<=1.0.0 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (0.1.10)\n",
"Requirement already satisfied: imageio<3.0,>=2.5 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (2.35.1)\n",
"Requirement already satisfied: imageio-ffmpeg>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from moviepy==1.0.3->-r requirements.txt (line 3)) (0.5.1)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (3.7.1)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (1.7.0)\n",
"Collecting httpx<1,>=0.23.0 (from openai==1.47.0->-r requirements.txt (line 5))\n",
" Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)\n",
"Collecting jiter<1,>=0.4.0 (from openai==1.47.0->-r requirements.txt (line 5))\n",
" Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n",
"Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (2.9.2)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai==1.47.0->-r requirements.txt (line 5)) (1.3.1)\n",
"Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl==3.1.5->-r requirements.txt (line 7)) (1.1.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas==2.2.3->-r requirements.txt (line 8)) (2024.2)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from replicate==0.33.0->-r requirements.txt (line 11)) (24.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (2.2.3)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.32.3->-r requirements.txt (line 12)) (2024.8.30)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (8.2.5)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (1.1.3)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (2.0.10)\n",
"Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (0.4.1)\n",
"Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (0.12.5)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.1.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (71.0.4)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy==3.7.6->-r requirements.txt (line 14)) (3.4.1)\n",
"Requirement already satisfied: altair<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (4.2.2)\n",
"Requirement already satisfied: blinker<2,>=1.0.0 in /usr/lib/python3/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (1.4)\n",
"Requirement already satisfied: cachetools<6,>=4.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (5.5.0)\n",
"Requirement already satisfied: click<9,>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (8.1.7)\n",
"Requirement already satisfied: pillow<11,>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (10.4.0)\n",
"Requirement already satisfied: protobuf<6,>=3.20 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (3.20.3)\n",
"Requirement already satisfied: pyarrow>=7.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (16.1.0)\n",
"Requirement already satisfied: rich<14,>=10.14.0 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (13.8.1)\n",
"Collecting tenacity<9,>=8.1.0 (from streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)\n",
"Requirement already satisfied: toml<2,>=0.10.1 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (0.10.2)\n",
"Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)\n",
"Collecting pydeck<1,>=0.8.0b4 (from streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)\n",
"Requirement already satisfied: tornado<7,>=6.0.3 in /usr/local/lib/python3.10/dist-packages (from streamlit==1.38.0->-r requirements.txt (line 15)) (6.3.3)\n",
"Collecting watchdog<5,>=2.1.5 (from streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)\n",
"Collecting brotli (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n",
" Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)\n",
"Collecting mutagen (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n",
" Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)\n",
"Collecting pycryptodomex (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n",
" Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
"Collecting websockets>=12.0 (from yt-dlp==2024.8.6->-r requirements.txt (line 16))\n",
" Downloading websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n",
"Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.4)\n",
"Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (4.23.0)\n",
"Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.12.1)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai==1.47.0->-r requirements.txt (line 5)) (1.2.2)\n",
"Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
"Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.47.0->-r requirements.txt (line 5))\n",
" Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)\n",
"Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.47.0->-r requirements.txt (line 5))\n",
" Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n",
"Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.10/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.2.0)\n",
"Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa==0.10.2.post1->-r requirements.txt (line 2)) (0.43.0)\n",
"Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (4.3.6)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai==1.47.0->-r requirements.txt (line 5)) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai==1.47.0->-r requirements.txt (line 5)) (2.23.4)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->spacy==3.7.6->-r requirements.txt (line 14)) (2.1.5)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas==2.2.3->-r requirements.txt (line 8)) (1.16.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (2.18.0)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa==0.10.2.post1->-r requirements.txt (line 2)) (3.5.0)\n",
"Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (1.17.1)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r requirements.txt (line 14)) (0.7.11)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy==3.7.6->-r requirements.txt (line 14)) (0.1.5)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0.0,>=0.3.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.5.4)\n",
"Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r requirements.txt (line 14)) (0.19.0)\n",
"Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r requirements.txt (line 14)) (7.0.4)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.1->librosa==0.10.2.post1->-r requirements.txt (line 2)) (2.22)\n",
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit==1.38.0->-r requirements.txt (line 15))\n",
" Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
"Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (24.2.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (2023.12.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.20.0)\n",
"Requirement already satisfied: marisa-trie>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.2.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14,>=10.14.0->streamlit==1.38.0->-r requirements.txt (line 15)) (0.1.2)\n",
"Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy==3.7.6->-r requirements.txt (line 14)) (1.16.0)\n",
"Downloading azure_cognitiveservices_speech-1.40.0-py3-none-manylinux1_x86_64.whl (40.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.1/40.1 MB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading openai-1.47.0-py3-none-any.whl (375 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m375.6/375.6 kB\u001b[0m \u001b[31m24.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m73.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
"Downloading replicate-0.33.0-py3-none-any.whl (45 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.3/45.3 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m69.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading yt_dlp-2024.8.6-py3-none-any.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading json_repair-0.29.7-py3-none-any.whl (17 kB)\n",
"Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.3/207.3 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (318 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.9/6.9 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading tenacity-8.5.0-py3-none-any.whl (28 kB)\n",
"Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl (82 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.9/82.9 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading websockets-13.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (164 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m13.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m66.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading mutagen-1.47.0-py3-none-any.whl (194 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.4/194.4 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
"Installing collected packages: pydub, brotli, websockets, watchdog, tenacity, smmap, pycryptodomex, mutagen, json-repair, jiter, h11, azure-cognitiveservices-speech, yt-dlp, resampy, pydeck, pandas, httpcore, gitdb, httpx, gitpython, replicate, openai, streamlit\n",
" Attempting uninstall: tenacity\n",
" Found existing installation: tenacity 9.0.0\n",
" Uninstalling tenacity-9.0.0:\n",
" Successfully uninstalled tenacity-9.0.0\n",
" Attempting uninstall: pandas\n",
" Found existing installation: pandas 2.2.2\n",
" Uninstalling pandas-2.2.2:\n",
" Successfully uninstalled pandas-2.2.2\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"cudf-cu12 24.6.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 2.2.3 which is incompatible.\n",
"google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed azure-cognitiveservices-speech-1.40.0 brotli-1.1.0 gitdb-4.0.11 gitpython-3.1.43 h11-0.14.0 httpcore-1.0.6 httpx-0.27.2 jiter-0.5.0 json-repair-0.29.7 mutagen-1.47.0 openai-1.47.0 pandas-2.2.3 pycryptodomex-3.21.0 pydeck-0.9.1 pydub-0.25.1 replicate-0.33.0 resampy-0.4.3 smmap-5.0.1 streamlit-1.38.0 tenacity-8.5.0 watchdog-4.0.2 websockets-13.1 yt-dlp-2024.8.6\n",
"Downloading UVR model: HP2_all_vocals.pth...\n",
"Downloaded: 0.01%\n",
"HP2_all_vocals.pth downloaded successfully.\n",
"Downloading UVR model: VR-DeEchoAggressive.pth...\n",
"Downloaded: 0.00%\n",
"VR-DeEchoAggressive.pth downloaded successfully.\n",
"Downloading FFmpeg...\n",
"FFmpeg has been downloaded to ffmpeg.tar.xz\n",
"Extracting FFmpeg...\n",
"Cleaning up...\n",
"FFmpeg extraction completed.\n",
"\u001b[1;32m╭───────────────────────────────────────╮\u001b[0m\n",
"\u001b[1;32m│\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32mAll installation steps are completed!\u001b[0m\u001b[1;32m \u001b[0m\u001b[1;32m│\u001b[0m\n",
"\u001b[1;32m╰───────────────────────────────────────╯\u001b[0m\n",
"Please use the following command to start Streamlit:\n",
"\u001b[1;36mstreamlit run st.py\u001b[0m\n"
]
}
],
"source": [
"!python install.py"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5J2jjEY2BrM3"
},
"source": [
"## 3. Register and Obtain Ngrok Token 🔑\n",
"\n",
"1. Visit the [Ngrok official website](https://ngrok.com/) and register for an account.\n",
"2. After logging in, find the \"Your Authtoken\" section on the dashboard page, or directly visit [Ngrok Token](https://dashboard.ngrok.com/get-started/your-authtoken).\n",
"3. Copy your Ngrok Authtoken.\n",
"\n",
"After completing these steps, please fill in your ngrok token in the next section of code and proceed.\n",
"\n",
"---\n",
"\n",
"## 3. 注册并获取 Ngrok 令牌 🔑\n",
"\n",
"1. 访问 [Ngrok 官方网站](https://ngrok.com/) 并注册账户。\n",
"2. 登录后,在仪表板页面找到\"Your Authtoken\"部分,或直接访问 [Ngrok 令牌](https://dashboard.ngrok.com/get-started/your-authtoken)。\n",
"3. 复制您的 Ngrok Authtoken。\n",
"\n",
"完成这些步骤后,请在下一节代码中填入您的 ngrok 令牌并继续。"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tm25rblnBqhl",
"outputId": "93691228-1e7b-48e7-b7f3-c27007a4a13f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pyngrok in /usr/local/lib/python3.10/dist-packages (7.2.0)\n",
"Requirement already satisfied: PyYAML>=5.1 in /usr/local/lib/python3.10/dist-packages (from pyngrok) (6.0.2)\n"
]
}
],
"source": [
"!pip install pyngrok\n",
"from pyngrok import ngrok\n",
"\n",
"#! SET Ngrok Authtoken Here\n",
"ngrok.set_auth_token(\"YOUR_TOKEN_HERE\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MDJXtgMuGXMH"
},
"source": [
"## 🎈 4. Streamlit GO !!!\n",
"Click the NgrokTunnel URL printed below to start your VideoLingo journey.\n",
"\n",
"> Tip: You can set your display language in the sidebar on the left."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 294
},
"id": "qr_a4mS29k5_",
"outputId": "e6915708-d030-45d0-d6a8-cb177960c137"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.\n",
"\n"
]
},
{
"data": {
"text/html": [
"
╭───────────────────────────────────╮\n",
"│ Streamlit is available at Ngrok ⬇️ │\n",
"╰───────────────────────────────────╯\n",
"
\n"
],
"text/plain": [
"╭───────────────────────────────────╮\n",
"│ Streamlit is available at Ngrok ⬇️ │\n",
"╰───────────────────────────────────╯\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Click 👉 NgrokTunnel: \"https://0308-34-125-196-161.ngrok-free.app\" -> \"http://localhost:8501\"\n",
"\n",
" You can now view your Streamlit app in your browser.\n",
"\n",
" Local URL: http://localhost:8501\n",
" Network URL: http://172.28.0.12:8501\n",
" External URL: http://34.125.196.161:8501\n",
"\n",
" Stopping...\n",
"Interrupted by user, shutting down...\n"
]
}
],
"source": [
"import subprocess\n",
"import threading\n",
"import sys\n",
"from pyngrok import ngrok\n",
"from rich import print as rprint\n",
"from rich.panel import Panel\n",
"\n",
"def print_output(process):\n",
" for line in iter(process.stdout.readline, ''):\n",
" sys.stdout.write(line)\n",
" for line in iter(process.stderr.readline, ''):\n",
" sys.stderr.write(line)\n",
"\n",
"# Start Streamlit\n",
"streamlit_process = subprocess.Popen(\n",
" [\"streamlit\", \"run\", \"st.py\"],\n",
" stdout=subprocess.PIPE,\n",
" stderr=subprocess.PIPE,\n",
" universal_newlines=True,\n",
" bufsize=1\n",
")\n",
"\n",
"# Create and start the output printing thread\n",
"output_thread = threading.Thread(target=print_output, args=(streamlit_process,))\n",
"output_thread.start()\n",
"\n",
"# Create a tunnel using ngrok\n",
"public_url = ngrok.connect(8501)\n",
"rprint(Panel(f\"Streamlit is available at Ngrok ⬇️\", expand=False))\n",
"print(f\"Click 👉 {public_url}\")\n",
"\n",
"# Keep the program running\n",
"ngrok_process = ngrok.get_ngrok_process()\n",
"try:\n",
" streamlit_process.wait()\n",
"except KeyboardInterrupt:\n",
" print(\"Interrupted by user, shutting down...\")\n",
"finally:\n",
" ngrok.kill()\n",
" streamlit_process.terminate()\n",
" output_thread.join()\n"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
================================================
FILE: batch/OneKeyBatch.bat
================================================
@echo off
@rem Switch to the folder containing this script (changing drive if needed), then go up to its parent directory
cd /D "%~dp0"
cd ..
@rem Activate the conda environment named "videolingo"
call conda activate videolingo
@rem Run the batch processing script
call python batch\utils\batch_processor.py
:end
@rem Wait for a key press so the console output stays visible
pause
================================================
FILE: batch/README.md
================================================
# VideoLingo Batch Mode
[English](./README.md) | [简体中文](./README.zh.md)
Before utilizing the batch mode, ensure you have used the Streamlit mode and properly configured the parameters in `config.yaml`.
## Usage Guide
### 1. Video File Preparation
- Place your video files in the `input` folder
- YouTube links can be specified in the next step
### 2. Task Configuration
Edit the `tasks_setting.xlsx` file:
| Field | Description | Acceptable Values |
|-------|-------------|-------------------|
| Video File | Video filename (without `input/` prefix) or YouTube URL | - |
| Source Language | Source language | 'en', 'zh', ... or leave empty for default |
| Target Language | Translation language | Use natural language description, or leave empty for default |
| Dubbing | Enable dubbing | 0 or empty: no dubbing; 1: enable dubbing |
Example:
| Video File | Source Language | Target Language | Dubbing |
|------------|-----------------|-----------------|---------|
| https://www.youtube.com/xxx | | German | |
| Kungfu Panda.mp4 | | | 1 |
### 3. Executing Batch Processing
1. Double-click to run `OneKeyBatch.bat`
2. Output files will be saved in the `output` folder
3. Task status can be monitored in the `Status` column of `tasks_setting.xlsx`
> Note: Keep `tasks_setting.xlsx` closed during execution to prevent interruptions due to file access conflicts.
## Important Considerations
### Handling Interruptions
If the command line is closed unexpectedly, language settings in `config.yaml` may be altered. Check settings before retrying.
### Error Management
- Failed files will be moved to the `output/ERROR` folder
- Error messages are recorded in the `Status` column of `tasks_setting.xlsx`
- To retry:
1. Move the single video folder from `ERROR` to the root directory
2. Rename it to `output`
3. Use Streamlit mode to process again
================================================
FILE: batch/README.zh.md
================================================
# VideoLingo Batch Mode
[English](./README.md) | [简体中文](./README.zh.md)
在使用批处理模式前,请确保你已经使用过 Streamlit 模式并正确设置了 `config.yaml` 中的参数。
## 使用方法
### 1. 准备视频文件
- 将要处理的视频文件放入 `input` 文件夹
- YouTube 链接可在下一步填写
### 2. 配置任务
编辑 `tasks_setting.xlsx` 文件:
| 字段 | 说明 | 可选值 |
|------|------|--------|
| Video File | 视频文件名(无需 `input/` 前缀)或 YouTube 链接 | - |
| Source Language | 源语言 | 'en', 'zh', ... 或留空使用默认设置 |
| Target Language | 翻译语言 | 使用自然语言描述,或留空使用默认设置 |
| Dubbing | 是否配音 | 0 或留空:不配音;1:配音 |
示例:
| Video File | Source Language | Target Language | Dubbing |
|------------|-----------------|-----------------|---------|
| https://www.youtube.com/xxx | | German | |
| Kungfu Panda.mp4 | | | 1 |
### 3. 运行批处理
1. 双击运行 `OneKeyBatch.bat`
2. 输出文件将保存在 `output` 文件夹
3. 任务状态可在 `tasks_setting.xlsx` 的 `Status` 列查看
> 注意在运行时保持 `tasks_setting.xlsx` 关闭,否则会因占用无法写入而中断。
## 注意事项
### 中断处理
如果中途关闭命令行,`config.yaml` 中的语言设置可能会改变。重试前请检查设置。
### 错误处理
- 处理失败的文件会被移至 `output/ERROR` 文件夹
- 错误信息记录在 `tasks_setting.xlsx` 的 `Status` 列
- 如需重试:
1. 将 `ERROR` 下的单个视频文件夹移至根目录
2. 重命名为 `output`
3. 使用 Streamlit 模式重新执行
================================================
FILE: batch/utils/batch_processor.py
================================================
import os
import gc
from batch.utils.settings_check import check_settings
from batch.utils.video_processor import process_video
from core.utils.config_utils import load_key, update_key
import pandas as pd
from rich.console import Console
from rich.panel import Panel
import time
import shutil
# Shared rich console used for all status output in this module.
console = Console()
def record_and_update_config(source_language, target_language):
    """Apply per-task language overrides and return the values they replace.

    The current `whisper.language` and `target_language` settings are read
    first. Each override is then written with `update_key` only when it is
    truthy and not NaN (an empty spreadsheet cell).

    Returns:
        (original_source_lang, original_target_lang) as they were before any
        override was applied, so the caller can restore them afterwards.
    """
    previous = (load_key('whisper.language'), load_key('target_language'))

    overrides = (
        ('whisper.language', source_language),
        ('target_language', target_language),
    )
    for config_key, requested in overrides:
        if requested and not pd.isna(requested):
            update_key(config_key, requested)

    return previous
def process_batch():
    """Run every pending or previously failed task listed in the settings sheet.

    Rows of `batch/tasks_setting.xlsx` whose `Status` is empty or contains
    "Error" are processed; all other rows are skipped. For each processed row
    the language overrides are applied, `process_video` is run, the original
    language settings are restored, and the resulting status is written back
    to the spreadsheet immediately.

    Raises:
        Exception: if `check_settings()` reports a problem.
    """
    if not check_settings():
        raise Exception("Settings check failed")

    settings_path = 'batch/tasks_setting.xlsx'
    df = pd.read_excel(settings_path)
    # An all-empty Status column is loaded as float64; make it an object column
    # so status strings can be stored without an incompatible-dtype assignment.
    df['Status'] = df['Status'].astype(object)
    total_tasks = len(df)

    for index, row in df.iterrows():
        video_file = row['Video File']
        previous_status = row['Status']
        is_retry = not pd.isna(previous_status) and 'Error' in str(previous_status)

        if not (pd.isna(previous_status) or is_retry):
            print(f"Skipping task: {video_file} - Status: {previous_status}")
            continue

        if is_retry:
            console.print(Panel(f"Retrying failed task: {video_file}\nTask {index + 1}/{total_tasks}",
                                title="[bold yellow]Retry Task", expand=False))
            _restore_error_files(video_file)
        else:
            console.print(Panel(f"Now processing task: {video_file}\nTask {index + 1}/{total_tasks}",
                                title="[bold blue]Current Task", expand=False))

        original_source_lang, original_target_lang = record_and_update_config(
            row['Source Language'], row['Target Language'])

        # Pre-set so the `finally` block always has a value to record, even when
        # the task is aborted by something that is not an `Exception`
        # (e.g. KeyboardInterrupt); the interrupt itself still propagates.
        status_msg = "Error: Interrupted before completion"
        try:
            dubbing = 0 if pd.isna(row['Dubbing']) else int(row['Dubbing'])
            status, error_step, error_message = process_video(video_file, dubbing, is_retry)
            status_msg = "Done" if status else f"Error: {error_step} - {error_message}"
        except Exception as e:
            status_msg = f"Error: Unhandled exception - {str(e)}"
            console.print(f"[bold red]Error processing {video_file}: {status_msg}")
        finally:
            # Always put the global language settings back before the next task.
            update_key('whisper.language', original_source_lang)
            update_key('target_language', original_target_lang)

            # Persist progress after every task so an aborted run can resume.
            df.at[index, 'Status'] = status_msg
            df.to_excel(settings_path, index=False)

            gc.collect()
            time.sleep(1)

    console.print(Panel("All tasks processed!\nCheck out in `batch/output`!",
                        title="[bold green]Batch Processing Complete", expand=False))


def _restore_error_files(video_file):
    """Copy a failed task's saved files from batch/output/ERROR back into output/."""
    error_folder = os.path.join('batch', 'output', 'ERROR', os.path.splitext(video_file)[0])
    if not os.path.exists(error_folder):
        console.print(f"[yellow]Warning: Error folder not found: {error_folder}")
        return

    os.makedirs('output', exist_ok=True)
    for item in os.listdir(error_folder):
        src_path = os.path.join(error_folder, item)
        dst_path = os.path.join('output', item)
        # Replace any stale copy already sitting in output/.
        if os.path.isdir(src_path):
            if os.path.exists(dst_path):
                shutil.rmtree(dst_path)
            shutil.copytree(src_path, dst_path)
        else:
            if os.path.exists(dst_path):
                os.remove(dst_path)
            shutil.copy2(src_path, dst_path)
    console.print(f"[green]Restored files from ERROR folder for {video_file}")
# Run the whole batch when this file is executed as a script.
if __name__ == "__main__":
process_batch()
================================================
FILE: batch/utils/settings_check.py
================================================
import os
import pandas as pd
from rich.console import Console
from rich.panel import Panel
# Constants
# Spreadsheet that lists the batch tasks (read by check_settings).
SETTINGS_FILE = 'batch/tasks_setting.xlsx'
# Folder scanned for local video files; created by check_settings if missing.
INPUT_FOLDER = os.path.join('batch', 'input')
# Accepted values for the "Dubbing" column after conversion to int.
VALID_DUBBING_VALUES = [0, 1]
# Shared rich console used to print the check results.
console = Console()
def check_settings():
    """Validate the batch task spreadsheet against the input folder.

    Checks performed:
      * every file in `INPUT_FOLDER` is mentioned in the "Video File" column;
      * every "Video File" cell is either an http(s) URL or an existing file
        in `INPUT_FOLDER`;
      * every non-empty "Dubbing" cell converts to an int in
        `VALID_DUBBING_VALUES`.

    Empty or non-text "Video File" cells and non-numeric "Dubbing" cells are
    reported as errors for that row instead of aborting the whole check.

    Returns:
        bool: True when all checks pass, False otherwise.
    """
    os.makedirs(INPUT_FOLDER, exist_ok=True)
    df = pd.read_excel(SETTINGS_FILE)
    input_files = set(os.listdir(INPUT_FOLDER))
    # Ignore blank / non-text cells here; they are reported per row below.
    excel_files = {name for name in df['Video File'].tolist() if isinstance(name, str)}
    files_not_in_excel = input_files - excel_files

    all_passed = True
    local_video_tasks = 0
    url_tasks = 0

    if files_not_in_excel:
        console.print(Panel(
            "\n".join([f"- {file}" for file in sorted(files_not_in_excel)]),
            title="[bold red]Warning: Files in input folder not mentioned in Excel sheet",
            expand=False
        ))
        all_passed = False

    for index, row in df.iterrows():
        video_file = row['Video File']
        dubbing = row['Dubbing']
        # +2 converts the 0-based frame index to the spreadsheet row number
        # (one header row, 1-based rows).
        row_title = f"[bold red]Error in row {index + 2}"

        if isinstance(video_file, str) and video_file.startswith('http'):
            url_tasks += 1
        elif isinstance(video_file, str) and os.path.isfile(os.path.join(INPUT_FOLDER, video_file)):
            local_video_tasks += 1
        else:
            console.print(Panel(f"Invalid video file or URL 「{video_file}」", title=row_title, expand=False))
            all_passed = False

        if not pd.isna(dubbing):
            try:
                dubbing_ok = int(dubbing) in VALID_DUBBING_VALUES
            except (TypeError, ValueError):
                # Non-numeric text such as "yes" is an invalid value, not a crash.
                dubbing_ok = False
            if not dubbing_ok:
                console.print(Panel(f"Invalid dubbing value 「{dubbing}」", title=row_title, expand=False))
                all_passed = False

    if all_passed:
        console.print(Panel(f"✅ All settings passed the check!\nDetected {local_video_tasks} local video tasks and {url_tasks} URL tasks.", title="[bold green]Success", expand=False))

    return all_passed
# Run the settings check on its own when this file is executed as a script.
if __name__ == "__main__":
check_settings()
================================================
FILE: batch/utils/video_processor.py
================================================
import os
from core.st_utils.imports_and_utils import *
from core.utils.onekeycleanup import cleanup
from core.utils import load_key
import shutil
from functools import partial
from rich.panel import Panel
from rich.console import Console
from core import *
# Shared rich console used for step and error panels.
console = Console()
# Folder holding the local input videos for batch tasks.
INPUT_DIR = 'batch/input'
# Working folder that process_video resets and fills for the current task.
OUTPUT_DIR = 'output'
# Destination passed to cleanup() after a successful run.
SAVE_DIR = 'batch/output'
# Destination passed to cleanup() after a step fails on its last attempt.
ERROR_OUTPUT_DIR = 'batch/output/ERROR'
# Config key read with load_key() to pick the YouTube download resolution.
YTB_RESOLUTION_KEY = "ytb_resolution"
def process_video(file, dubbing=False, is_retry=False):
    """Run the processing pipeline for one video, retrying each step up to 3 times.

    Args:
        file: Local file name or http(s) URL, forwarded to `process_input_file`.
        dubbing: When truthy, the dubbing steps are appended to the subtitle steps.
        is_retry: When falsy, `OUTPUT_DIR` is reset before the pipeline starts.

    Returns:
        `(True, "", "")` when every step succeeds, otherwise
        `(False, <name of the failed step>, <error message>)`.
        `cleanup` is called with `SAVE_DIR` on success and with
        `ERROR_OUTPUT_DIR` on failure.
    """
    if not is_retry:
        prepare_output_folder(OUTPUT_DIR)

    pipeline = [
        ("🎥 Processing input file", partial(process_input_file, file)),
        ("🎙️ Transcribing with Whisper", partial(_2_asr.transcribe)),
        ("✂️ Splitting sentences", split_sentences),
        ("📝 Summarizing and translating", summarize_and_translate),
        ("⚡ Processing and aligning subtitles", process_and_align_subtitles),
        ("🎬 Merging subtitles to video", _7_sub_into_vid.merge_subtitles_to_video),
    ]
    if dubbing:
        pipeline += [
            ("🔊 Generating audio tasks", gen_audio_tasks),
            ("🎵 Extracting reference audio", _9_refer_audio.extract_refer_audio_main),
            ("🗣️ Generating audio", _10_gen_audio.gen_audio),
            ("🔄 Merging full audio", _11_merge_audio.merge_full_audio),
            ("🎞️ Merging dubbing to video", _12_dub_to_vid.merge_video_audio),
        ]

    max_attempts = 3
    for step_name, step_func in pipeline:
        for attempt_no in range(1, max_attempts + 1):
            # The attempt counter is only shown once a retry is under way.
            retry_note = f"Attempt {attempt_no}/3" if attempt_no > 1 else None
            try:
                console.print(Panel(
                    f"[bold green]{step_name}[/]",
                    subtitle=retry_note,
                    border_style="blue"
                ))
                outcome = step_func()
                if outcome is not None:
                    # Steps may return a dict; its entries are published as
                    # module-level globals of this file.
                    globals().update(outcome)
                break
            except Exception as exc:
                if attempt_no == max_attempts:
                    console.print(Panel(
                        f"[bold red]Error in step '{step_name}':[/]\n{str(exc)}",
                        border_style="red"
                    ))
                    cleanup(ERROR_OUTPUT_DIR)
                    return False, step_name, str(exc)
                console.print(Panel(
                    f"[yellow]Attempt {attempt_no} failed. Retrying...[/]",
                    border_style="yellow"
                ))

    console.print(Panel("[bold green]All steps completed successfully! 🎉[/]", border_style="green"))
    cleanup(SAVE_DIR)
    return True, "", ""
def prepare_output_folder(output_folder):
    """Leave `output_folder` as a freshly created, empty directory."""
    already_there = os.path.exists(output_folder)
    if already_there:
        # Discard whatever a previous run left behind.
        shutil.rmtree(output_folder)
    os.makedirs(output_folder)
def process_input_file(file):
    """Bring the task's video into the working folder and report its path.

    Args:
        file: An http(s) URL, or the name of a file inside `INPUT_DIR`.

    Returns:
        dict: `{'video_file': <path>}`. For URLs the path is whatever
        `_1_ytdlp.find_video_files()` returns after the download; for local
        files it is the copy placed in `OUTPUT_DIR`.
    """
    if file.startswith('http'):
        _1_ytdlp.download_video_ytdlp(file, resolution=load_key(YTB_RESOLUTION_KEY))
        video_file = _1_ytdlp.find_video_files()
    else:
        input_file = os.path.join(INPUT_DIR, file)
        output_file = os.path.join(OUTPUT_DIR, file)
        # On a retry the working folder is not reset and may not exist yet.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        shutil.copy(input_file, output_file)
        video_file = output_file
    return {'video_file': video_file}
def split_sentences():
"""Pipeline step: run the spaCy split, then the meaning-based split, in that order."""
_3_1_split_nlp.split_by_spacy()
_3_2_split_meaning.split_sentences_by_meaning()
def summarize_and_translate():
"""Pipeline step: call get_summary() and then translate_all()."""
_4_1_summarize.get_summary()
_4_2_translate.translate_all()
def process_and_align_subtitles():
"""Pipeline step: call split_for_sub_main() and then align_timestamp_main()."""
_5_split_sub.split_for_sub_main()
_6_gen_sub.align_timestamp_main()
def gen_audio_tasks():
"""Pipeline step: call gen_audio_task_main() and then gen_dub_chunks()."""
_8_1_audio_task.gen_audio_task_main()
_8_2_dub_chunks.gen_dub_chunks()
================================================
FILE: config.yaml
================================================
# * Settings marked with * are advanced settings that do not appear on the Streamlit page and can only be changed by editing this file (config.yaml) manually
# recommend to set in streamlit page
# -------------------
# version: "3.0.0"
# author: "Huanshere"
# -------------------
## ======================== Basic Settings ======================== ##
display_language: "zh-CN"
# API settings
api:
key: 'your-api-key'
base_url: 'https://yunwu.ai'
model: 'gpt-4.1-2025-04-14'
llm_support_json: false
# *Number of LLM multi-threaded accesses, set to 1 if using local LLM
max_workers: 4
# Language settings, written into the prompt, can be described in natural language
target_language: '简体中文'
# Whether to use Demucs for vocal separation before transcription
demucs: true
whisper:
# ["large-v3", "large-v3-turbo"]. Note: when the language is zh, the Belle/large-v3 model is forced regardless of this setting
model: 'large-v3'
# Whisper specified recognition language ISO 639-1
language: 'en'
detected_language: 'en'
# Whisper running mode ["local", "cloud", "elevenlabs"]. Specifies where to run, cloud uses 302.ai API
runtime: 'local'
# 302.ai API key
whisperX_302_api_key: 'your_302_api_key'
# ElevenLabs API key (experimental)
elevenlabs_api_key: 'your_elevenlabs_api_key'
# Whether to burn subtitles into the video
burn_subtitles: true
## ======================== Advanced Settings ======================== ##
# *🔬 h264_nvenc GPU acceleration for ffmpeg, make sure your GPU supports it
ffmpeg_gpu: false
# *Youtube settings
youtube:
cookies_path: ''
# *Default resolution for downloading YouTube videos [360, 1080, best]
ytb_resolution: '1080'
subtitle:
# *Maximum length of each subtitle line in characters
max_length: 75
# *Translated subtitles are slightly larger than source subtitles, affecting the reference length for subtitle splitting
target_multiplier: 1.2
# *Summary length, set low to 2k if using local LLM
summary_length: 8000
# *Maximum number of words for the first rough cut, below 18 will cut too finely affecting translation, above 22 is too long and will make subsequent subtitle splitting difficult to align
max_split_length: 20
# *Whether to reflect the translation result in the original text
reflect_translate: true
# *Whether to pause after extracting professional terms and before translation, allowing users to manually adjust the terminology table output\log\terminology.json
pause_before_translate: false
## ======================== Dubbing Settings ======================== ##
# TTS selection [sf_fish_tts, openai_tts, gpt_sovits, azure_tts, fish_tts, edge_tts, custom_tts]
tts_method: 'azure_tts'
# SiliconFlow FishTTS
sf_fish_tts:
# SiliconFlow API key
api_key: 'YOUR_API_KEY'
# only for mode "preset"
voice: 'anna'
# *only for mode "custom", dont set manually
custom_name: ''
voice_id: ''
# preset, custom, dynamic
mode: "preset"
# OpenAI TTS-1 API configuration, 302.ai API only
openai_tts:
api_key: 'YOUR_302_API_KEY'
voice: 'alloy'
# Azure configuration, 302.ai API only
azure_tts:
api_key: 'YOUR_302_API_KEY'
voice: 'zh-CN-YunfengNeural'
# FishTTS configuration, 302.ai API only
fish_tts:
api_key: 'YOUR_302_API_KEY'
character: 'AD学姐'
character_id_dict:
'AD学姐': '7f92f8afb8ec43bf81429cc1c9199cb1'
'丁真': '54a5170264694bfc8e9ad98df7bd89c3'
# SiliconFlow CosyVoice2 Clone
sf_cosyvoice2:
api_key: 'YOUR_SF_KEY'
# Edge TTS configuration
edge_tts:
voice: 'zh-CN-XiaoxiaoNeural'
# SoVITS configuration
gpt_sovits:
character: 'Huanyuv2'
refer_mode: 3
f5tts:
302_api: 'YOUR_302_API_KEY'
# *Audio speed range
speed_factor:
min: 1
accept: 1.2 # Maximum acceptable speed
max: 1.4
# *Merge audio configuration
min_subtitle_duration: 2.5 # Minimum subtitle duration, will be forcibly extended
min_trim_duration: 3.5 # Subtitles shorter than this value won't be split
tolerance: 1.5 # Allowed extension time to the next subtitle
## ======================== Additional settings ======================== ##
# Whisper model directory
model_dir: './_model_cache'
# Supported upload video formats
allowed_video_formats:
- 'mp4'
- 'mov'
- 'avi'
- 'mkv'
- 'flv'
- 'wmv'
- 'webm'
allowed_audio_formats:
- 'wav'
- 'mp3'
- 'flac'
- 'm4a'
# Spacy models
spacy_model_map:
en: 'en_core_web_md'
ru: 'ru_core_news_md'
fr: 'fr_core_news_md'
ja: 'ja_core_news_md'
es: 'es_core_news_md'
de: 'de_core_news_md'
it: 'it_core_news_md'
zh: 'zh_core_web_md'
# Languages that use space as separator
language_split_with_space:
- 'en'
- 'es'
- 'fr'
- 'de'
- 'it'
- 'ru'
# Languages that do not use space as separator
language_split_without_space:
- 'zh'
- 'ja'
================================================
FILE: core/_10_gen_audio.py
================================================
import os
import time
import shutil
import subprocess
from typing import Tuple
import pandas as pd
from pydub import AudioSegment
from rich.console import Console
from rich.progress import Progress
from concurrent.futures import ThreadPoolExecutor, as_completed
from core.utils import *
from core.utils.models import *
from core.asr_backend.audio_preprocess import get_audio_duration
from core.tts_backend.tts_main import tts_main
# Shared rich console for this module.
console = Console()
# Path template for raw TTS clips; filled with "<number>_<line_index>".
TEMP_FILE_TEMPLATE = f"{_AUDIO_TMP_DIR}/{{}}_temp.wav"
# Path template for speed-adjusted clips; filled with "<number>_<line_index>".
OUTPUT_FILE_TEMPLATE = f"{_AUDIO_SEGS_DIR}/{{}}.wav"
# Number of leading rows generated one by one before parallel generation starts.
WARMUP_SIZE = 5
def parse_df_srt_time(time_str: str) -> float:
    """Convert an `HH:MM:SS.mmm` / `HH:MM:SS,mmm` timestamp to seconds.

    Surrounding whitespace is ignored. The fractional part may use either
    "." or the SRT-standard ",", may have any number of digits, and may be
    omitted entirely.

    Raises:
        ValueError: if the string does not have three ":"-separated fields
            or a field is not an integer.
    """
    hours, minutes, seconds = time_str.strip().split(':')
    whole, _, fraction = seconds.replace(',', '.').partition('.')
    # Scale by the number of digits so ".5" means 0.5 s rather than 5 ms.
    fractional_seconds = int(fraction) / 10 ** len(fraction) if fraction else 0.0
    return int(hours) * 3600 + int(minutes) * 60 + int(whole) + fractional_seconds
def adjust_audio_speed(input_file: str, output_file: str, speed_factor: float) -> None:
    """Write a tempo-changed copy of `input_file` to `output_file` using ffmpeg.

    A factor within 0.001 of 1.0 is treated as "no change" and the file is
    copied as-is. Otherwise ffmpeg's `atempo` filter is run (up to two tries
    when ffmpeg itself fails) and the resulting duration is checked against
    `input duration / speed_factor`.

    Raises:
        Exception: the output is at least 2% longer than expected and cannot
            be fixed by trimming.
        subprocess.CalledProcessError: ffmpeg failed on the last attempt.
    """
    if abs(speed_factor - 1.0) < 0.001:
        shutil.copy2(input_file, output_file)
        return

    ffmpeg_args = ['ffmpeg', '-i', input_file, '-filter:a', f'atempo={speed_factor}', '-y', output_file]
    source_len = get_audio_duration(input_file)
    max_retries = 2

    for attempt in range(max_retries):
        try:
            subprocess.run(ffmpeg_args, check=True, stderr=subprocess.PIPE)
            result_len = get_audio_duration(output_file)
            target_len = source_len / speed_factor
            overshoot = result_len - target_len

            if not (result_len >= target_len * 1.02):
                return

            # Too long: short inputs (< 3 s) that overshoot by at most 0.1 s
            # are simply cut back to the expected length.
            if source_len < 3 and overshoot <= 0.1:
                clip = AudioSegment.from_wav(output_file)
                shortened = clip[:(target_len * 1000)]  # pydub uses milliseconds
                shortened.export(output_file, format="wav")
                print(f"✂️ Trimmed to expected duration: {target_len:.2f} seconds")
                return

            raise Exception(f"Audio duration abnormal: input file={input_file}, output file={output_file}, speed factor={speed_factor}, input duration={source_len:.2f}s, output duration={result_len:.2f}s")
        except subprocess.CalledProcessError as e:
            out_of_retries = not (attempt < max_retries - 1)
            if out_of_retries:
                rprint(f"[red]❌ Audio speed adjustment failed, max retries reached ({max_retries})[/red]")
                raise e
            rprint(f"[yellow]⚠️ Audio speed adjustment failed, retrying in 1s ({attempt + 1}/{max_retries})[/yellow]")
            time.sleep(1)
def process_row(row: pd.Series, tasks_df: pd.DataFrame) -> Tuple[int, float]:
    """Synthesize every line of one task row and total the clip durations.

    Each line is rendered by `tts_main` into the file named by
    `TEMP_FILE_TEMPLATE` for "<number>_<line_index>", then measured with
    `get_audio_duration`.

    Returns:
        (row number, sum of the measured durations).
    """
    number = row['number']
    raw_lines = row['lines']
    # NOTE(review): eval() runs on text loaded from the task spreadsheet;
    # this is only safe while that file is trusted.
    lines = eval(raw_lines) if isinstance(raw_lines, str) else raw_lines

    def _synthesize(line_index, text):
        clip_path = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}")
        tts_main(text, clip_path, number, tasks_df)
        return get_audio_duration(clip_path)

    total = sum(_synthesize(line_index, text) for line_index, text in enumerate(lines))
    return number, total
def generate_tts_audio(tasks_df: pd.DataFrame) -> pd.DataFrame:
    """Generate TTS audio for every task row and record its measured duration.

    The first `WARMUP_SIZE` rows are processed one at a time; the remaining
    rows are submitted to a thread pool. The pool uses the `max_workers`
    setting, except for the `gpt_sovits` TTS method, which is forced to a
    single worker.

    Args:
        tasks_df: Task table with at least `number` and `lines` columns.

    Returns:
        The same DataFrame, with a float `real_dur` column filled in.

    Raises:
        Exception: the first error raised while processing a row is
            re-raised after being printed.
    """
    # Float from the start: durations are fractional, and assigning floats
    # into an integer column relies on pandas' deprecated silent upcasting.
    tasks_df['real_dur'] = 0.0
    rprint("[bold green]🎯 Starting TTS audio generation...[/bold green]")

    with Progress() as progress:
        task = progress.add_task("[cyan]🔄 Generating TTS audio...", total=len(tasks_df))

        # Warm-up: handle the first rows sequentially.
        warmup_size = min(WARMUP_SIZE, len(tasks_df))
        for _, row in tasks_df.head(warmup_size).iterrows():
            try:
                number, real_dur = process_row(row, tasks_df)
                tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur
                progress.advance(task)
            except Exception as e:
                rprint(f"[red]❌ Error in warmup: {str(e)}[/red]")
                raise e

        # for gpt_sovits, do not use parallel to avoid mistakes
        max_workers = load_key("max_workers") if load_key("tts_method") != "gpt_sovits" else 1

        # Parallel processing for the remaining rows.
        if len(tasks_df) > warmup_size:
            remaining_tasks = tasks_df.iloc[warmup_size:].copy()
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Each worker gets its own copy of the table, so it never
                # shares the frame this thread is writing results into.
                futures = [
                    executor.submit(process_row, row, tasks_df.copy())
                    for _, row in remaining_tasks.iterrows()
                ]

                for future in as_completed(futures):
                    try:
                        number, real_dur = future.result()
                        tasks_df.loc[tasks_df['number'] == number, 'real_dur'] = real_dur
                        progress.advance(task)
                    except Exception as e:
                        rprint(f"[red]❌ Error: {str(e)}[/red]")
                        raise e

    rprint("[bold green]✨ TTS audio generation completed![/bold green]")
    return tasks_df
def process_chunk(chunk_df: pd.DataFrame, accept: float, min_speed: float) -> tuple[float, bool]:
    """Choose the playback speed for one chunk and whether its gaps are kept.

    Four cases are tried in order, comparing the audio length at the
    `accept` speed with the available time:
      1. speech + gaps fit the time without the last row's tolerance -> keep gaps;
      2. speech alone fits that time                                  -> drop gaps;
      3. speech + gaps fit the time including the tolerance           -> keep gaps;
      4. otherwise speech is squeezed into the tolerant time          -> drop gaps
         (this case is not clamped to `min_speed`).
    A 0.1 s safety margin is subtracted from the available time when the
    factor is computed.

    Returns:
        (speed factor rounded to 3 decimals, keep_gaps flag).
    """
    last_row = chunk_df.iloc[-1]
    speech_total = chunk_df['real_dur'].sum()
    tolerant_budget = chunk_df['tol_dur'].sum()
    strict_budget = tolerant_budget - last_row['tolerance']
    # The gap after the final row lies outside the chunk.
    inner_gaps = chunk_df['gap'].sum() - last_row['gap']
    margin = 0.1
    speech_and_gaps = speech_total + inner_gaps

    if speech_and_gaps / accept < strict_budget:
        factor = max(min_speed, speech_and_gaps / (strict_budget - margin))
        return round(factor, 3), True
    if speech_total / accept < strict_budget:
        factor = max(min_speed, speech_total / (strict_budget - margin))
        return round(factor, 3), False
    if speech_and_gaps / accept < tolerant_budget:
        factor = max(min_speed, speech_and_gaps / (tolerant_budget - margin))
        return round(factor, 3), True
    return round(speech_total / (tolerant_budget - margin), 3), False
def merge_chunks(tasks_df: pd.DataFrame) -> pd.DataFrame:
"""Speed-adjust each chunk's TTS clips and compute their new subtitle times.

Rows are grouped into chunks; a row whose `cut_off` equals 1 closes the
current chunk. For every chunk a speed factor is obtained from
`process_chunk`, each clip is re-timed with `adjust_audio_speed`, and the
resulting [start, end] pairs are stored per row in a new `new_sub_times`
column.

Returns the same DataFrame with `new_sub_times` filled in.

Raises Exception when a chunk ends more than 0.6 s after its allowed end
time; overruns up to 0.6 s are fixed by trimming the last clip.

NOTE(review): positional slicing with `index` assumes `tasks_df` has the
default 0..n-1 index - confirm against the caller.
"""
rprint("[bold blue]🔄 Starting audio chunks processing...[/bold blue]")
accept = load_key("speed_factor.accept")
min_speed = load_key("speed_factor.min")
chunk_start = 0
tasks_df['new_sub_times'] = None
for index, row in tasks_df.iterrows():
if row['cut_off'] == 1:
chunk_df = tasks_df.iloc[chunk_start:index+1].reset_index(drop=True)
speed_factor, keep_gaps = process_chunk(chunk_df, accept, min_speed)
# 🎯 Step1: Start processing new timeline
chunk_start_time = parse_df_srt_time(chunk_df.iloc[0]['start_time'])
chunk_end_time = parse_df_srt_time(chunk_df.iloc[-1]['end_time']) + chunk_df.iloc[-1]['tolerance'] # the chunk only ends once the tolerance is added
cur_time = chunk_start_time
# NOTE(review): this inner loop reuses the name `row`, shadowing the outer loop variable.
for i, row in chunk_df.iterrows():
# If i is not 0, which is not the first row of the chunk, cur_time needs to be added with the gap of the previous row, remember to divide by speed_factor
if i != 0 and keep_gaps:
cur_time += chunk_df.iloc[i-1]['gap']/speed_factor
new_sub_times = []
number = row['number']
# NOTE(review): eval() runs on text loaded from the task spreadsheet; only safe while that file is trusted.
lines = eval(row['lines']) if isinstance(row['lines'], str) else row['lines']
for line_index, line in enumerate(lines):
# 🔄 Step2: Start speed change and save as OUTPUT_FILE_TEMPLATE
temp_file = TEMP_FILE_TEMPLATE.format(f"{number}_{line_index}")
output_file = OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}")
adjust_audio_speed(temp_file, output_file, speed_factor)
# Use the measured length of the adjusted clip, not the theoretical one.
ad_dur = get_audio_duration(output_file)
new_sub_times.append([cur_time, cur_time+ad_dur])
cur_time += ad_dur
# 🔄 Step3: Find corresponding main DataFrame index and update new_sub_times
main_df_idx = tasks_df[tasks_df['number'] == row['number']].index[0]
tasks_df.at[main_df_idx, 'new_sub_times'] = new_sub_times
# 🎯 Step4: Choose emoji based on speed_factor and accept comparison
emoji = "⚡" if speed_factor <= accept else "⚠️"
rprint(f"[cyan]{emoji} Processed chunk {chunk_start} to {index} with speed factor {speed_factor}[/cyan]")
# 🔄 Step5: Check if the last row exceeds the range
if cur_time > chunk_end_time:
time_diff = cur_time - chunk_end_time
if time_diff <= 0.6: # If exceeding time is within 0.6 seconds, truncate the last audio
rprint(f"[yellow]⚠️ Chunk {chunk_start} to {index} exceeds by {time_diff:.3f}s, truncating last audio[/yellow]")
# Get the last audio file
last_number = tasks_df.iloc[index]['number']
last_lines = eval(tasks_df.iloc[index]['lines']) if isinstance(tasks_df.iloc[index]['lines'], str) else tasks_df.iloc[index]['lines']
last_line_index = len(last_lines) - 1
last_file = OUTPUT_FILE_TEMPLATE.format(f"{last_number}_{last_line_index}")
# Calculate the duration to keep
audio = AudioSegment.from_wav(last_file)
original_duration = len(audio) / 1000 # Convert to seconds
new_duration = original_duration - time_diff
trimmed_audio = audio[:(new_duration * 1000)] # pydub uses milliseconds
trimmed_audio.export(last_file, format="wav")
# Update the last timestamp
last_times = tasks_df.at[index, 'new_sub_times']
last_times[-1][1] = chunk_end_time
tasks_df.at[index, 'new_sub_times'] = last_times
else:
raise Exception(f"Chunk {chunk_start} to {index} exceeds the chunk end time {chunk_end_time:.2f} seconds with current time {cur_time:.2f} seconds")
# The next chunk starts right after the row that closed this one.
chunk_start = index+1
rprint("[bold green]✅ Audio chunks processing completed![/bold green]")
return tasks_df
def gen_audio() -> None:
"""Main function: Generate audio and process timeline.

Loads the task spreadsheet at `_8_1_AUDIO_TASK`, runs `generate_tts_audio`
and `merge_chunks` on it, and writes the updated table back to the same file.
"""
rprint("[bold magenta]🚀 Starting audio generation process...[/bold magenta]")
# 🎯 Step1: Create necessary directories
os.makedirs(_AUDIO_TMP_DIR, exist_ok=True)
os.makedirs(_AUDIO_SEGS_DIR, exist_ok=True)
# 📝 Step2: Load task file
tasks_df = pd.read_excel(_8_1_AUDIO_TASK)
rprint("[green]📊 Loaded task file successfully[/green]")
# 🔊 Step3: Generate TTS audio
tasks_df = generate_tts_audio(tasks_df)
# 🔄 Step4: Merge audio chunks
tasks_df = merge_chunks(tasks_df)
# 💾 Step5: Save results (overwrites the task file that was loaded in Step2)
tasks_df.to_excel(_8_1_AUDIO_TASK, index=False)
rprint("[bold green]🎉 Audio generation completed successfully![/bold green]")
# Run audio generation when this file is executed as a script.
if __name__ == "__main__":
gen_audio()
================================================
FILE: core/_11_merge_audio.py
================================================
import os
import pandas as pd
import subprocess
from pydub import AudioSegment
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.console import Console
from core.utils import *
from core.utils.models import *
# Shared rich console used for status and progress output in this module.
console = Console()
# Destination of the merged dubbing track exported by merge_full_audio().
DUB_VOCAL_FILE = 'output/dub.mp3'
# Destination of the SRT file written by create_srt_subtitle().
DUB_SUB_FILE = 'output/dub.srt'
# Path template for one audio segment; the placeholder is filled with
# "<number>_<line_index>" by get_audio_files().
OUTPUT_FILE_TEMPLATE = f"{_AUDIO_SEGS_DIR}/{{}}.wav"
def load_and_flatten_data(excel_file):
    """Read the task sheet and flatten its per-row list columns.

    Args:
        excel_file: Path of the Excel task sheet.

    Returns:
        A tuple ``(df, lines, new_sub_times)``: the loaded DataFrame, every
        entry of the ``lines`` column concatenated into one list, and every
        entry of the ``new_sub_times`` column concatenated into one list.
    """
    def _as_list(cell):
        # Cells may come back as the string form of a list.
        # NOTE(review): eval() executes whatever expression the spreadsheet
        # contains; only use this on task sheets produced by this pipeline.
        return eval(cell) if isinstance(cell, str) else cell

    df = pd.read_excel(excel_file)

    lines = []
    for cell in df['lines'].tolist():
        lines.extend(_as_list(cell))

    new_sub_times = []
    for cell in df['new_sub_times'].tolist():
        new_sub_times.extend(_as_list(cell))

    return df, lines, new_sub_times
def get_audio_files(df):
    """Build the ordered list of per-line segment paths for every task row.

    For each row, one path is produced per entry in its ``lines`` column,
    named ``<number>_<line_index>`` through OUTPUT_FILE_TEMPLATE.
    """
    audios = []
    for _, row in df.iterrows():
        number = row['number']
        raw_lines = row['lines']
        # The column may hold the string form of a list (see NOTE on eval in
        # load_and_flatten_data: the sheet is trusted pipeline output).
        parsed_lines = eval(raw_lines) if isinstance(raw_lines, str) else raw_lines
        audios.extend(
            OUTPUT_FILE_TEMPLATE.format(f"{number}_{line_index}")
            for line_index in range(len(parsed_lines))
        )
    return audios
def process_audio_segment(audio_file):
    """Compress one audio segment to 16 kHz mono 64 kbps MP3 and load it.

    Args:
        audio_file: Path of the source audio file.

    Returns:
        The compressed audio as a pydub ``AudioSegment``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. Previously the
            failure was ignored and surfaced later as a decode error, or a
            stale temp file from an earlier run was loaded instead.
    """
    temp_file = f"{audio_file}_temp.mp3"
    ffmpeg_cmd = [
        'ffmpeg', '-y',
        '-i', audio_file,
        '-ar', '16000',
        '-ac', '1',
        '-b:a', '64k',
        temp_file
    ]
    try:
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Keep only the tail of stderr; ffmpeg prints a long banner first.
            detail = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed (exit code {result.returncode}) while compressing {audio_file}: {detail}"
            )
        return AudioSegment.from_mp3(temp_file)
    finally:
        # Always clean up, including when ffmpeg or decoding fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)
def merge_audio_segments(audios, new_sub_times, sample_rate):
    """Concatenate the segment files, padding with silence between cues.

    Args:
        audios: Segment file paths, paired positionally with ``new_sub_times``.
        new_sub_times: ``(start, end)`` pairs in seconds, one per segment.
        sample_rate: Frame rate used for the generated silence.

    Returns:
        The merged pydub ``AudioSegment``. Missing files are skipped with a
        warning.
    """
    merged_audio = AudioSegment.silent(duration=0, frame_rate=sample_rate)
    columns = (SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TaskProgressColumn())
    with Progress(*columns) as progress:
        merge_task = progress.add_task("🎵 Merging audio segments...", total=len(audios))
        for position, (audio_file, time_range) in enumerate(zip(audios, new_sub_times)):
            if not os.path.exists(audio_file):
                console.print(f"[bold yellow]⚠️ Warning: File {audio_file} does not exist, skipping...[/bold yellow]")
                progress.advance(merge_task)
                continue

            segment = process_audio_segment(audio_file)
            start_time, _end_time = time_range

            # The gap is measured from the previous cue's scheduled end (or
            # from 0 for the first cue), not from the audio actually appended.
            reference_end = new_sub_times[position - 1][1] if position > 0 else 0
            gap_seconds = start_time - reference_end
            if gap_seconds > 0:
                merged_audio += AudioSegment.silent(duration=int(gap_seconds * 1000), frame_rate=sample_rate)

            merged_audio += segment
            progress.advance(merge_task)
    return merged_audio
def create_srt_subtitle():
    """Write the dubbing subtitles to DUB_SUB_FILE in SRT format.

    Cue times and text come from the flattened task sheet; cues are numbered
    from 1 in sheet order.
    """
    def _srt_time(seconds):
        # Round to whole milliseconds first, then split with integer
        # arithmetic. Truncating the float product (the previous approach)
        # loses a millisecond for values such as 1.001, whose product with
        # 1000 is 1000.9999999999999.
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    df, lines, new_sub_times = load_and_flatten_data(_8_1_AUDIO_TASK)
    with open(DUB_SUB_FILE, 'w', encoding='utf-8') as f:
        for i, ((start_time, end_time), line) in enumerate(zip(new_sub_times, lines), 1):
            f.write(f"{i}\n")
            f.write(f"{_srt_time(start_time)} --> {_srt_time(end_time)}\n")
            f.write(f"{line}\n\n")
    rprint(f"[bold green]✅ Subtitle file created: {DUB_SUB_FILE}[/bold green]")
def merge_full_audio():
    """Drive the full merge: load tasks, write the SRT, merge and export audio.

    Returns early (after the subtitle file has been written) when the first
    expected segment file is missing.
    """
    console.print("\n[bold cyan]🎬 Starting audio merging process...[/bold cyan]")

    with console.status("[bold cyan]📊 Loading data from Excel...[/bold cyan]"):
        df, lines, new_sub_times = load_and_flatten_data(_8_1_AUDIO_TASK)
    console.print("[bold green]✅ Data loaded successfully[/bold green]")

    with console.status("[bold cyan]🔍 Getting audio file list...[/bold cyan]"):
        audios = get_audio_files(df)
    console.print(f"[bold green]✅ Found {len(audios)} audio segments[/bold green]")

    with console.status("[bold cyan]📝 Generating subtitle file...[/bold cyan]"):
        create_srt_subtitle()

    first_audio = audios[0]
    if not os.path.exists(first_audio):
        console.print(f"[bold red]❌ Error: First audio file {first_audio} does not exist![/bold red]")
        return

    # Every segment is resampled to this rate by process_audio_segment().
    sample_rate = 16000
    console.print(f"[bold green]✅ Sample rate: {sample_rate}Hz[/bold green]")

    console.print("[bold cyan]🔄 Starting audio merge process...[/bold cyan]")
    merged_audio = merge_audio_segments(audios, new_sub_times, sample_rate)

    with console.status("[bold cyan]💾 Exporting final audio file...[/bold cyan]"):
        mono_audio = merged_audio.set_frame_rate(16000).set_channels(1)
        mono_audio.export(DUB_VOCAL_FILE, format="mp3", parameters=["-b:a", "64k"])
    console.print(f"[bold green]✅ Audio file successfully merged![/bold green]")
    console.print(f"[bold green]📁 Output file: {DUB_VOCAL_FILE}[/bold green]")


if __name__ == "__main__":
    merge_full_audio()
================================================
FILE: core/_12_dub_to_vid.py
================================================
import platform
import subprocess
import cv2
import numpy as np
from rich.console import Console
from core._1_ytdlp import find_video_files
from core.asr_backend.audio_preprocess import normalize_audio_volume
from core.utils import *
from core.utils.models import *
# Rich console instance for this module.
console = Console()
# Output path of the dubbed video produced by merge_video_audio().
DUB_VIDEO = "output/output_dub.mp4"
# Subtitle file burned into the dubbed video.
DUB_SUB_FILE = 'output/dub.srt'
# Dubbing track that is normalized and mixed with the background audio.
DUB_AUDIO = 'output/dub.mp3'
# Subtitle styling values interpolated into ffmpeg's `force_style` option.
TRANS_FONT_SIZE = 17
TRANS_FONT_NAME = 'Arial'
# Platform-specific font overrides (presumably to get CJK glyph coverage — confirm).
if platform.system() == 'Linux':
    TRANS_FONT_NAME = 'NotoSansCJK-Regular'
if platform.system() == 'Darwin':
    TRANS_FONT_NAME = 'Arial Unicode MS'
# Colour values are passed verbatim as PrimaryColour / OutlineColour / BackColour.
TRANS_FONT_COLOR = '&H00FFFF'
TRANS_OUTLINE_COLOR = '&H000000'
TRANS_OUTLINE_WIDTH = 1
TRANS_BACK_COLOR = '&H33000000'
def merge_video_audio():
    """Produce DUB_VIDEO from the source video, background audio and dub track.

    When the `burn_subtitles` setting is falsy, a single black 1920x1080 frame
    is written to DUB_VIDEO as a placeholder and the function returns.
    Otherwise the dub audio is normalized, mixed with the background audio,
    and ffmpeg re-encodes the video with DUB_SUB_FILE burned in.
    """
    VIDEO_FILE = find_video_files()
    background_file = _BACKGROUND_AUDIO_FILE
    if not load_key("burn_subtitles"):
        rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as subtitles are not burned in.[/bold yellow]")
        # Create a black frame
        frame = np.zeros((1080, 1920, 3), dtype=np.uint8)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(DUB_VIDEO, fourcc, 1, (1920, 1080))
        out.write(frame)
        out.release()
        rprint("[bold green]Placeholder video has been generated.[/bold green]")
        return
    # Normalize dub audio
    normalized_dub_audio = 'output/normalized_dub.wav'
    normalize_audio_volume(DUB_AUDIO, normalized_dub_audio)
    # Merge video and audio with translated subtitles
    # The source resolution is read so the output keeps the same frame size.
    video = cv2.VideoCapture(VIDEO_FILE)
    TARGET_WIDTH = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    TARGET_HEIGHT = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    rprint(f"[bold green]Video resolution: {TARGET_WIDTH}x{TARGET_HEIGHT}[/bold green]")
    # ffmpeg `subtitles` filter with the module-level style constants applied.
    subtitle_filter = (
        f"subtitles={DUB_SUB_FILE}:force_style='FontSize={TRANS_FONT_SIZE},"
        f"FontName={TRANS_FONT_NAME},PrimaryColour={TRANS_FONT_COLOR},"
        f"OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH},"
        f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=27,BorderStyle=4'"
    )
    # Inputs: 0 = source video, 1 = background audio, 2 = normalized dub.
    # Only the video stream of input 0 is used; its audio is not mapped.
    cmd = [
        'ffmpeg', '-y', '-i', VIDEO_FILE, '-i', background_file, '-i', normalized_dub_audio,
        '-filter_complex',
        f'[0:v]scale={TARGET_WIDTH}:{TARGET_HEIGHT}:force_original_aspect_ratio=decrease,'
        f'pad={TARGET_WIDTH}:{TARGET_HEIGHT}:(ow-iw)/2:(oh-ih)/2,'
        f'{subtitle_filter}[v];'
        f'[1:a][2:a]amix=inputs=2:duration=first:dropout_transition=3[a]'
    ]
    if load_key("ffmpeg_gpu"):
        rprint("[bold green]Using GPU acceleration...[/bold green]")
        cmd.extend(['-map', '[v]', '-map', '[a]', '-c:v', 'h264_nvenc'])
    else:
        cmd.extend(['-map', '[v]', '-map', '[a]'])
    cmd.extend(['-c:a', 'aac', '-b:a', '96k', DUB_VIDEO])
    # NOTE(review): the ffmpeg exit status is not checked, so the success
    # message below is printed even if the merge failed.
    subprocess.run(cmd)
    rprint(f"[bold green]Video and audio successfully merged into {DUB_VIDEO}[/bold green]")
if __name__ == '__main__':
    merge_video_audio()
================================================
FILE: core/_1_ytdlp.py
================================================
import os,sys
import glob
import re
import subprocess
from core.utils import *
def sanitize_filename(filename):
    """Return ``filename`` with illegal characters removed.

    The characters ``< > : " / \\ | ? *`` are deleted, leading and trailing
    dots and spaces are trimmed, and ``'video'`` is returned when nothing is
    left.
    """
    forbidden = str.maketrans('', '', '<>:"/\\|?*')
    cleaned = filename.translate(forbidden).strip('. ')
    if not cleaned:
        return 'video'
    return cleaned
def update_ytdlp():
    """Upgrade yt-dlp through pip and return the ``YoutubeDL`` class.

    A failed upgrade only produces a warning; the currently installed yt-dlp
    is imported either way.

    Returns:
        The ``yt_dlp.YoutubeDL`` class.
    """
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"])
        # Drop the cached module entry so the import below is executed afresh.
        if 'yt_dlp' in sys.modules:
            del sys.modules['yt_dlp']
        rprint("[green]yt-dlp updated[/green]")
    except subprocess.CalledProcessError as e:
        # The f-prefix was missing, so "{e}" used to be printed literally.
        rprint(f"[yellow]Warning: Failed to update yt-dlp: {e}[/yellow]")
    from yt_dlp import YoutubeDL
    return YoutubeDL
def download_video_ytdlp(url, save_path='output', resolution='1080'):
    """Download a video with yt-dlp and sanitize the resulting file names.

    Args:
        url: Video URL handed to yt-dlp.
        save_path: Directory the files are written to (created if missing).
        resolution: ``'best'`` for no height limit, otherwise the maximum
            video height interpolated into the format selector.
    """
    os.makedirs(save_path, exist_ok=True)
    if resolution == 'best':
        format_selector = 'bestvideo+bestaudio/best'
    else:
        format_selector = f'bestvideo[height<={resolution}]+bestaudio/best[height<={resolution}]'
    ydl_opts = {
        'format': format_selector,
        'outtmpl': f'{save_path}/%(title)s.%(ext)s',
        'noplaylist': True,
        'writethumbnail': True,
        'postprocessors': [{'key': 'FFmpegThumbnailsConvertor', 'format': 'jpg'}],
    }
    # Read Youtube Cookie File
    cookies_path = load_key("youtube.cookies_path")
    # An unset/empty config value must not reach os.path.exists(): passing
    # None raises TypeError.
    if cookies_path and os.path.exists(cookies_path):
        ydl_opts["cookiefile"] = str(cookies_path)
    # Get YoutubeDL class after updating
    YoutubeDL = update_ytdlp()
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    # Check and rename files after download: every regular file in save_path
    # whose stem changes under sanitize_filename() is renamed.
    for file in os.listdir(save_path):
        current_path = os.path.join(save_path, file)
        if os.path.isfile(current_path):
            filename, ext = os.path.splitext(file)
            new_filename = sanitize_filename(filename)
            if new_filename != filename:
                os.rename(current_path, os.path.join(save_path, new_filename + ext))
def find_video_files(save_path='output'):
    """Return the path of the single source video inside ``save_path``.

    A file qualifies when its extension (case-insensitive, without the dot)
    is listed in the ``allowed_video_formats`` setting. Paths starting with
    ``output/output`` are excluded.

    Raises:
        ValueError: If the number of matching files is not exactly one.
    """
    # Read the setting once, not once per globbed file.
    allowed_formats = load_key("allowed_video_formats")
    video_files = [
        file for file in glob.glob(save_path + "/*")
        if os.path.splitext(file)[1][1:].lower() in allowed_formats
    ]
    # change \\ to /, this happen on windows
    if sys.platform.startswith('win'):
        video_files = [file.replace("\\", "/") for file in video_files]
    # NOTE(review): the prefix is hard-coded, so this exclusion only takes
    # effect for the default save_path.
    video_files = [file for file in video_files if not file.startswith("output/output")]
    if len(video_files) != 1:
        raise ValueError(f"Number of videos found {len(video_files)} is not unique. Please check.")
    return video_files[0]


if __name__ == '__main__':
    # Example usage
    url = input('Please enter the URL of the video you want to download: ')
    resolution = input('Please enter the desired resolution (360/480/720/1080, default 1080): ')
    resolution = int(resolution) if resolution.isdigit() else 1080
    download_video_ytdlp(url, resolution=resolution)
    print(f"🎥 Video has been downloaded to {find_video_files()}")
================================================
FILE: core/_2_asr.py
================================================
from core.utils import *
from core.asr_backend.demucs_vl import demucs_audio
from core.asr_backend.audio_preprocess import process_transcription, convert_video_to_audio, split_audio, save_results, normalize_audio_volume
from core._1_ytdlp import find_video_files
from core.utils.models import *
@check_file_exists(_2_CLEANED_CHUNKS)
def transcribe():
    """Transcribe the source video and save the processed word table.

    Steps: extract audio from the video, optionally separate vocals with
    Demucs, split the raw audio into segments, transcribe each segment with
    the backend selected by ``whisper.runtime``, then combine and save.

    Raises:
        ValueError: If ``whisper.runtime`` is not one of ``local``, ``cloud``
            or ``elevenlabs``.
    """
    # 1. video to audio
    video_file = find_video_files()
    convert_video_to_audio(video_file)
    # 2. Demucs vocal separation:
    if load_key("demucs"):
        demucs_audio()
        vocal_audio = normalize_audio_volume(_VOCAL_AUDIO_FILE, _VOCAL_AUDIO_FILE, format="mp3")
    else:
        vocal_audio = _RAW_AUDIO_FILE
    # 3. Extract audio
    segments = split_audio(_RAW_AUDIO_FILE)
    # 4. Transcribe audio by clips
    all_results = []
    runtime = load_key("whisper.runtime")
    if runtime == "local":
        from core.asr_backend.whisperX_local import transcribe_audio as ts
        rprint("[cyan]🎤 Transcribing audio with local model...[/cyan]")
    elif runtime == "cloud":
        from core.asr_backend.whisperX_302 import transcribe_audio_302 as ts
        rprint("[cyan]🎤 Transcribing audio with 302 API...[/cyan]")
    elif runtime == "elevenlabs":
        from core.asr_backend.elevenlabs_asr import transcribe_audio_elevenlabs as ts
        rprint("[cyan]🎤 Transcribing audio with ElevenLabs API...[/cyan]")
    else:
        # Without this branch `ts` stays unbound and the loop below fails
        # with an unhelpful NameError.
        raise ValueError(
            f"Unsupported whisper.runtime {runtime!r}; expected 'local', 'cloud' or 'elevenlabs'"
        )
    for start, end in segments:
        result = ts(_RAW_AUDIO_FILE, vocal_audio, start, end)
        all_results.append(result)
    # 5. Combine results
    combined_result = {'segments': []}
    for result in all_results:
        combined_result['segments'].extend(result['segments'])
    # 6. Process df
    df = process_transcription(combined_result)
    save_results(df)


if __name__ == "__main__":
    transcribe()
================================================
FILE: core/_3_1_split_nlp.py
================================================
from core.spacy_utils import *
from core.utils.models import _3_1_SPLIT_BY_NLP
from core.utils import check_file_exists
@check_file_exists(_3_1_SPLIT_BY_NLP)
def split_by_spacy():
    """Run the four spaCy-based splitting passes, in order, on one model."""
    nlp = init_nlp()
    # Order matters: each pass is invoked in sequence with the same model.
    passes = (
        split_by_mark,
        split_by_comma_main,
        split_sentences_main,
        split_long_by_root_main,
    )
    for run_pass in passes:
        run_pass(nlp)
    return


if __name__ == '__main__':
    split_by_spacy()
================================================
FILE: core/_3_2_split_meaning.py
================================================
import concurrent.futures
from difflib import SequenceMatcher
import math
from core.prompts import get_split_prompt
from core.spacy_utils.load_nlp_model import init_nlp
from core.utils import *
from rich.console import Console
from rich.table import Table
from core.utils.models import _3_1_SPLIT_BY_NLP, _3_2_SPLIT_BY_MEANING
# Rich console used for warnings and result tables in this module.
console = Console()
def tokenize_sentence(sentence, nlp):
    """Return the ``text`` of every token produced by ``nlp(sentence)``."""
    token_texts = []
    for token in nlp(sentence):
        token_texts.append(token.text)
    return token_texts
def find_split_positions(original, modified):
    """Map the ``[br]`` markers in ``modified`` back to offsets in ``original``.

    For every part of ``modified`` except the last, the prefix of
    ``original`` (starting at the previous split) that is most similar to
    that part is searched for; the end of the best prefix becomes the split
    position.

    Args:
        original: The unmodified sentence.
        modified: The same sentence with ``[br]`` inserted at split points.

    Returns:
        A list of character offsets into ``original``, one per split that
        could be located.
    """
    split_positions = []
    parts = modified.split('[br]')
    start = 0
    whisper_language = load_key("whisper.language")
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language
    joiner = get_joiner(language)
    for i in range(len(parts) - 1):
        max_similarity = 0
        best_split = None
        # The target text does not depend on j, so build it once per part.
        modified_left = joiner.join(parts[i].split())
        # SequenceMatcher caches its analysis of the second sequence; keeping
        # it fixed and swapping only the first avoids re-indexing every step.
        matcher = SequenceMatcher(None)
        matcher.set_seq2(modified_left)
        for j in range(start, len(original)):
            matcher.set_seq1(original[start:j])
            left_similarity = matcher.ratio()
            if left_similarity > max_similarity:
                max_similarity = left_similarity
                best_split = j
        if max_similarity < 0.9:
            console.print(f"[yellow]Warning: low similarity found at the best split point: {max_similarity}[/yellow]")
        if best_split is not None:
            split_positions.append(best_split)
            start = best_split
        else:
            console.print(f"[yellow]Warning: Unable to find a suitable split point for the {i+1}th part.[/yellow]")
    return split_positions
def split_sentence(sentence, num_parts, word_limit=20, index=-1, retry_attempt=0):
    """Split a long sentence using GPT and return the result as a string.

    Args:
        sentence: The sentence to split.
        num_parts: Number of parts passed to the split prompt.
        word_limit: Word limit passed to the split prompt.
        index: Sentence index used only for the success log line; ``-1``
            suppresses that line.
        retry_attempt: Number of spaces appended to the prompt.

    Returns:
        ``sentence`` with a newline inserted at each located split position.
        If no split position was located, the raw model answer (still
        containing ``[br]``) is returned unchanged.
    """
    split_prompt = get_split_prompt(sentence, num_parts, word_limit)
    def valid_split(response_data):
        # The response must carry the chosen variant under `split<choice>`
        # and that variant must contain at least one [br] marker.
        choice = response_data["choice"]
        if f'split{choice}' not in response_data:
            return {"status": "error", "message": "Missing required key: `split`"}
        if "[br]" not in response_data[f"split{choice}"]:
            return {"status": "error", "message": "Split failed, no [br] found"}
        return {"status": "success", "message": "Split completed"}
    # Trailing spaces make the prompt differ per retry attempt
    # (presumably to avoid a cached response — confirm against ask_gpt).
    response_data = ask_gpt(split_prompt + " " * retry_attempt, resp_type='json', valid_def=valid_split, log_title='split_by_meaning')
    choice = response_data["choice"]
    best_split = response_data[f"split{choice}"]
    split_points = find_split_positions(sentence, best_split)
    # split the sentence based on the split points
    for i, split_point in enumerate(split_points):
        if i == 0:
            best_split = sentence[:split_point] + '\n' + sentence[split_point:]
        else:
            # Later offsets are absolute in `sentence`; convert them to an
            # offset inside the last (still unsplit) part.
            parts = best_split.split('\n')
            last_part = parts[-1]
            parts[-1] = last_part[:split_point - split_points[i-1]] + '\n' + last_part[split_point - split_points[i-1]:]
            best_split = '\n'.join(parts)
    if index != -1:
        console.print(f'[green]✅ Sentence {index} has been successfully split[/green]')
    table = Table(title="")
    table.add_column("Type", style="cyan")
    table.add_column("Sentence")
    table.add_row("Original", sentence, style="yellow")
    table.add_row("Split", best_split.replace('\n', ' ||'), style="yellow")
    console.print(table)
    return best_split
def parallel_split_sentences(sentences, max_length, max_workers, nlp, retry_attempt=0):
    """Split every over-long sentence concurrently and return a flat list.

    Sentences whose token count exceeds ``max_length`` are handed to
    ``split_sentence`` on a thread pool; all others pass through unchanged.
    Output order follows input order.
    """
    per_sentence = [None] * len(sentences)
    pending = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        for position, text in enumerate(sentences):
            token_count = len(tokenize_sentence(text, nlp))
            parts_needed = math.ceil(token_count / max_length)
            if token_count <= max_length:
                per_sentence[position] = [text]
                continue
            job = pool.submit(split_sentence, text, parts_needed, max_length, index=position, retry_attempt=retry_attempt)
            pending.append((job, position, text))

        # Collect in submission order so results land in their own slots.
        for job, position, text in pending:
            outcome = job.result()
            if not outcome:
                # Fall back to the unsplit sentence on an empty result.
                per_sentence[position] = [text]
                continue
            per_sentence[position] = [piece.strip() for piece in outcome.strip().split('\n')]

    flattened = []
    for pieces in per_sentence:
        flattened.extend(pieces)
    return flattened
@check_file_exists(_3_2_SPLIT_BY_MEANING)
def split_sentences_by_meaning():
    """Split the NLP-stage sentences by meaning and write the result file.

    The whole sentence list goes through ``parallel_split_sentences`` three
    times so pieces that are still too long get another chance to be split.
    """
    # Load one stripped sentence per input line.
    with open(_3_1_SPLIT_BY_NLP, 'r', encoding='utf-8') as source:
        sentences = [raw_line.strip() for raw_line in source]

    nlp = init_nlp()

    retry_attempt = 0
    while retry_attempt < 3:
        sentences = parallel_split_sentences(
            sentences,
            max_length=load_key("max_split_length"),
            max_workers=load_key("max_workers"),
            nlp=nlp,
            retry_attempt=retry_attempt,
        )
        retry_attempt += 1

    # Persist the result, one sentence per line.
    with open(_3_2_SPLIT_BY_MEANING, 'w', encoding='utf-8') as target:
        target.write('\n'.join(sentences))
    console.print('[green]✅ All sentences have been successfully split![/green]')


if __name__ == '__main__':
    # print(split_sentence('Which makes no sense to the... average guy who always pushes the character creation slider all the way to the right.', 2, 22))
    split_sentences_by_meaning()
================================================
FILE: core/_4_1_summarize.py
================================================
import json
from core.prompts import get_summary_prompt
import pandas as pd
from core.utils import *
from core.utils.models import _3_2_SPLIT_BY_MEANING, _4_1_TERMINOLOGY
# Spreadsheet of user-supplied terms read by get_summary(); its first three
# columns are used as source term, target term and note.
CUSTOM_TERMS_PATH = 'custom_terms.xlsx'
def combine_chunks():
    """Join all split sentences with spaces and truncate the result.

    Returns:
        The first ``summary_length`` (config value) characters of the
        combined text.
    """
    with open(_3_2_SPLIT_BY_MEANING, 'r', encoding='utf-8') as file:
        raw_lines = file.readlines()
    combined_text = ' '.join(raw_line.strip() for raw_line in raw_lines)
    limit = load_key('summary_length')
    # Only the leading `limit` characters are returned.
    return combined_text[:limit]
def search_things_to_note_in_prompt(sentence):
    """Build the terminology hint block for ``sentence``.

    Terms from the terminology file whose ``src`` occurs in ``sentence``
    (case-insensitive) are listed one per line. Each line is numbered by the
    term's position in the full terminology list, so numbers may skip.

    Returns:
        The hint text, or ``None`` when no term occurs in the sentence.
    """
    with open(_4_1_TERMINOLOGY, 'r', encoding='utf-8') as file:
        terminology = json.load(file)

    lowered_sentence = sentence.lower()
    hint_lines = []
    for position, term in enumerate(terminology['terms']):
        if term['src'].lower() not in lowered_sentence:
            continue
        hint_lines.append(
            f'{position+1}. "{term["src"]}": "{term["tgt"]}",'
            f' meaning: {term["note"]}'
        )

    if not hint_lines:
        return None
    return '\n'.join(hint_lines)
def get_summary():
    """Ask the model for a summary and terminology, then save it as JSON.

    Custom terms from CUSTOM_TERMS_PATH are passed into the prompt and also
    appended to the ``terms`` list of the model's answer before it is
    written to the terminology file.
    """
    src_content = combine_chunks()

    custom_terms = pd.read_excel(CUSTOM_TERMS_PATH)
    term_entries = []
    for _, row in custom_terms.iterrows():
        # Columns are read by position: source, target, note.
        term_entries.append({
            "src": str(row.iloc[0]),
            "tgt": str(row.iloc[1]),
            "note": str(row.iloc[2])
        })
    custom_terms_json = {"terms": term_entries}

    if len(custom_terms) > 0:
        rprint(f"📖 Custom Terms Loaded: {len(custom_terms)} terms")
        rprint("📝 Terms Content:", json.dumps(custom_terms_json, indent=2, ensure_ascii=False))

    summary_prompt = get_summary_prompt(src_content, custom_terms_json)
    rprint("📝 Summarizing and extracting terminology ...")

    def valid_summary(response_data):
        # Accept only answers with a `terms` list whose entries each carry
        # src, tgt and note.
        invalid = {"status": "error", "message": "Invalid response format"}
        if 'terms' not in response_data:
            return invalid
        required_keys = {'src', 'tgt', 'note'}
        for term in response_data['terms']:
            if not all(key in term for key in required_keys):
                return invalid
        return {"status": "success", "message": "Summary completed"}

    summary = ask_gpt(summary_prompt, resp_type='json', valid_def=valid_summary, log_title='summary')
    summary['terms'].extend(custom_terms_json['terms'])

    with open(_4_1_TERMINOLOGY, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=4)
    rprint(f'💾 Summary log saved to → `{_4_1_TERMINOLOGY}`')


if __name__ == '__main__':
    get_summary()
================================================
FILE: core/_4_2_translate.py
================================================
import pandas as pd
import json
import concurrent.futures
from core.translate_lines import translate_lines
from core._4_1_summarize import search_things_to_note_in_prompt
from core._8_1_audio_task import check_len_then_trim
from core._6_gen_sub import align_timestamp
from core.utils import *
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from difflib import SequenceMatcher
from core.utils.models import *
# Rich console used for progress messages and DataFrame dumps in this module.
console = Console()
# Function to split text into chunks
def split_chunks_by_chars(chunk_size, max_i):
    """Group the split sentences into multi-line chunks.

    A new chunk is started when adding the next sentence (plus its newline)
    would push the current chunk past ``chunk_size`` characters, or when the
    current chunk already holds ``max_i`` sentences.

    Args:
        chunk_size: Maximum characters per chunk, newlines included.
        max_i: Maximum sentences per chunk.

    Returns:
        A list of chunks, each being its sentences joined by newlines.
    """
    with open(_3_2_SPLIT_BY_MEANING, "r", encoding="utf-8") as file:
        sentences = file.read().strip().split('\n')
    chunks = []
    chunk = ''
    sentence_count = 0
    for sentence in sentences:
        over_size = len(chunk) + len(sentence + '\n') > chunk_size
        # Flush only a chunk that actually has content. Previously a first
        # sentence longer than chunk_size (or max_i == 0) emitted an empty
        # leading chunk, which then became an empty source line downstream.
        if chunk and (over_size or sentence_count == max_i):
            chunks.append(chunk.strip())
            chunk = ''
            sentence_count = 0
        chunk += sentence + '\n'
        sentence_count += 1
    chunks.append(chunk.strip())
    return chunks
# Context helpers: lines taken from the neighbouring chunks
def get_previous_content(chunks, chunk_index):
    """Return the last three lines of the preceding chunk, or None for the first chunk."""
    if chunk_index == 0:
        return None
    preceding_lines = chunks[chunk_index - 1].split('\n')
    return preceding_lines[-3:]
def get_after_content(chunks, chunk_index):
    """Return the first two lines of the following chunk, or None for the last chunk."""
    is_last_chunk = chunk_index == len(chunks) - 1
    if is_last_chunk:
        return None
    following_lines = chunks[chunk_index + 1].split('\n')
    return following_lines[:2]
# 🔍 Translate one chunk together with its surrounding context
def translate_chunk(chunk, chunks, theme_prompt, i):
    """Translate ``chunks[i]`` and return ``(i, english_result, translation)``.

    Terminology hints for the chunk and a few lines from the neighbouring
    chunks are gathered and passed to ``translate_lines``.
    """
    terminology_hint = search_things_to_note_in_prompt(chunk)
    context_before = get_previous_content(chunks, i)
    context_after = get_after_content(chunks, i)
    translation, english_result = translate_lines(
        chunk,
        context_before,
        context_after,
        terminology_hint,
        theme_prompt,
        i,
    )
    return i, english_result, translation
# Similarity helper used to pair chunks with their translation results
def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
# 🚀 Main function to translate all chunks
@check_file_exists(_4_2_TRANSLATION)
def translate_all():
    """Translate every chunk concurrently, align timestamps and save to Excel.

    Chunks are translated on a thread pool, each chunk is paired with its
    translation by text similarity, timestamps are aligned through
    ``align_timestamp``, over-long translations are trimmed, and the table is
    written to the translation output file.

    Raises:
        ValueError: If no translation result is at least 0.9 similar to a
            chunk's source text.
    """
    console.print("[bold green]Start Translating All...[/bold green]")
    chunks = split_chunks_by_chars(chunk_size=600, max_i=10)
    with open(_4_1_TERMINOLOGY, 'r', encoding='utf-8') as file:
        theme_prompt = json.load(file).get('theme')
    # 🔄 Use concurrent execution for translation
    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), transient=True) as progress:
        task = progress.add_task("[cyan]Translating chunks...", total=len(chunks))
        with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
            futures = []
            for i, chunk in enumerate(chunks):
                future = executor.submit(translate_chunk, chunk, chunks, theme_prompt, i)
                futures.append(future)
            results = []
            # Results arrive in completion order; they are re-sorted below.
            for future in concurrent.futures.as_completed(futures):
                results.append(future.result())
                progress.update(task, advance=1)
    results.sort(key=lambda x: x[0]) # Sort results based on original order
    # 💾 Save results to lists and Excel file
    src_text, trans_text = [], []
    for i, chunk in enumerate(chunks):
        chunk_lines = chunk.split('\n')
        src_text.extend(chunk_lines)
        # Calculate similarity between current chunk and translation results
        # Each result tuple is (index, english_result, translation); the
        # english_result text is compared against the chunk's source text.
        chunk_text = ''.join(chunk_lines).lower()
        matching_results = [(r, similar(''.join(r[1].split('\n')).lower(), chunk_text))
                            for r in results]
        best_match = max(matching_results, key=lambda x: x[1])
        # Check similarity and handle exceptions
        if best_match[1] < 0.9:
            console.print(f"[yellow]Warning: No matching translation found for chunk {i}[/yellow]")
            raise ValueError(f"Translation matching failed (chunk {i})")
        elif best_match[1] < 1.0:
            console.print(f"[yellow]Warning: Similar match found (chunk {i}, similarity: {best_match[1]:.3f})[/yellow]")
        trans_text.extend(best_match[0][2].split('\n'))
    # Trim long translation text
    df_text = pd.read_excel(_2_CLEANED_CHUNKS)
    df_text['text'] = df_text['text'].str.strip('"').str.strip()
    df_translate = pd.DataFrame({'Source': src_text, 'Translation': trans_text})
    subtitle_output_configs = [('trans_subs_for_audio.srt', ['Translation'])]
    # output_dir=None is passed here; the returned table is used for its
    # 'duration' column below.
    df_time = align_timestamp(df_text, df_translate, subtitle_output_configs, output_dir=None, for_display=False)
    console.print(df_time)
    # apply check_len_then_trim to df_time['Translation'], only when duration > MIN_TRIM_DURATION.
    df_time['Translation'] = df_time.apply(lambda x: check_len_then_trim(x['Translation'], x['duration']) if x['duration'] > load_key("min_trim_duration") else x['Translation'], axis=1)
    console.print(df_time)
    df_time.to_excel(_4_2_TRANSLATION, index=False)
    console.print("[bold green]✅ Translation completed and results saved.[/bold green]")
if __name__ == '__main__':
    translate_all()
================================================
FILE: core/_5_split_sub.py
================================================
import pandas as pd
from typing import List, Tuple
import concurrent.futures
from core._3_2_split_meaning import split_sentence
from core.prompts import get_align_prompt
from rich.panel import Panel
from rich.console import Console
from rich.table import Table
from core.utils import *
from core.utils.models import *
# Rich console used for panels and tables in this module.
console = Console()
# ! You can modify your own weights here
# Weights as implemented: Chinese/Japanese 1.75, Korean 1.5, Thai 1,
# full-width symbols 1.75, everything else (e.g. English, half-width symbols) 1.
def calc_len(text: str) -> float:
    """Return the weighted display length of ``text``.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Each character contributes the weight of the code point range
    it falls into, or 1 when it is in none of them.
    """
    text = str(text)  # force convert

    # (first code point, last code point, weight)
    weighted_ranges = (
        (0x4E00, 0x9FFF, 1.75),  # Chinese and Japanese (CJK ideographs)
        (0x3040, 0x30FF, 1.75),  # Chinese and Japanese (kana)
        (0xAC00, 0xD7A3, 1.5),   # Korean syllables
        (0x1100, 0x11FF, 1.5),   # Korean jamo
        (0x0E00, 0x0E7F, 1),     # Thai
        (0xFF01, 0xFF5E, 1.75),  # full-width symbols
    )

    total = 0
    for char in text:
        code = ord(char)
        for first, last, weight in weighted_ranges:
            if first <= code <= last:
                total += weight
                break
        else:
            # other characters (e.g. English and half-width symbols)
            total += 1
    return total
def align_subs(src_sub: str, tr_sub: str, src_part: str) -> Tuple[List[str], List[str], str]:
    """Ask the model to cut the translation so it lines up with the split source.

    Args:
        src_sub: Original source subtitle line.
        tr_sub: Its translation.
        src_part: The source line already split into parts by newlines.

    Returns:
        ``(src_parts, tr_parts, tr_remerged)``: the source parts, the aligned
        translation parts, and the translation parts re-joined with the
        language-specific joiner.
    """
    def valid_align(response_data):
        if 'align' not in response_data:
            return {"status": "error", "message": "Missing required key: `align`"}
        if len(response_data['align']) < 2:
            return {"status": "error", "message": "Align does not contain more than 1 part as expected!"}
        return {"status": "success", "message": "Align completed"}

    align_prompt = get_align_prompt(src_sub, tr_sub, src_part)
    parsed = ask_gpt(align_prompt, resp_type='json', valid_def=valid_align, log_title='align_subs')

    src_parts = src_part.split('\n')
    # The n-th entry of `align` carries its text under `target_part_<n>`.
    tr_parts = []
    for part_number, item in enumerate(parsed['align'], start=1):
        tr_parts.append(item[f'target_part_{part_number}'].strip())

    # Use the detected language when the configured one is 'auto'.
    whisper_language = load_key("whisper.language")
    if whisper_language == 'auto':
        language = load_key("whisper.detected_language")
    else:
        language = whisper_language
    tr_remerged = get_joiner(language).join(tr_parts)

    table = Table(title="🔗 Aligned parts")
    table.add_column("Language", style="cyan")
    table.add_column("Parts", style="magenta")
    table.add_row("SRC_LANG", "\n".join(src_parts))
    table.add_row("TARGET_LANG", "\n".join(tr_parts))
    console.print(table)

    return src_parts, tr_parts, tr_remerged
def split_align_subs(src_lines: List[str], tr_lines: List[str]):
    """Split over-long subtitle pairs in two and align their translations.

    A pair is split when the source exceeds the configured max length or the
    weighted translation length times the target multiplier does.

    Note: ``src_lines`` and ``tr_lines`` are modified in place (split
    entries are replaced by lists) before the flattened copies are built.

    Returns:
        ``(src_lines, tr_lines, remerged_tr_lines)``: the flattened source
        and translation lines, plus a list with one entry per input line in
        which every split translation is replaced by its re-joined form.
    """
    subtitle_set = load_key("subtitle")
    MAX_SUB_LENGTH = subtitle_set["max_length"]
    TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"]
    remerged_tr_lines = tr_lines.copy()
    to_split = []
    for i, (src, tr) in enumerate(zip(src_lines, tr_lines)):
        src, tr = str(src), str(tr)
        if len(src) > MAX_SUB_LENGTH or calc_len(tr) * TARGET_SUB_MULTIPLIER > MAX_SUB_LENGTH:
            to_split.append(i)
            table = Table(title=f"📏 Line {i} needs to be split")
            table.add_column("Type", style="cyan")
            table.add_column("Content", style="magenta")
            table.add_row("Source Line", src)
            table.add_row("Target Line", tr)
            console.print(table)
    @except_handler("Error in split_align_subs")
    def process(i):
        # Each worker writes only to its own index i of the shared lists.
        split_src = split_sentence(src_lines[i], num_parts=2).strip()
        src_parts, tr_parts, tr_remerged = align_subs(src_lines[i], tr_lines[i], split_src)
        src_lines[i] = src_parts
        tr_lines[i] = tr_parts
        remerged_tr_lines[i] = tr_remerged
    with concurrent.futures.ThreadPoolExecutor(max_workers=load_key("max_workers")) as executor:
        # The map result is never iterated; leaving the `with` block waits
        # for all submitted tasks to finish.
        executor.map(process, to_split)
    # Flatten `src_lines` and `tr_lines`
    src_lines = [item for sublist in src_lines for item in (sublist if isinstance(sublist, list) else [sublist])]
    tr_lines = [item for sublist in tr_lines for item in (sublist if isinstance(sublist, list) else [sublist])]
    return src_lines, tr_lines, remerged_tr_lines
def split_for_sub_main():
    """Split translated subtitles until they fit and write both result sheets.

    Up to three passes of ``split_align_subs`` are run. The split lines are
    written to the split-subtitle sheet and the source/re-merged translation
    pairs to the re-merged sheet.
    """
    console.print("[bold green]🚀 Start splitting subtitles...[/bold green]")
    df = pd.read_excel(_4_2_TRANSLATION)
    src = df['Source'].tolist()
    trans = df['Translation'].tolist()
    subtitle_set = load_key("subtitle")
    MAX_SUB_LENGTH = subtitle_set["max_length"]
    TARGET_SUB_MULTIPLIER = subtitle_set["target_multiplier"]
    for attempt in range(3): # Split in several passes
        console.print(Panel(f"🔄 Split attempt {attempt + 1}", expand=False))
        split_src, split_trans, remerged = split_align_subs(src.copy(), trans)
        # Check whether every subtitle now meets the length requirement
        if all(len(src) <= MAX_SUB_LENGTH for src in split_src) and \
            all(calc_len(tr) * TARGET_SUB_MULTIPLIER <= MAX_SUB_LENGTH for tr in split_trans):
            break
        # Feed the split lines back in for the next splitting pass
        src, trans = split_src, split_trans
    # Pad the shorter list with None so both have the same length and the
    # DataFrame constructor does not raise
    if len(src) > len(remerged):
        remerged += [None] * (len(src) - len(remerged))
    elif len(remerged) > len(src):
        src += [None] * (len(remerged) - len(src))
    pd.DataFrame({'Source': split_src, 'Translation': split_trans}).to_excel(_5_SPLIT_SUB, index=False)
    pd.DataFrame({'Source': src, 'Translation': remerged}).to_excel(_5_REMERGED, index=False)
if __name__ == '__main__':
    split_for_sub_main()
================================================
FILE: core/_6_gen_sub.py
================================================
import pandas as pd
import os
import re
from rich.panel import Panel
from rich.console import Console
import autocorrect_py as autocorrect
from core.utils import *
from core.utils.models import *
# Rich console for this module.
console = Console()
# Each entry is (output file name, list of DataFrame columns to include).
# The column order presumably determines line order within a cue — confirm
# against the code that consumes these configs.
SUBTITLE_OUTPUT_CONFIGS = [
    ('src.srt', ['Source']),
    ('trans.srt', ['Translation']),
    ('src_trans.srt', ['Source', 'Translation']),
    ('trans_src.srt', ['Translation', 'Source'])
]
# Same (file name, columns) layout for the subtitle files used by the audio stage.
AUDIO_SUBTITLE_OUTPUT_CONFIGS = [
    ('src_subs_for_audio.srt', ['Source']),
    ('trans_subs_for_audio.srt', ['Translation'])
]
def convert_to_srt_format(start_time, end_time):
    """Convert a start/end pair in seconds to an SRT time range.

    Args:
        start_time: Cue start in seconds.
        end_time: Cue end in seconds.

    Returns:
        A string of the form ``HH:MM:SS,mmm --> HH:MM:SS,mmm``. Times are
        rounded to the nearest millisecond.
    """
    def seconds_to_hmsm(seconds):
        # Round once to whole milliseconds and split with integer arithmetic.
        # Truncating `seconds * 1000` instead drops a millisecond whenever
        # the float product lands just below an integer (1.001 * 1000 is
        # 1000.9999999999999), and rounding first also lets carries
        # propagate correctly (59.9996 s -> 00:01:00,000).
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600000)
        minutes, remainder = divmod(remainder, 60000)
        whole_seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

    start_srt = seconds_to_hmsm(start_time)
    end_srt = seconds_to_hmsm(end_time)
    return f"{start_srt} --> {end_srt}"
def remove_punctuation(text):
    """Collapse whitespace runs to single spaces and drop punctuation.

    Every character that is neither a word character nor whitespace is
    removed; the result is stripped of leading and trailing whitespace.
    """
    whitespace_run = re.compile(r'\s+')
    non_word = re.compile(r'[^\w\s]')
    collapsed = whitespace_run.sub(' ', text)
    return non_word.sub('', collapsed).strip()
def show_difference(str1, str2):
    """Print where two strings differ, character by character.

    Positions past the end of the shorter string all count as differences.
    Output is the two strings, a line of ``^`` markers under the differing
    positions, and the list of differing indices.
    """
    shorter = min(len(str1), len(str2))
    longer = max(len(str1), len(str2))

    # Mismatches inside the common prefix, then the whole overhang (the
    # overhang range is empty when both strings have the same length).
    diff_positions = [pos for pos in range(shorter) if str1[pos] != str2[pos]]
    diff_positions += list(range(shorter, longer))

    marker_cells = [" "] * longer
    for pos in diff_positions:
        marker_cells[pos] = "^"

    print("Difference positions:")
    print(f"Expected sentence: {str1}")
    print(f"Actual match: {str2}")
    print("Position markers: " + "".join(marker_cells))
    print(f"Difference indices: {diff_positions}")
def get_sentence_timestamps(df_words, df_sentences):
    """Locate each sentence inside the concatenated word stream and return its time span.

    Words from ``df_words['text']`` are lower-cased, stripped of punctuation and
    joined into one string; every character position remembers the word it came
    from. Each ``df_sentences['Source']`` entry is normalised the same way (with
    spaces removed) and searched for, in order, starting after the previous match.

    Returns:
        A list of ``(start, end)`` float tuples, one per sentence, taken from the
        'start' of the first matched word and the 'end' of the last.

    Raises:
        ValueError: When a sentence cannot be found at or after the current position.
    """
    # Build the searchable string together with a character-position -> word-index map.
    pieces = []
    position_to_word_idx = {}
    cursor = 0
    for word_idx, raw_word in enumerate(df_words['text']):
        cleaned = remove_punctuation(raw_word.lower())
        for offset in range(len(cleaned)):
            position_to_word_idx[cursor + offset] = word_idx
        cursor += len(cleaned)
        pieces.append(cleaned)
    full_words_str = ''.join(pieces)
    total_len = len(full_words_str)

    time_stamp_list = []
    current_pos = 0
    for idx, sentence in df_sentences['Source'].items():
        clean_sentence = remove_punctuation(sentence.lower()).replace(" ", "")
        sentence_len = len(clean_sentence)

        hit = full_words_str.find(clean_sentence, current_pos)
        if hit == -1:
            # Report against the last window a left-to-right scan would have tried.
            current_pos = max(current_pos, total_len - sentence_len + 1)
            print(f"\n⚠️ Warning: No exact match found for sentence: {sentence}")
            show_difference(clean_sentence,
                            full_words_str[current_pos:current_pos+len(clean_sentence)])
            print("\nOriginal sentence:", df_sentences['Source'][idx])
            raise ValueError("❎ No match found for sentence.")

        first_word = position_to_word_idx[hit]
        last_word = position_to_word_idx[hit + sentence_len - 1]
        time_stamp_list.append((
            float(df_words['start'][first_word]),
            float(df_words['end'][last_word])
        ))
        # The next sentence must start after this one.
        current_pos = hit + sentence_len

    return time_stamp_list
def align_timestamp(df_text, df_translate, subtitle_output_configs: list, output_dir: str, for_display: bool = True):
    """Align timestamps and add a new timestamp column to df_translate.

    Args:
        df_text: Word-level table; passed to get_sentence_timestamps, which reads
            its 'text', 'start' and 'end' columns.
        df_translate: Sentence-level table with 'Source' and 'Translation'
            columns. It is not modified; a copy is returned.
        subtitle_output_configs: ``(filename, columns)`` pairs; one file is
            written per pair, with one text line per listed column.
        output_dir: Directory for the subtitle files. Nothing is written when falsy.
        for_display: When True, ``,`` and ``。`` in 'Translation' are replaced by spaces.

    Returns:
        A copy of ``df_translate`` with a 'timestamp' column (SRT timing string)
        and a 'duration' column (seconds, measured before gaps are closed).
    """
    df_trans_time = df_translate.copy()

    # Process timestamps ⏰
    time_stamp_list = get_sentence_timestamps(df_text, df_translate)
    df_trans_time['timestamp'] = time_stamp_list
    df_trans_time['duration'] = df_trans_time['timestamp'].apply(lambda x: x[1] - x[0])

    # Remove gaps 🕳️ — a pause shorter than one second is absorbed by stretching
    # the earlier cue up to the start of the next one.
    # NOTE(review): label-based .loc[i] assumes a default 0..n-1 index — confirm for callers.
    for i in range(len(df_trans_time)-1):
        delta_time = df_trans_time.loc[i+1, 'timestamp'][0] - df_trans_time.loc[i, 'timestamp'][1]
        if 0 < delta_time < 1:
            df_trans_time.at[i, 'timestamp'] = (df_trans_time.loc[i, 'timestamp'][0], df_trans_time.loc[i+1, 'timestamp'][0])

    # Convert start and end timestamps to SRT format
    df_trans_time['timestamp'] = df_trans_time['timestamp'].apply(lambda x: convert_to_srt_format(x[0], x[1]))

    # Polish subtitles: replace punctuation in Translation if for_display
    if for_display:
        df_trans_time['Translation'] = df_trans_time['Translation'].apply(lambda x: re.sub(r'[,。]', ' ', x).strip())

    # Output subtitles 📜
    def generate_subtitle_string(df, columns):
        # One cue per row: running number, timing line, first column, optional second column.
        return ''.join([f"{i+1}\n{row['timestamp']}\n{row[columns[0]].strip()}\n{row[columns[1]].strip() if len(columns) > 1 else ''}\n\n" for i, row in df.iterrows()]).strip()

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        for filename, columns in subtitle_output_configs:
            subtitle_str = generate_subtitle_string(df_trans_time, columns)
            with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
                f.write(subtitle_str)

    return df_trans_time
# ✨ Beautify the translation
def clean_translation(x):
    """Normalise one translation cell: missing values become '', otherwise the
    text loses leading/trailing '。' and then ',' before being passed through
    ``autocorrect.format``."""
    if not pd.isna(x):
        trimmed = str(x).strip('。')
        trimmed = trimmed.strip(',')
        return autocorrect.format(trimmed)
    return ''
def align_timestamp_main():
    """Generate the display subtitles and the audio-task subtitles.

    Reads the cleaned word chunks once, then aligns first the split subtitles
    (written to ``_OUTPUT_DIR``) and afterwards the remerged subtitles
    (written to ``_AUDIO_DIR``).
    """
    def _read_translations(path):
        # Load a sentence table and tidy its 'Translation' column.
        frame = pd.read_excel(path)
        frame['Translation'] = frame['Translation'].apply(clean_translation)
        return frame

    df_text = pd.read_excel(_2_CLEANED_CHUNKS)
    df_text['text'] = df_text['text'].str.strip('"').str.strip()

    align_timestamp(df_text, _read_translations(_5_SPLIT_SUB), SUBTITLE_OUTPUT_CONFIGS, _OUTPUT_DIR)
    console.print(Panel("[bold green]🎉📝 Subtitles generation completed! Please check in the `output` folder 👀[/bold green]"))

    # for audio: use remerged file to avoid unmatched lines when dubbing
    align_timestamp(df_text, _read_translations(_5_REMERGED), AUDIO_SUBTITLE_OUTPUT_CONFIGS, _AUDIO_DIR)
    console.print(Panel(f"[bold green]🎉📝 Audio subtitles generation completed! Please check in the `{_AUDIO_DIR}` folder 👀[/bold green]"))


if __name__ == '__main__':
    align_timestamp_main()
================================================
FILE: core/_7_sub_into_vid.py
================================================
import os, subprocess, time
from core._1_ytdlp import find_video_files
import cv2
import numpy as np
import platform
from core.utils import *
# Font sizes handed to ffmpeg's `subtitles` filter via force_style (FontSize=...).
SRC_FONT_SIZE = 15
TRANS_FONT_SIZE = 17
# Default font names; overridden per platform below.
FONT_NAME = 'Arial'
TRANS_FONT_NAME = 'Arial'

# Linux need to install google noto fonts: apt-get install fonts-noto
if platform.system() == 'Linux':
    FONT_NAME = 'NotoSansCJK-Regular'
    TRANS_FONT_NAME = 'NotoSansCJK-Regular'
# Mac OS has different font names
elif platform.system() == 'Darwin':
    FONT_NAME = 'Arial Unicode MS'
    TRANS_FONT_NAME = 'Arial Unicode MS'

# Style values for the source-language subtitle track (inserted into force_style as-is).
SRC_FONT_COLOR = '&HFFFFFF'
SRC_OUTLINE_COLOR = '&H000000'
SRC_OUTLINE_WIDTH = 1
SRC_SHADOW_COLOR = '&H80000000'
# Style values for the translated subtitle track.
TRANS_FONT_COLOR = '&H00FFFF'
TRANS_OUTLINE_COLOR = '&H000000'
TRANS_OUTLINE_WIDTH = 1
TRANS_BACK_COLOR = '&H33000000'

# Input subtitle files and the rendered output video, all relative to the working directory.
OUTPUT_DIR = "output"
OUTPUT_VIDEO = f"{OUTPUT_DIR}/output_sub.mp4"
SRC_SRT = f"{OUTPUT_DIR}/src.srt"
TRANS_SRT = f"{OUTPUT_DIR}/trans.srt"
def check_gpu_available():
    """Report whether the local ffmpeg build lists the ``h264_nvenc`` encoder.

    Returns:
        True when 'h264_nvenc' appears in the output of ``ffmpeg -encoders``;
        False otherwise, including when ffmpeg cannot be started or does not
        answer within 10 seconds.
    """
    try:
        result = subprocess.run(
            ['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10
        )
        return 'h264_nvenc' in result.stdout
    except Exception:
        # Best-effort probe. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating to the caller.
        return False
def merge_subtitles_to_video():
    """Burn the source and translated subtitles into the input video with ffmpeg.

    Reads the "burn_subtitles" and "ffmpeg_gpu" settings through ``load_key``.
    When burning is disabled, a single black 1920x1080 frame is written to
    ``OUTPUT_VIDEO`` as a placeholder and the function returns. Otherwise the
    video keeps its own resolution and ``SRC_SRT`` and ``TRANS_SRT`` are rendered
    by two chained ``subtitles`` filters, writing the result to ``OUTPUT_VIDEO``.

    Raises:
        SystemExit: With code 1 when either subtitle file is missing.
    """
    video_file = find_video_files()
    os.makedirs(os.path.dirname(OUTPUT_VIDEO), exist_ok=True)

    # Subtitle burning disabled: emit a placeholder video and stop.
    if not load_key("burn_subtitles"):
        rprint("[bold yellow]Warning: A 0-second black video will be generated as a placeholder as subtitles are not burned in.[/bold yellow]")

        # Create a black frame
        frame = np.zeros((1080, 1920, 3), dtype=np.uint8)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 1, (1920, 1080))
        out.write(frame)
        out.release()

        rprint("[bold green]Placeholder video has been generated.[/bold green]")
        return

    if not os.path.exists(SRC_SRT) or not os.path.exists(TRANS_SRT):
        rprint("Subtitle files not found in the 'output' directory.")
        # `exit` is injected by the `site` module and may be absent; raising
        # SystemExit directly is what `exit(1)` does and always works.
        raise SystemExit(1)

    # Check resolution: the output is scaled/padded to the input's own frame size.
    video = cv2.VideoCapture(video_file)
    TARGET_WIDTH = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    TARGET_HEIGHT = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    rprint(f"[bold green]Video resolution: {TARGET_WIDTH}x{TARGET_HEIGHT}[/bold green]")

    ffmpeg_cmd = [
        'ffmpeg', '-i', video_file,
        '-vf', (
            f"scale={TARGET_WIDTH}:{TARGET_HEIGHT}:force_original_aspect_ratio=decrease,"
            f"pad={TARGET_WIDTH}:{TARGET_HEIGHT}:(ow-iw)/2:(oh-ih)/2,"
            f"subtitles={SRC_SRT}:force_style='FontSize={SRC_FONT_SIZE},FontName={FONT_NAME},"
            f"PrimaryColour={SRC_FONT_COLOR},OutlineColour={SRC_OUTLINE_COLOR},OutlineWidth={SRC_OUTLINE_WIDTH},"
            f"ShadowColour={SRC_SHADOW_COLOR},BorderStyle=1',"
            f"subtitles={TRANS_SRT}:force_style='FontSize={TRANS_FONT_SIZE},FontName={TRANS_FONT_NAME},"
            f"PrimaryColour={TRANS_FONT_COLOR},OutlineColour={TRANS_OUTLINE_COLOR},OutlineWidth={TRANS_OUTLINE_WIDTH},"
            f"BackColour={TRANS_BACK_COLOR},Alignment=2,MarginV=27,BorderStyle=4'"
        ).encode('utf-8'),
    ]

    # The encoder is chosen from configuration only; no capability probe is done here.
    ffmpeg_gpu = load_key("ffmpeg_gpu")
    if ffmpeg_gpu:
        rprint("[bold green]will use GPU acceleration.[/bold green]")
        ffmpeg_cmd.extend(['-c:v', 'h264_nvenc'])
    ffmpeg_cmd.extend(['-y', OUTPUT_VIDEO])

    rprint("🎬 Start merging subtitles to video...")
    start_time = time.time()
    process = subprocess.Popen(ffmpeg_cmd)

    try:
        process.wait()
        if process.returncode == 0:
            rprint(f"\n✅ Done! Time taken: {time.time() - start_time:.2f} seconds")
        else:
            rprint("\n❌ FFmpeg execution error")
    except Exception as e:
        rprint(f"\n❌ Error occurred: {e}")
        # Do not leave a half-finished ffmpeg running in the background.
        if process.poll() is None:
            process.kill()


if __name__ == "__main__":
    merge_subtitles_to_video()
================================================
FILE: core/_8_1_audio_task.py
================================================
import datetime
import re
import pandas as pd
from rich.console import Console
from rich.panel import Panel
from core.prompts import get_subtitle_trim_prompt
from core.tts_backend.estimate_duration import init_estimator, estimate_duration
from core.utils import *
from core.utils.models import *
# Module-level rich console.
console = Console()
# "speed_factor" setting, read once at import time; check_len_then_trim uses its 'max' entry.
speed_factor = load_key("speed_factor")
# SRT inputs consumed by process_srt.
TRANS_SUBS_FOR_AUDIO_FILE = 'output/audio/trans_subs_for_audio.srt'
SRC_SUBS_FOR_AUDIO_FILE = 'output/audio/src_subs_for_audio.srt'
# Lazily created by check_len_then_trim on first use.
ESTIMATOR = None
def check_len_then_trim(text, duration):
    """Return ``text`` unchanged when it can be read within ``duration`` seconds
    (at the maximum speed factor), otherwise a shortened version.

    Shortening is requested from ``ask_gpt``; if that raises, punctuation is
    replaced by spaces as a fallback.
    """
    global ESTIMATOR
    if ESTIMATOR is None:
        ESTIMATOR = init_estimator()

    estimated_duration = estimate_duration(text, ESTIMATOR) / speed_factor['max']
    console.print(f"Subtitle text: {text}, "
                  f"[bold green]Estimated reading duration: {estimated_duration:.2f} seconds[/bold green]")

    # Fits already: nothing to do.
    if not estimated_duration > duration:
        return text

    rprint(Panel(f"Estimated reading duration {estimated_duration:.2f} seconds exceeds given duration {duration:.2f} seconds, shortening...", title="Processing", border_style="yellow"))
    original_text = text
    prompt = get_subtitle_trim_prompt(text, duration)

    def valid_trim(response):
        # The reply is only usable when it carries a 'result' entry.
        if 'result' in response:
            return {'status': 'success', 'message': ''}
        return {'status': 'error', 'message': 'No result in response'}

    try:
        reply = ask_gpt(prompt, resp_type='json', log_title='sub_trim', valid_def=valid_trim)
        shortened_text = reply['result']
    except Exception:
        rprint("[bold red]🚫 AI refused to answer due to sensitivity, so manually remove punctuation[/bold red]")
        shortened_text = re.sub(r'[,.!?;:,。!?;:]', ' ', text).strip()

    rprint(Panel(f"Subtitle before shortening: {original_text}\nSubtitle after shortening: {shortened_text}", title="Subtitle Shortening Result", border_style="green"))
    return shortened_text
def time_diff_seconds(t1, t2, base_date):
    """Return ``t2 - t1`` in seconds, anchoring both time objects on ``base_date``."""
    anchor = datetime.datetime.combine
    elapsed = anchor(base_date, t2) - anchor(base_date, t1)
    return elapsed.total_seconds()
def process_srt():
    """Process srt file, generate audio tasks.

    Parses ``TRANS_SUBS_FOR_AUDIO_FILE`` (text to be spoken) and
    ``SRC_SUBS_FOR_AUDIO_FILE`` (original text, joined by cue number), strips
    parenthesised asides and '-' from the spoken text, then merges or extends
    cues shorter than the "min_subtitle_duration" setting.

    Returns:
        A DataFrame with columns number, start_time, end_time (both formatted
        as ``HH:MM:SS.mmm`` strings), duration (seconds), text and origin.
    """
    with open(TRANS_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as file:
        content = file.read()

    with open(SRC_SUBS_FOR_AUDIO_FILE, 'r', encoding='utf-8') as src_file:
        src_content = src_file.read()

    subtitles = []
    src_subtitles = {}

    # Index the source-language cues by their number.
    for block in src_content.strip().split('\n\n'):
        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 3:
            continue

        number = int(lines[0])
        src_text = ' '.join(lines[2:])
        src_subtitles[number] = src_text

    today = datetime.date.today()
    for block in content.strip().split('\n\n'):
        lines = [line.strip() for line in block.split('\n') if line.strip()]
        if len(lines) < 3:
            continue

        try:
            number = int(lines[0])
            start_time, end_time = lines[1].split(' --> ')
            start_time = datetime.datetime.strptime(start_time, '%H:%M:%S,%f').time()
            end_time = datetime.datetime.strptime(end_time, '%H:%M:%S,%f').time()
            duration = time_diff_seconds(start_time, end_time, today)
            text = ' '.join(lines[2:])
            # Remove content within parentheses (including English and Chinese parentheses)
            text = re.sub(r'\([^)]*\)', '', text).strip()
            # Full-width parentheses are spelled as \uff08 / \uff09: written as plain
            # ASCII "(" ")" the pattern is a capture group that matches, and deletes,
            # everything except ")" characters.
            text = re.sub(r'\uff08[^\uff09]*\uff09', '', text).strip()
            # Remove '-' character, can continue to add illegal characters that cause errors
            text = text.replace('-', '')

            # Add the original text from src_subs_for_audio.srt
            origin = src_subtitles.get(number, '')

        except ValueError as e:
            rprint(Panel(f"Unable to parse subtitle block '{block}', error: {str(e)}, skipping this subtitle block.", title="Error", border_style="red"))
            continue

        subtitles.append({'number': number, 'start_time': start_time, 'end_time': end_time, 'duration': duration, 'text': text, 'origin': origin})

    # Explicit columns keep the frame well-formed even when no cue could be parsed.
    df = pd.DataFrame(subtitles, columns=['number', 'start_time', 'end_time', 'duration', 'text', 'origin'])

    i = 0
    MIN_SUB_DUR = load_key("min_subtitle_duration")
    while i < len(df):
        if df.loc[i, 'duration'] < MIN_SUB_DUR:
            if i < len(df) - 1 and time_diff_seconds(df.loc[i, 'start_time'],df.loc[i+1, 'start_time'],today) < MIN_SUB_DUR:
                # The next cue starts too soon to extend this one: fold it in and
                # re-examine the merged row (i is deliberately not advanced).
                rprint(f"[bold yellow]Merging subtitles {i+1} and {i+2}[/bold yellow]")
                df.loc[i, 'text'] += ' ' + df.loc[i+1, 'text']
                df.loc[i, 'origin'] += ' ' + df.loc[i+1, 'origin']
                df.loc[i, 'end_time'] = df.loc[i+1, 'end_time']
                df.loc[i, 'duration'] = time_diff_seconds(df.loc[i, 'start_time'],df.loc[i, 'end_time'],today)
                df = df.drop(i+1).reset_index(drop=True)
            else:
                if i < len(df) - 1:  # Not the last audio
                    rprint(f"[bold blue]Extending subtitle {i+1} duration to {MIN_SUB_DUR} seconds[/bold blue]")
                    df.loc[i, 'end_time'] = (datetime.datetime.combine(today, df.loc[i, 'start_time']) +
                                            datetime.timedelta(seconds=MIN_SUB_DUR)).time()
                    df.loc[i, 'duration'] = MIN_SUB_DUR
                else:
                    rprint(f"[bold red]The last subtitle {i+1} duration is less than {MIN_SUB_DUR} seconds, but not extending[/bold red]")
                i += 1
        else:
            i += 1

    # Store times as HH:MM:SS.mmm strings (microseconds trimmed to milliseconds).
    df['start_time'] = df['start_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3])
    df['end_time'] = df['end_time'].apply(lambda x: x.strftime('%H:%M:%S.%f')[:-3])

    ##! No longer perform secondary trim
    # check and trim subtitle length, for twice to ensure the subtitle length is within the limit, allowing tolerance
    # df['text'] = df.apply(lambda x: check_len_then_trim(x['text'], x['duration']+x['tolerance']), axis=1)

    return df
# NOTE(review): check_file_exists is defined elsewhere; presumably it skips the
# call when _8_1_AUDIO_TASK already exists — confirm against core.utils.
@check_file_exists(_8_1_AUDIO_TASK)
def gen_audio_task_main():
    """Build the audio task table with process_srt, print it, and save it to
    the ``_8_1_AUDIO_TASK`` Excel file."""
    df = process_srt()
    console.print(df)
    df.to_excel(_8_1_AUDIO_TASK, index=False)
    rprint(Panel(f"Successfully generated {_8_1_AUDIO_TASK}", title="Success", border_style="green"))


if __name__ == '__main__':
    gen_audio_task_main()
================================================
FILE: core/_8_2_dub_chunks.py
================================================
import datetime
import re
import pandas as pd
from core._8_1_audio_task import time_diff_seconds
from core.asr_backend.audio_preprocess import get_audio_duration
from core.tts_backend.estimate_duration import init_estimator, estimate_duration
from core.utils import *
from core.utils.models import *
# Display subtitle files read by gen_dub_chunks to recover the per-line text.
SRC_SRT = "output/src.srt"
TRANS_SRT = "output/trans.srt"
# Loop bound used by merge_rows when accumulating consecutive rows.
MAX_MERGE_COUNT = 5
# Lazily created by analyze_subtitle_timing_and_speed on first use.
ESTIMATOR = None
def calc_if_too_fast(est_dur, tol_dur, duration, tolerance):
    """Classify speaking speed for one (possibly merged) chunk.

    Returns:
        2  - too fast even at the maximum acceptable speed factor,
        1  - too fast, but fixable by speeding up within the accepted range,
        -1 - slower than ``duration - tolerance``,
        0  - normal.
    """
    accept = load_key("speed_factor.accept")  # Maximum acceptable speed factor

    if est_dur / accept > tol_dur:
        return 2
    if est_dur > tol_dur:
        return 1
    if est_dur < duration - tolerance:
        return -1
    return 0
def merge_rows(df, start_idx, merge_count):
    """Merge multiple rows and calculate cumulative values.

    Starting from ``start_idx``, following rows are accumulated (est_dur,
    tol_dur, duration) until the combined chunk is no longer too fast, and the
    row where the chunk ends gets ``cut_off = 1`` (``df`` is modified in place).

    Args:
        df: Task table with est_dur, tol_dur, duration, tolerance and cut_off columns.
        start_idx: Position of the first row of the chunk.
        merge_count: Number of rows already in the chunk (callers in this file pass 1).

    Returns:
        The number of rows consumed by the chunk, i.e. how far the caller should advance.
    """
    merged = {
        'est_dur': df.iloc[start_idx]['est_dur'],
        'tol_dur': df.iloc[start_idx]['tol_dur'],
        'duration': df.iloc[start_idx]['duration']
    }

    while merge_count < MAX_MERGE_COUNT and (start_idx + merge_count) < len(df):
        next_row = df.iloc[start_idx + merge_count]
        merged['est_dur'] += next_row['est_dur']
        merged['tol_dur'] += next_row['tol_dur']
        merged['duration'] += next_row['duration']

        # The tolerance of the row just added is used for the combined chunk.
        speed_flag = calc_if_too_fast(
            merged['est_dur'],
            merged['tol_dur'],
            merged['duration'],
            df.iloc[start_idx + merge_count]['tolerance']
        )

        # Stop once the chunk is normal/slow, or unconditionally when merge_count
        # reaches 2 (three rows in the chunk).
        # NOTE(review): this hard stop means MAX_MERGE_COUNT is never reached when
        # called with merge_count=1 — confirm which limit is intended.
        if speed_flag <= 0 or merge_count == 2:
            df.at[start_idx + merge_count, 'cut_off'] = 1
            return merge_count + 1

        merge_count += 1

    # If no suitable merge point is found
    # (this condition is the negation of the loop condition, so it holds whenever the loop ends without returning)
    if merge_count >= MAX_MERGE_COUNT or (start_idx + merge_count) >= len(df):
        df.at[start_idx + merge_count - 1, 'cut_off'] = 1
    return merge_count
def analyze_subtitle_timing_and_speed(df):
    """Add timing and speed columns to the audio task table (modified in place and returned).

    Columns added: gap (seconds until the next line starts; for the last line,
    until the end of the raw audio), tolerance (gap capped at the "tolerance"
    setting), tol_dur (duration + tolerance), est_dur (estimated speaking time
    of 'text') and if_too_fast (2 / 1 / 0 / -1 speed flag).
    """
    rprint("[🔍 Analyzing] Calculating subtitle timing and speed...")
    global ESTIMATOR
    if ESTIMATOR is None:
        ESTIMATOR = init_estimator()
    TOLERANCE = load_key("tolerance")
    whole_dur = get_audio_duration(_RAW_AUDIO_FILE)

    def _to_time(stamp):
        # Task-table times are stored as HH:MM:SS.fff strings.
        return datetime.datetime.strptime(stamp, '%H:%M:%S.%f').time()

    # Gap between the end of each line and the start of the following one.
    df['gap'] = 0.0
    for row_no in range(len(df) - 1):
        ends_at = _to_time(df.loc[row_no, 'end_time'])
        resumes_at = _to_time(df.loc[row_no + 1, 'start_time'])
        df.loc[row_no, 'gap'] = time_diff_seconds(ends_at, resumes_at, datetime.date.today())

    # The last line's gap runs to the end of the whole audio.
    final_end = _to_time(df.iloc[-1]['end_time'])
    final_end_seconds = (final_end.hour * 3600 + final_end.minute * 60 +
                         final_end.second + final_end.microsecond / 1000000)
    df.iloc[-1, df.columns.get_loc('gap')] = whole_dur - final_end_seconds

    df['tolerance'] = df['gap'].apply(lambda gap: TOLERANCE if gap > TOLERANCE else gap)
    df['tol_dur'] = df['duration'] + df['tolerance']
    df['est_dur'] = df.apply(lambda row: estimate_duration(row['text'], ESTIMATOR), axis=1)

    ## Calculate speed indicators
    accept = load_key("speed_factor.accept")  # Maximum acceptable speed factor

    def _speed_flag(row):
        if row['est_dur'] / accept > row['tol_dur']:  # Even max speed factor cannot adapt
            return 2
        if row['est_dur'] > row['tol_dur']:  # Speed adjustment needed within acceptable range
            return 1
        if row['est_dur'] < row['duration'] - row['tolerance']:  # Speaking speed too slow
            return -1
        return 0  # Normal speaking speed

    df['if_too_fast'] = df.apply(_speed_flag, axis=1)
    return df
def process_cutoffs(df):
    """Mark where each dubbing chunk ends by filling the 'cut_off' column.

    Rows whose gap reaches the "tolerance" setting are cut points from the
    start; the remaining rows are walked in order and either cut individually
    or grouped with following rows through merge_rows. ``df`` is modified in
    place and returned.
    """
    rprint("[✂️ Processing] Generating cutoff points...")
    df['cut_off'] = 0  # Initialize cut_off column
    df.loc[df['gap'] >= load_key("tolerance"), 'cut_off'] = 1  # Set to 1 when gap is greater than TOLERANCE
    idx = 0
    while idx < len(df):
        # Process marked split points
        if df.iloc[idx]['cut_off'] == 1:
            # Already a cut point: only warn when the line cannot be rescued by speed-up.
            if df.iloc[idx]['if_too_fast'] == 2:
                rprint(f"[⚠️ Warning] Line {idx} is too fast and cannot be fixed by speed adjustment")
            idx += 1
            continue

        # Process the last line
        if idx + 1 >= len(df):
            df.at[idx, 'cut_off'] = 1
            break

        # Process normal or slow lines
        if df.iloc[idx]['if_too_fast'] <= 0:
            if df.iloc[idx + 1]['if_too_fast'] <= 0:
                # Neither this line nor the next needs help: cut right here.
                df.at[idx, 'cut_off'] = 1
                idx += 1
            else:
                # The next line is fast: start a chunk here so it can use this line's slack.
                idx += merge_rows(df, idx, 1)
        # Process fast lines
        else:
            idx += merge_rows(df, idx, 1)
    return df
def gen_dub_chunks():
    """Analyse the audio task table and attach the display subtitle lines to each task.

    Loads ``_8_1_AUDIO_TASK``, adds timing/speed and cut-off columns, then for
    every task row collects the consecutive lines of ``TRANS_SRT`` whose
    concatenated (punctuation- and space-free) text equals the row's text. The
    matching translated lines go to the 'lines' column and the source lines at
    the same positions to 'src_lines'. The table is written back to
    ``_8_1_AUDIO_TASK``.

    Raises:
        ValueError: When a task row cannot be matched against the remaining subtitle lines.
    """
    rprint("[🎬 Starting] Generating dubbing chunks...")
    df = pd.read_excel(_8_1_AUDIO_TASK)

    rprint("[📊 Processing] Analyzing timing and speed...")
    df = analyze_subtitle_timing_and_speed(df)

    rprint("[✂️ Processing] Processing cutoffs...")
    df = process_cutoffs(df)

    rprint("[📝 Reading] Loading transcript files...")
    # Context managers close the files instead of leaving that to garbage collection.
    with open(TRANS_SRT, "r", encoding="utf-8") as trans_file:
        content = trans_file.read()
    with open(SRC_SRT, "r", encoding="utf-8") as src_file:
        ori_content = src_file.read()

    # Parenthesised asides in ASCII or full-width parentheses. The full-width pair
    # is spelled with \uff08 / \uff09: written as plain "(" ")" that alternative is
    # a capture group matching any text, which would blank every line.
    parenthesised = r'\([^)]*\)|\uff08[^\uff09]*\uff09'

    def srt_text_lines(srt_content):
        # Text of every well-formed cue, with asides and '-' removed.
        collected = []
        for block in srt_content.strip().split('\n\n'):
            lines = [line.strip() for line in block.split('\n') if line.strip()]
            if len(lines) >= 3:
                text = ' '.join(lines[2:])
                text = re.sub(parenthesised, '', text).strip().replace('-', '')
                collected.append(text)
        return collected

    # Process translated subtitles
    content_lines = srt_text_lines(content)
    # Process source subtitles (same structure)
    ori_content_lines = srt_text_lines(ori_content)

    # Match processing
    df['lines'] = None
    df['src_lines'] = None
    last_idx = 0

    def clean_text(text):
        """clean space and punctuation"""
        if not text or not isinstance(text, str):
            return ''
        return re.sub(r'[^\w\s]|[\s]', '', text)

    for idx, row in df.iterrows():
        target = clean_text(row['text'])
        matches = []
        current = ''
        match_indices = []  # Store indices for matching lines

        for i in range(last_idx, len(content_lines)):
            line = content_lines[i]
            cleaned_line = clean_text(line)
            current += cleaned_line
            matches.append(line)  # store the original text
            match_indices.append(i)

            if current == target:
                df.at[idx, 'lines'] = matches
                df.at[idx, 'src_lines'] = [ori_content_lines[i] for i in match_indices]
                last_idx = i + 1
                break
        else:  # If no match is found
            rprint(f"[❌ Error] Matching failed at line {idx}:")
            rprint(f"Target: '{target}'")
            rprint(f"Current: '{current}'")
            raise ValueError("Matching failed")

    # Save results
    df.to_excel(_8_1_AUDIO_TASK, index=False)
    rprint("[✅ Complete] Matching completed successfully!")


if __name__ == "__main__":
    gen_dub_chunks()
================================================
FILE: core/_9_refer_audio.py
================================================
import os
from rich.panel import Panel
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from core.utils import *
from core.utils.models import *
import pandas as pd
import soundfile as sf
console = Console()
from core.asr_backend.demucs_vl import demucs_audio
from core.utils.models import *
def time_to_samples(time_str, sr):
    """Convert an ``H:M:S`` time string to a sample offset at sample rate ``sr``.

    The seconds field may carry its fraction either after a dot (``01.500``)
    or as milliseconds after a comma (``01,500``).
    """
    hours_part, minutes_part, seconds_part = time_str.split(':')
    if ',' in seconds_part:
        seconds_part, millis_part = seconds_part.split(',')
    else:
        millis_part = '0'
    seconds = int(hours_part) * 3600 + int(minutes_part) * 60 + float(seconds_part) + float(millis_part) / 1000
    return int(seconds * sr)
def extract_audio(audio_data, sr, start_time, end_time, out_file):
    """Write the slice of ``audio_data`` between two time strings to ``out_file``."""
    first_sample = time_to_samples(start_time, sr)
    last_sample = time_to_samples(end_time, sr)
    clip = audio_data[first_sample:last_sample]
    sf.write(out_file, clip, sr)
def extract_refer_audio_main():
    """Cut one reference clip per audio task out of the separated vocal track.

    Each row of ``_8_1_AUDIO_TASK`` yields ``<number>.wav`` in ``_AUDIO_REFERS_DIR``.
    """
    demucs_audio() #!!! in case demucs not run

    # NOTE(review): the skip check looks in _AUDIO_SEGS_DIR while clips are written
    # to _AUDIO_REFERS_DIR — confirm this is the intended marker.
    marker = os.path.join(_AUDIO_SEGS_DIR, '1.wav')
    if os.path.exists(marker):
        rprint(Panel("Audio segments already exist, skipping extraction", title="Info", border_style="blue"))
        return

    # Create output directory
    os.makedirs(_AUDIO_REFERS_DIR, exist_ok=True)

    # Read task file and audio data
    df = pd.read_excel(_8_1_AUDIO_TASK)
    data, sr = sf.read(_VOCAL_AUDIO_FILE)

    progress_columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    )
    with Progress(*progress_columns) as progress:
        task = progress.add_task("Extracting audio segments...", total=len(df))

        for _, task_row in df.iterrows():
            clip_name = f"{task_row['number']}.wav"
            out_file = os.path.join(_AUDIO_REFERS_DIR, clip_name)
            extract_audio(data, sr, task_row['start_time'], task_row['end_time'], out_file)
            progress.update(task, advance=1)

    rprint(Panel(f"Audio segments saved to {_AUDIO_REFERS_DIR}", title="Success", border_style="green"))


if __name__ == "__main__":
    extract_refer_audio_main()
================================================
FILE: core/__init__.py
================================================
# use try-except to avoid error when installing
try:
    from . import (
        _1_ytdlp,
        _2_asr,
        _3_1_split_nlp,
        _3_2_split_meaning,
        _4_1_summarize,
        _4_2_translate,
        _5_split_sub,
        _6_gen_sub,
        _7_sub_into_vid,
        _8_1_audio_task,
        _8_2_dub_chunks,
        _9_refer_audio,
        _10_gen_audio,
        _11_merge_audio,
        _12_dub_to_vid
    )
    from .utils import *
    from .utils.onekeycleanup import cleanup
    from .utils.delete_retry_dubbing import delete_dubbing_files
except ImportError:
    # Deliberately silent: the package must stay importable before its
    # dependencies are installed.
    pass

# Public names of the package.
# NOTE(review): when the guarded imports above fail, the names listed here are
# not bound, so `from core import *` would fail at that point.
__all__ = [
    'ask_gpt',
    'load_key',
    'update_key',
    'cleanup',
    'delete_dubbing_files',
    '_1_ytdlp',
    '_2_asr',
    '_3_1_split_nlp',
    '_3_2_split_meaning',
    '_4_1_summarize',
    '_4_2_translate',
    '_5_split_sub',
    '_6_gen_sub',
    '_7_sub_into_vid',
    '_8_1_audio_task',
    '_8_2_dub_chunks',
    '_9_refer_audio',
    '_10_gen_audio',
    '_11_merge_audio',
    '_12_dub_to_vid'
]
================================================
FILE: core/asr_backend/__init__.py
================================================
================================================
FILE: core/asr_backend/audio_preprocess.py
================================================
import os, subprocess
import pandas as pd
from typing import Dict, List, Tuple
from pydub import AudioSegment
from core.utils import *
from core.utils.models import *
from pydub import AudioSegment
from pydub.silence import detect_silence
from pydub.utils import mediainfo
from rich import print as rprint
def _ffmpeg_has_encoder(encoder_name: str) -> bool:
"""Check if the current ffmpeg installation supports a given audio encoder."""
try:
result = subprocess.run(
['ffmpeg', '-encoders'], capture_output=True, text=True, timeout=10
)
return encoder_name in result.stdout
except Exception:
return False
def normalize_audio_volume(audio_path, output_path, target_db = -20.0, format = "wav"):
    """Apply a uniform gain so the file's dBFS level becomes ``target_db``, export
    it to ``output_path`` in ``format``, and return ``output_path``."""
    source = AudioSegment.from_file(audio_path)
    gain = target_db - source.dBFS
    leveled = source.apply_gain(gain)
    leveled.export(output_path, format=format)
    rprint(f"[green]✅ Audio normalized from {source.dBFS:.1f}dB to {target_db:.1f}dB[/green]")
    return output_path
def convert_video_to_audio(video_file: str):
    """Extract a 16 kHz mono audio track from ``video_file`` into ``_RAW_AUDIO_FILE``.

    Nothing is done when ``_RAW_AUDIO_FILE`` already exists. MP3 (32k) is used
    when ffmpeg offers ``libmp3lame``; otherwise PCM WAV data is written to the
    same path.
    """
    os.makedirs(_AUDIO_DIR, exist_ok=True)
    if os.path.exists(_RAW_AUDIO_FILE):
        return

    rprint(f"[blue]🎬➡️🎵 Converting to high quality audio with FFmpeg ......[/blue]")
    common_args = ['ffmpeg', '-y', '-i', video_file, '-vn']
    if _ffmpeg_has_encoder('libmp3lame'):
        codec_args = [
            '-c:a', 'libmp3lame', '-b:a', '32k',
            '-ar', '16000', '-ac', '1',
            '-metadata', 'encoding=UTF-8', _RAW_AUDIO_FILE
        ]
    else:
        # Fallback: conda-forge ffmpeg often lacks libmp3lame.
        # Output as WAV (PCM) which all ffmpeg builds support.
        # Downstream readers (pydub, librosa, whisperX) detect format by
        # file header, not extension, so .mp3 path with WAV content works.
        rprint("[yellow]⚠️ libmp3lame not found in ffmpeg, falling back to WAV (PCM) encoding[/yellow]")
        codec_args = [
            '-c:a', 'pcm_s16le', '-ar', '16000', '-ac', '1',
            '-f', 'wav', _RAW_AUDIO_FILE
        ]

    subprocess.run(common_args + codec_args, check=True, stderr=subprocess.PIPE)
    rprint(f"[green]🎬➡️🎵 Converted <{video_file}> to <{_RAW_AUDIO_FILE}> with FFmpeg\n[/green]")
def get_audio_duration(audio_file: str) -> float:
    """Get the duration of an audio file using ffmpeg.

    Runs ``ffmpeg -i <audio_file>`` and parses the ``Duration: HH:MM:SS.xx``
    line that ffmpeg prints on stderr.

    Returns:
        The duration in seconds, or 0.0 when no duration line could be parsed.
    """
    cmd = ['ffmpeg', '-i', audio_file]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, stderr = process.communicate()
    output = stderr.decode('utf-8', errors='ignore')

    try:
        duration_str = [line for line in output.split('\n') if 'Duration' in line][0]
        duration_parts = duration_str.split('Duration: ')[1].split(',')[0].split(':')
        duration = float(duration_parts[0])*3600 + float(duration_parts[1])*60 + float(duration_parts[2])
    except Exception as e:
        # rprint (rich) renders the [red] markup; the builtin print showed the tags literally.
        rprint(f"[red]❌ Error: Failed to get audio duration: {e}[/red]")
        duration = 0.0
    return duration
def split_audio(audio_file: str, target_len: float = 30*60, win: float = 60) -> List[Tuple[float, float]]:
    """Split an audio file into consecutive (start, end) ranges of roughly ``target_len`` seconds.

    Args:
        audio_file: Path of the audio file to analyse.
        target_len: Desired segment length in seconds.
        win: Half-width, in seconds, of the window around each target point
            that is searched for silence.

    Returns:
        A list of (start, end) tuples in seconds covering the file. A file no
        longer than ``target_len + win`` is returned as a single range.
    """
    ## Detect silence with pydub inside [target_len-win, target_len+win] and split the audio there
    rprint(f"[blue]🎙️ Starting audio segmentation {audio_file} {target_len} {win}[/blue]")
    audio = AudioSegment.from_file(audio_file)
    duration = float(mediainfo(audio_file)["duration"])
    if duration <= target_len + win:
        return [(0, duration)]
    segments, pos = [], 0.0
    safe_margin = 0.5  # Safety margin kept before and after the split point, in seconds

    while pos < duration:
        if duration - pos <= target_len:
            segments.append((pos, duration)); break

        threshold = pos + target_len
        # Window bounds in milliseconds, as used for slicing the AudioSegment.
        ws, we = int((threshold - win) * 1000), int((threshold + win) * 1000)

        # Get the complete silent regions
        silence_regions = detect_silence(audio[ws:we], min_silence_len=int(safe_margin*1000), silence_thresh=-30)
        # Convert window-relative milliseconds back to absolute seconds.
        silence_regions = [(s/1000 + (threshold - win), e/1000 + (threshold - win)) for s, e in silence_regions]
        # Keep silent regions that are long enough (at least 1 second) and suitably placed.
        # Only split points in [threshold, threshold + win] are accepted, i.e. the
        # second half of the searched window.
        valid_regions = [
            (start, end) for start, end in silence_regions
            if (end - start) >= (safe_margin * 2) and threshold <= start + safe_margin <= threshold + win
        ]

        if valid_regions:
            start, end = valid_regions[0]
            split_at = start + safe_margin  # Split 0.5 seconds after the start of the silent region
        else:
            rprint(f"[yellow]⚠️ No valid silence regions found for {audio_file} at {threshold}s, using threshold[/yellow]")
            split_at = threshold

        segments.append((pos, split_at)); pos = split_at

    rprint(f"[green]🎙️ Audio split completed {len(segments)} segments[/green]")
    return segments
def process_transcription(result: Dict) -> pd.DataFrame:
    """Flatten a segment/word transcription result into one row per word.

    Args:
        result: Mapping with a 'segments' list; each segment has a 'words' list
            of dicts holding 'word' and, usually, 'start' and 'end'. A segment
            may carry a 'speaker_id'.

    Returns:
        A DataFrame with columns text, start, end and speaker_id. Words longer
        than 30 characters are skipped. Guillemets are removed from the text
        (the input word dicts are updated in place).

    Raises:
        Exception: When the very first word has no timestamps and no later word
            in the same segment has them either.
    """
    all_words = []
    for segment in result['segments']:
        # Get speaker_id, if not exists, set to None
        speaker_id = segment.get('speaker_id', None)

        for word in segment['words']:
            # Check word length
            if len(word["word"]) > 30:
                rprint(f"[yellow]⚠️ Warning: Detected word longer than 30 characters, skipping: {word['word']}[/yellow]")
                continue

            # ! For French, we need to convert guillemets to empty strings
            word["word"] = word["word"].replace('»', '').replace('«', '')

            if 'start' not in word and 'end' not in word:
                if all_words:
                    # Assign the end time of the previous word as the start and end time of the current word
                    start = end = all_words[-1]['end']
                else:
                    # If it's the first word, look next for a timestamp then assign it to the current word
                    next_word = next((w for w in segment['words'] if 'start' in w and 'end' in w), None)
                    if not next_word:
                        raise Exception(f"No next word with timestamp found for the current word : {word}")
                    start, end = next_word["start"], next_word["end"]
            else:
                # At least one timestamp is present. A missing start falls back to the
                # previous word's end; a missing end falls back to the start, so a
                # word carrying only 'start' no longer raises KeyError.
                start = word.get('start', all_words[-1]['end'] if all_words else 0)
                end = word.get('end', start)

            all_words.append({
                'text': f'{word["word"]}',
                'start': start,
                'end': end,
                'speaker_id': speaker_id
            })

    return pd.DataFrame(all_words)
def save_results(df: pd.DataFrame):
    """Clean the word table and save it to the ``_2_CLEANED_CHUNKS`` Excel file.

    Rows with empty text and rows whose text exceeds 30 characters are dropped;
    every remaining text is wrapped in double quotes before saving. The frame
    passed in is not modified.
    """
    os.makedirs('output/log', exist_ok=True)

    # Remove rows where 'text' is empty
    initial_rows = len(df)
    df = df[df['text'].str.len() > 0]
    removed_rows = initial_rows - len(df)
    if removed_rows > 0:
        rprint(f"[blue]ℹ️ Removed {removed_rows} row(s) with empty text.[/blue]")

    # Check for and remove words longer than 30 characters
    long_words = df[df['text'].str.len() > 30]
    if not long_words.empty:
        rprint(f"[yellow]⚠️ Warning: Detected {len(long_words)} word(s) longer than 30 characters. These will be removed.[/yellow]")
        df = df[df['text'].str.len() <= 30]

    # Take an explicit copy: assigning a column on a filtered slice triggers
    # pandas' SettingWithCopyWarning.
    df = df.copy()
    df['text'] = df['text'].apply(lambda x: f'"{x}"')
    df.to_excel(_2_CLEANED_CHUNKS, index=False)
    rprint(f"[green]📊 Excel file saved to {_2_CLEANED_CHUNKS}[/green]")
def save_language(language: str):
    """Store ``language`` under the "whisper.detected_language" config key."""
    update_key("whisper.detected_language", language)
================================================
FILE: core/asr_backend/demucs_vl.py
================================================
import os
import torch
from rich.console import Console
from rich import print as rprint
from demucs.pretrained import get_model
from demucs.audio import save_audio
from torch.cuda import is_available as is_cuda_available
from typing import Optional
from demucs.api import Separator
from demucs.apply import BagOfModels
import gc
from core.utils.models import *
class PreloadedSeparator(Separator):
    """A demucs ``Separator`` built around a model object that is already loaded.

    The base-class ``__init__`` is not called; the model and its audio
    parameters are attached directly.
    NOTE(review): presumably this avoids the model loading done by
    ``Separator.__init__`` — confirm against demucs.api.
    """
    def __init__(self, model: BagOfModels, shifts: int = 1, overlap: float = 0.25,
                 split: bool = True, segment: Optional[int] = None, jobs: int = 0):
        self._model, self._audio_channels, self._samplerate = model, model.audio_channels, model.samplerate
        # Prefer CUDA, then Apple MPS, then CPU.
        device = "cuda" if is_cuda_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        self.update_parameter(device=device, shifts=shifts, overlap=overlap, split=split,
                            segment=segment, jobs=jobs, progress=True, callback=None, callback_arg=None)
def demucs_audio():
    """Separate ``_RAW_AUDIO_FILE`` into a vocals track and a background track.

    Uses the 'htdemucs' model. The vocals go to ``_VOCAL_AUDIO_FILE`` and the sum
    of all other stems to ``_BACKGROUND_AUDIO_FILE``. Skipped when both output
    files already exist.
    """
    already_separated = os.path.exists(_VOCAL_AUDIO_FILE) and os.path.exists(_BACKGROUND_AUDIO_FILE)
    if already_separated:
        rprint(f"[yellow]⚠️ {_VOCAL_AUDIO_FILE} and {_BACKGROUND_AUDIO_FILE} already exist, skip Demucs processing.[/yellow]")
        return

    console = Console()
    os.makedirs(_AUDIO_DIR, exist_ok=True)

    console.print("🤖 Loading model...")
    model = get_model('htdemucs')
    separator = PreloadedSeparator(model=model, shifts=1, overlap=0.25)

    console.print("🎵 Separating audio...")
    _, outputs = separator.separate_audio_file(_RAW_AUDIO_FILE)

    # Export settings shared by both tracks.
    kwargs = dict(
        samplerate=model.samplerate, bitrate=128, preset=2,
        clip="rescale", as_float=False, bits_per_sample=16,
    )

    console.print("🎤 Saving vocals track...")
    vocals = outputs['vocals'].cpu()
    save_audio(vocals, _VOCAL_AUDIO_FILE, **kwargs)

    console.print("🎹 Saving background music...")
    background = sum(track for stem, track in outputs.items() if stem != 'vocals')
    save_audio(background.cpu(), _BACKGROUND_AUDIO_FILE, **kwargs)

    # Clean up memory
    del vocals
    del outputs, background, model, separator
    gc.collect()

    console.print("[green]✨ Audio separation completed![/green]")


if __name__ == "__main__":
    demucs_audio()
================================================
FILE: core/asr_backend/elevenlabs_asr.py
================================================
import os
import json
import time
import requests
import tempfile
import librosa
import soundfile as sf
from rich import print as rprint
from core.utils import *
# ----------------------------------------
# ISO 639-2 to 1
# ----------------------------------------
# Maps three-letter language codes to the two-letter codes stored in the config.
# Codes missing from this table are passed through unchanged by the lookup in
# transcribe_audio_elevenlabs. Note that "yue" is folded into "zh".
iso_639_2_to_1 = {
    "eng": "en",
    "fra": "fr",
    "deu": "de",
    "ita": "it",
    "spa": "es",
    "rus": "ru",
    "kor": "ko",
    "jpn": "ja",
    "zho": "zh",
    "yue": "zh"
}
# ----------------------------
# elevenlabs format to whisper format
# ----------------------------
# Gap between the end of one word and the start of the next (same unit as the
# word timestamps) above which a new segment is started.
SPLIT_GAP = 1


def elev2whisper(elev_json, word_level_timestamp = False):
    """Convert an ElevenLabs transcription payload into whisper-style segments.

    Consecutive words are merged into one segment until either the pause before
    the next word exceeds ``SPLIT_GAP`` or the speaker changes.

    Args:
        elev_json: dict with a ``"words"`` list; each word needs ``"text"``,
            ``"start"`` and ``"end"``. ``"speaker_id"`` is optional and is
            treated as ``None`` when absent.
        word_level_timestamp: when True each segment keeps a ``"words"`` list
            of ``{"text", "start", "end"}`` dicts; when False the key is omitted.

    Returns:
        ``{"segments": [...]}`` where each segment has ``text``, ``start``,
        ``end``, ``speaker_id`` and optionally ``words``.
    """
    words = elev_json.get("words", [])
    if not words:
        return {"segments": []}

    def _new_segment(first_word):
        # Seed a segment from its first word; text and words are filled in the loop.
        return {
            "text": "",
            "start": first_word["start"],
            "end": first_word["end"],
            # .get(): words carry no speaker_id when diarization data is absent.
            "speaker_id": first_word.get("speaker_id"),
            "words": [],
        }

    segments = []
    seg = _new_segment(words[0])
    # Pair every word with its successor; the trailing None marks the end.
    for prev, nxt in zip(words, words[1:] + [None]):
        seg["text"] += prev["text"]
        seg["end"] = prev["end"]
        if word_level_timestamp:
            seg["words"].append({"text": prev["text"], "start": prev["start"], "end": prev["end"]})

        is_last = nxt is None
        if is_last or (nxt["start"] - prev["end"] > SPLIT_GAP) or (nxt.get("speaker_id") != seg["speaker_id"]):
            seg["text"] = seg["text"].strip()
            if not word_level_timestamp:
                seg.pop("words")
            segments.append(seg)
            if not is_last:
                seg = _new_segment(nxt)
    return {"segments": segments}
def transcribe_audio_elevenlabs(raw_audio_path, vocal_audio_path, start = None, end = None):
    """Transcribe a slice of the vocal track with the ElevenLabs speech-to-text API.

    The result is cached in ``output/log/elevenlabs_transcribe_{start}_{end}.json``;
    when that file exists it is returned without calling the API.

    Args:
        raw_audio_path: not used by this backend (kept for a signature parallel
            to the other ASR backends).
        vocal_audio_path: audio file to transcribe.
        start: slice start in seconds; if ``start`` or ``end`` is None the
            whole file is used.
        end: slice end in seconds.

    Returns:
        The whisper-style dict produced by ``elev2whisper``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    rprint(f"[cyan]🎤 Processing audio transcription, file path: {vocal_audio_path}[/cyan]")
    LOG_FILE = f"output/log/elevenlabs_transcribe_{start}_{end}.json"
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            return json.load(f)

    # Load audio and process start/end parameters
    y, sr = librosa.load(vocal_audio_path, sr=16000)
    audio_duration = len(y) / sr
    if start is None or end is None:
        start = 0
        end = audio_duration

    # Slice audio based on start/end
    start_sample = int(start * sr)
    end_sample = int(end * sr)
    y_slice = y[start_sample:end_sample]

    # Reserve a temporary file name for the sliced audio
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_file:
        temp_filepath = temp_file.name

    try:
        # Written inside the try so the temp file is removed even if encoding fails.
        sf.write(temp_filepath, y_slice, sr, format='MP3')

        api_key = load_key("whisper.elevenlabs_api_key")
        base_url = "https://api.elevenlabs.io/v1/speech-to-text"
        headers = {"xi-api-key": api_key}
        data = {
            "model_id": "scribe_v1",
            "timestamps_granularity": "word",
            "language_code": load_key("whisper.language"),
            "diarize": True,
            "num_speakers": None,
            "tag_audio_events": False
        }
        with open(temp_filepath, 'rb') as audio_file:
            files = {"file": (os.path.basename(temp_filepath), audio_file, 'audio/mpeg')}
            start_time = time.time()
            response = requests.post(base_url, headers=headers, data=data, files=files)
        rprint(f"[yellow]API request sent, status code: {response.status_code}[/yellow]")
        # Fail with the HTTP error instead of a KeyError on the missing fields below.
        response.raise_for_status()
        result = response.json()

        # save detected language
        detected_language = iso_639_2_to_1.get(result["language_code"], result["language_code"])
        update_key("whisper.detected_language", detected_language)

        # Shift word timestamps from slice-relative to file-relative
        # (start is always a number at this point).
        for word in result.get('words', []):
            if 'start' in word:
                word['start'] += start
            if 'end' in word:
                word['end'] += start

        rprint(f"[green]✓ Transcription completed in {time.time() - start_time:.2f} seconds[/green]")
        parsed_result = elev2whisper(result)
        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
        with open(LOG_FILE, "w", encoding="utf-8") as f:
            json.dump(parsed_result, f, indent=4, ensure_ascii=False)
        return parsed_result
    finally:
        # Clean up the temporary file
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)
if __name__ == "__main__":
    # Manual entry point for trying the ElevenLabs backend on a local file.
    file_path = input("Enter local audio file path (mp3 format): ")
    language = input("Enter language code for transcription (en or zh or other...): ")
    # transcribe_audio_elevenlabs has no language parameter; it reads
    # "whisper.language" from the config, so the choice is stored there.
    if language.strip():
        update_key("whisper.language", language.strip())
    # The same file is used as both the raw and the vocal input.
    result = transcribe_audio_elevenlabs(file_path, file_path)
    print(result)
    # Save result to file
    os.makedirs("output", exist_ok=True)
    with open("output/transcript.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=4)
================================================
FILE: core/asr_backend/whisperX_302.py
================================================
import os
import io
import json
import time
import requests
import librosa
import soundfile as sf
from rich import print as rprint
from core.utils import *
from core.utils.models import *
OUTPUT_LOG_DIR = "output/log"


def transcribe_audio_302(raw_audio_path: str, vocal_audio_path: str, start: float = None, end: float = None):
    """Transcribe a slice of the vocal track through the 302.ai WhisperX endpoint.

    The response is cached in ``output/log/whisperx302_{start}_{end}.json`` and
    returned from there on later calls with the same ``start``/``end``.

    Args:
        raw_audio_path: not used by this backend.
        vocal_audio_path: audio file to transcribe.
        start: slice start in seconds; if ``start`` or ``end`` is None the
            whole file is used.
        end: slice end in seconds.

    Returns:
        The decoded JSON response with segment and word timestamps shifted by
        ``start``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    os.makedirs(OUTPUT_LOG_DIR, exist_ok=True)
    LOG_FILE = f"{OUTPUT_LOG_DIR}/whisperx302_{start}_{end}.json"
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            return json.load(f)

    WHISPER_LANGUAGE = load_key("whisper.language")
    update_key("whisper.language", WHISPER_LANGUAGE)
    url = "https://api.302.ai/302/whisperx"

    y, sr = librosa.load(vocal_audio_path, sr=16000)
    audio_duration = len(y) / sr
    if start is None or end is None:
        start = 0
        end = audio_duration
    start_sample = int(start * sr)
    end_sample = int(end * sr)
    y_slice = y[start_sample:end_sample]

    # Encode the slice as 16-bit WAV in memory; nothing is written to disk.
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, y_slice, sr, format='WAV', subtype='PCM_16')
    audio_buffer.seek(0)

    files = [('audio_input', ('audio_slice.wav', audio_buffer, 'application/octet-stream'))]
    payload = {"processing_type": "align", "language": WHISPER_LANGUAGE, "output": "raw"}
    start_time = time.time()
    rprint(f"[cyan]🎤 Transcribing audio with language: <{WHISPER_LANGUAGE}> ...[/cyan]")
    headers = {'Authorization': f'Bearer {load_key("whisper.whisperX_302_api_key")}'}
    response = requests.request("POST", url, headers=headers, data=payload, files=files)
    # Fail with the HTTP error instead of a KeyError on 'segments' below.
    response.raise_for_status()
    response_json = response.json()

    # Shift timestamps from slice-relative to file-relative
    # (start is always a number at this point).
    for segment in response_json['segments']:
        segment['start'] += start
        segment['end'] += start
        for word in segment.get('words', []):
            if 'start' in word:
                word['start'] += start
            if 'end' in word:
                word['end'] += start

    with open(LOG_FILE, "w", encoding="utf-8") as f:
        json.dump(response_json, f, indent=4, ensure_ascii=False)
    elapsed_time = time.time() - start_time
    rprint(f"[green]✓ Transcription completed in {elapsed_time:.2f} seconds[/green]")
    return response_json
if __name__ == "__main__":
    # Manual run: the raw audio file is passed as both the raw and the vocal input.
    result = transcribe_audio_302(_RAW_AUDIO_FILE, _RAW_AUDIO_FILE)
    rprint(result)
================================================
FILE: core/asr_backend/whisperX_local.py
================================================
import os
import warnings
import time
import subprocess
import torch
import functools
warnings.filterwarnings("ignore")
# =============================================================================
# Compatibility shim — applied BEFORE importing whisperx
# =============================================================================
# torch.load: default weights_only=False for pyannote checkpoints
# PyTorch >=2.6 changed torch.load default to weights_only=True.
# pyannote checkpoints contain omegaconf objects that fail the safety check.
# Monkey-patch torch.load to default to weights_only=False (matching <2.6
# behavior). This is safe here because all model files come from trusted
# sources (HuggingFace / pyannote).
_original_torch_load = torch.load


@functools.wraps(_original_torch_load)
def _patched_torch_load(*args, **kwargs):
    # Only fill in the default: an explicit weights_only=True/False passed by
    # the caller is left alone, while a missing or None value becomes False.
    caller_left_it_unset = kwargs.get("weights_only") is None
    if caller_left_it_unset:
        kwargs["weights_only"] = False
    return _original_torch_load(*args, **kwargs)


# Install the wrapper globally; this must happen before whisperx is imported.
torch.load = _patched_torch_load
# =============================================================================
# Now safe to import whisperx and the rest of the application
# =============================================================================
import whisperx
from whisperx.audio import load_audio as _whisperx_load_audio, SAMPLE_RATE as _WHISPERX_SR
from rich import print as rprint
from core.utils import *
MODEL_DIR = load_key("model_dir")
@except_handler("failed to check hf mirror", default_return=None)
def check_hf_mirror():
    """Ping the known HuggingFace hosts and return the URL of the fastest one.

    Each host is pinged once. A host counts as reachable only when ``ping``
    exits with status 0. If none is reachable (or ``ping`` cannot be executed)
    the official URL is returned.

    Returns:
        str: ``https://<host>`` of the selected mirror.
    """
    mirrors = {'Official': 'huggingface.co', 'Mirror': 'hf-mirror.com'}
    fastest_url = f"https://{mirrors['Official']}"
    best_time = float('inf')
    rprint("[cyan]🔍 Checking HuggingFace mirrors...[/cyan]")
    for name, domain in mirrors.items():
        # ping takes different count/timeout flags on Windows and elsewhere.
        if os.name == 'nt':
            cmd = ['ping', '-n', '1', '-w', '3000', domain]
        else:
            cmd = ['ping', '-c', '1', '-W', '3', domain]
        start = time.time()
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
        except OSError:
            # ping is missing or not executable: treat this host as unreachable
            # instead of aborting the whole probe.
            continue
        response_time = time.time() - start
        if result.returncode == 0:
            if response_time < best_time:
                best_time = response_time
                fastest_url = f"https://{domain}"
            rprint(f"[green]✓ {name}:[/green] {response_time:.2f}s")
    if best_time == float('inf'):
        rprint("[yellow]⚠️ All mirrors failed, using default[/yellow]")
        # No measured time exists in this case, so none is printed.
        rprint(f"[cyan]🚀 Selected mirror:[/cyan] {fastest_url}")
    else:
        rprint(f"[cyan]🚀 Selected mirror:[/cyan] {fastest_url} ({best_time:.2f}s)")
    return fastest_url
@except_handler("WhisperX processing error:")
def transcribe_audio(raw_audio_file, vocal_audio_file, start, end):
    """Transcribe one audio segment locally with WhisperX and align it.

    The raw audio is used for transcription and the vocal track for word
    alignment. All returned timestamps are shifted by ``start`` so they are
    relative to the full file.

    Args:
        raw_audio_file: path of the original audio, used for transcription.
        vocal_audio_file: path of the vocals-only audio, used for alignment.
        start: segment start in seconds.
        end: segment end in seconds.

    Returns:
        The dict returned by ``whisperx.align`` with adjusted timestamps.

    Raises:
        ValueError: if Chinese is detected while the configured language is
            not ``zh``.
    """
    # check_hf_mirror returns None when the probe itself fails; assigning None
    # to os.environ would raise TypeError, so only set a real URL.
    mirror = check_hf_mirror()
    if mirror:
        os.environ['HF_ENDPOINT'] = mirror

    WHISPER_LANGUAGE = load_key("whisper.language")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rprint(f"🚀 Starting WhisperX using device: {device} ...")

    if device == "cuda":
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        batch_size = 16 if gpu_mem > 8 else 2
        compute_type = "float16" if torch.cuda.is_bf16_supported() else "int8"
        rprint(f"[cyan]🎮 GPU memory:[/cyan] {gpu_mem:.2f} GB, [cyan]📦 Batch size:[/cyan] {batch_size}, [cyan]⚙️ Compute type:[/cyan] {compute_type}")
    else:
        batch_size = 1
        compute_type = "int8"
        rprint(f"[cyan]📦 Batch size:[/cyan] {batch_size}, [cyan]⚙️ Compute type:[/cyan] {compute_type}")
    rprint(f"[green]▶️ Starting WhisperX for segment {start:.2f}s to {end:.2f}s...[/green]")

    # Chinese uses a dedicated punctuation-aware model; other languages use the
    # configured one. A copy under MODEL_DIR takes precedence over downloading.
    if WHISPER_LANGUAGE == 'zh':
        model_name = "Huan69/Belle-whisper-large-v3-zh-punct-fasterwhisper"
        local_model = os.path.join(MODEL_DIR, "Belle-whisper-large-v3-zh-punct-fasterwhisper")
    else:
        model_name = load_key("whisper.model")
        local_model = os.path.join(MODEL_DIR, model_name)

    if os.path.exists(local_model):
        rprint(f"[green]📥 Loading local WHISPER model:[/green] {local_model} ...")
        model_name = local_model
    else:
        rprint(f"[green]📥 Using WHISPER model from HuggingFace:[/green] {model_name} ...")

    vad_options = {"vad_onset": 0.500,"vad_offset": 0.363}
    asr_options = {"temperatures": [0],"initial_prompt": "",}
    # Any configured value containing 'auto' lets WhisperX detect the language.
    whisper_language = None if 'auto' in WHISPER_LANGUAGE else WHISPER_LANGUAGE
    rprint("[bold yellow] You can ignore warning of `Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118...`[/bold yellow]")
    model = whisperx.load_model(model_name, device, compute_type=compute_type, language=whisper_language, vad_options=vad_options, asr_options=asr_options, download_root=MODEL_DIR)

    def load_audio_segment(audio_file, start, end):
        # Use whisperx's ffmpeg-based loader instead of librosa.load() which
        # deadlocks inside Streamlit's ScriptRunner thread.
        full_audio = _whisperx_load_audio(audio_file, sr=_WHISPERX_SR)
        start_sample = int(start * _WHISPERX_SR)
        end_sample = int(end * _WHISPERX_SR)
        return full_audio[start_sample:end_sample]

    raw_audio_segment = load_audio_segment(raw_audio_file, start, end)
    vocal_audio_segment = load_audio_segment(vocal_audio_file, start, end)

    # -------------------------
    # 1. transcribe raw audio
    # -------------------------
    transcribe_start_time = time.time()
    rprint("[bold green]Note: You will see Progress if working correctly ↓[/bold green]")
    result = model.transcribe(raw_audio_segment, batch_size=batch_size, print_progress=True)
    transcribe_time = time.time() - transcribe_start_time
    rprint(f"[cyan]⏱️ time transcribe:[/cyan] {transcribe_time:.2f}s")

    # Free GPU resources
    del model
    torch.cuda.empty_cache()

    # Save language
    # NOTE(review): other code in this project reads "whisper.detected_language";
    # confirm that writing "whisper.language" here is intended.
    update_key("whisper.language", result['language'])
    if result['language'] == 'zh' and WHISPER_LANGUAGE != 'zh':
        raise ValueError("Please specify the transcription language as zh and try again!")

    # -------------------------
    # 2. align by vocal audio
    # -------------------------
    align_start_time = time.time()
    # Align timestamps using vocal audio
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, vocal_audio_segment, device, return_char_alignments=False)
    align_time = time.time() - align_start_time
    rprint(f"[cyan]⏱️ time align:[/cyan] {align_time:.2f}s")

    # Free GPU resources again: drop the reference first so the cache flush
    # can actually release the alignment model's memory.
    del model_a
    torch.cuda.empty_cache()

    # Adjust timestamps from segment-relative to file-relative
    for segment in result['segments']:
        segment['start'] += start
        segment['end'] += start
        for word in segment['words']:
            if 'start' in word:
                word['start'] += start
            if 'end' in word:
                word['end'] += start
    return result
================================================
FILE: core/prompts.py
================================================
import json
from core.utils import *
## ================================================================
# @ step4_splitbymeaning.py
def get_split_prompt(sentence, num_parts = 2, word_limit = 20):
    """Build the LLM prompt that asks for a subtitle sentence to be split.

    Args:
        sentence: the subtitle text to split.
        num_parts: number of parts requested in the prompt.
        word_limit: per-part word limit stated in the prompt.

    Returns:
        str: the prompt, stripped of leading/trailing whitespace. It asks for a
        JSON object with the keys analysis, split1, split2, assess and choice.
    """
    # The language named in the prompt is the detected source language from config.
    language = load_key("whisper.detected_language")
    split_prompt = f"""
## Role
You are a professional Netflix subtitle splitter in **{language}**.
## Task
Split the given subtitle text into **{num_parts}** parts, each less than **{word_limit}** words.
1. Maintain sentence meaning coherence according to Netflix subtitle standards
2. MOST IMPORTANT: Keep parts roughly equal in length (minimum 3 words each)
3. Split at natural points like punctuation marks or conjunctions
4. If provided text is repeated words, simply split at the middle of the repeated words.
## Steps
1. Analyze the sentence structure, complexity, and key splitting challenges
2. Generate two alternative splitting approaches with [br] tags at split positions
3. Compare both approaches highlighting their strengths and weaknesses
4. Choose the best splitting approach
## Given Text
{sentence}
## Output in only JSON format and no other text
```json
{{
"analysis": "Brief description of sentence structure, complexity, and key splitting challenges",
"split1": "First splitting approach with [br] tags at split positions",
"split2": "Alternative splitting approach with [br] tags at split positions",
"assess": "Comparison of both approaches highlighting their strengths and weaknesses",
"choice": "1 or 2"
}}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
""".strip()
    return split_prompt
# NOTE(review): the bare string below is not assigned or used anywhere, so it
# has no runtime effect; it looks like a leftover of a simpler output schema.
"""{{
"analysis": "Brief analysis of the text structure",
"split": "Complete sentence with [br] tags at split positions"
}}"""
## ================================================================
# @ step4_1_summarize.py
def get_summary_prompt(source_content, custom_terms_json=None):
    """Build the LLM prompt that requests a topic summary and a term glossary.

    Args:
        source_content: source-language text to summarize.
        custom_terms_json: optional dict with a ``"terms"`` list whose items
            have ``src``, ``tgt`` and ``note``; these are listed in the prompt
            as terms to leave out of the extraction.

    Returns:
        str: the prompt, stripped of leading/trailing whitespace.
    """
    src_lang = load_key("whisper.detected_language")
    tgt_lang = load_key("target_language")

    # Optional section listing terms the model should not extract again.
    if custom_terms_json:
        existing = "\n".join(
            f"- {entry['src']}: {entry['tgt']} ({entry['note']})"
            for entry in custom_terms_json['terms']
        )
        terms_note = "\n### Existing Terms\nPlease exclude these terms in your extraction:\n" + existing
    else:
        terms_note = ""

    summary_prompt = f"""
## Role
You are a video translation expert and terminology consultant, specializing in {src_lang} comprehension and {tgt_lang} expression optimization.
## Task
For the provided {src_lang} video text:
1. Summarize main topic in two sentences
2. Extract professional terms/names with {tgt_lang} translations (excluding existing terms)
3. Provide brief explanation for each term
{terms_note}
Steps:
1. Topic Summary:
- Quick scan for general understanding
- Write two sentences: first for main topic, second for key point
2. Term Extraction:
- Mark professional terms and names (excluding those listed in Existing Terms)
- Provide {tgt_lang} translation or keep original
- Add brief explanation
- Extract less than 15 terms
## INPUT
{source_content}
## Output in only JSON format and no other text
{{
"theme": "Two-sentence video summary",
"terms": [
{{
"src": "{src_lang} term",
"tgt": "{tgt_lang} translation or original",
"note": "Brief explanation"
}},
...
]
}}
## Example
{{
"theme": "本视频介绍人工智能在医疗领域的应用现状。重点展示了AI在医学影像诊断和药物研发中的突破性进展。",
"terms": [
{{
"src": "Machine Learning",
"tgt": "机器学习",
"note": "AI的核心技术,通过数据训练实现智能决策"
}},
{{
"src": "CNN",
"tgt": "CNN",
"note": "卷积神经网络,用于医学图像识别的深度学习模型"
}}
]
}}
Note: Start you answer with ```json and end with ```, do not add any other text.
"""
    return summary_prompt.strip()
## ================================================================
# @ step5_translate.py & translate_lines.py
def generate_shared_prompt(previous_content_prompt, after_content_prompt, summary_prompt, things_to_note_prompt):
    """Assemble the context block shared by the translation prompts.

    The four arguments are inserted verbatim under three headings: the
    previous/after content under "Context Information", the summary under
    "Content Summary" and the notes under "Points to Note".

    Returns:
        str: the combined text (not stripped).
    """
    return f'''### Context Information
{previous_content_prompt}
{after_content_prompt}
### Content Summary
{summary_prompt}
### Points to Note
{things_to_note_prompt}'''
def get_prompt_faithfulness(lines, shared_prompt):
    """Build the prompt for the first, literal translation pass.

    Args:
        lines: source subtitle lines joined with ``\\n``.
        shared_prompt: context block inserted verbatim into the prompt.

    Returns:
        str: the stripped prompt. Its output template is a JSON object keyed
        "1", "2", ... with one ``origin``/``direct`` pair per input line.
    """
    TARGET_LANGUAGE = load_key("target_language")

    # One numbered entry per input line, starting at 1.
    template = {
        f"{number}": {
            "origin": source_line,
            "direct": f"direct {TARGET_LANGUAGE} translation {number}.",
        }
        for number, source_line in enumerate(lines.split('\n'), 1)
    }
    json_format = json.dumps(template, indent=2, ensure_ascii=False)

    src_language = load_key("whisper.detected_language")
    prompt_faithfulness = f'''
## Role
You are a professional Netflix subtitle translator, fluent in both {src_language} and {TARGET_LANGUAGE}, as well as their respective cultures.
Your expertise lies in accurately understanding the semantics and structure of the original {src_language} text and faithfully translating it into {TARGET_LANGUAGE} while preserving the original meaning.
## Task
We have a segment of original {src_language} subtitles that need to be directly translated into {TARGET_LANGUAGE}. These subtitles come from a specific context and may contain specific themes and terminology.
1. Translate the original {src_language} subtitles into {TARGET_LANGUAGE} line by line
2. Ensure the translation is faithful to the original, accurately conveying the original meaning
3. Consider the context and professional terminology
{shared_prompt}
1. Faithful to the original: Accurately convey the content and meaning of the original text, without arbitrarily changing, adding, or omitting content.
2. Accurate terminology: Use professional terms correctly and maintain consistency in terminology.
3. Understand the context: Fully comprehend and reflect the background and contextual relationships of the text.
## INPUT
{lines}
## Output in only JSON format and no other text
```json
{json_format}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
'''
    return prompt_faithfulness.strip()
def get_prompt_expressiveness(faithfulness_result, lines, shared_prompt):
    """Build the prompt for the second, free-translation pass.

    Args:
        faithfulness_result: dict keyed by line number whose values carry the
            ``origin`` and ``direct`` fields of the first pass.
        lines: source subtitle lines, inserted verbatim as the prompt input.
        shared_prompt: context block inserted verbatim into the prompt.

    Returns:
        str: the stripped prompt. Its output template repeats every
        ``origin``/``direct`` pair and adds ``reflect`` and ``free`` slots.
    """
    TARGET_LANGUAGE = load_key("target_language")

    # Extend each first-pass entry with the two fields the model has to fill.
    template = {}
    for line_no, entry in faithfulness_result.items():
        template[line_no] = {
            "origin": entry["origin"],
            "direct": entry["direct"],
            "reflect": "your reflection on direct translation",
            "free": "your free translation",
        }
    json_block = json.dumps(template, indent=2, ensure_ascii=False)

    src_language = load_key("whisper.detected_language")
    prompt_expressiveness = f'''
## Role
You are a professional Netflix subtitle translator and language consultant.
Your expertise lies not only in accurately understanding the original {src_language} but also in optimizing the {TARGET_LANGUAGE} translation to better suit the target language's expression habits and cultural background.
## Task
We already have a direct translation version of the original {src_language} subtitles.
Your task is to reflect on and improve these direct translations to create more natural and fluent {TARGET_LANGUAGE} subtitles.
1. Analyze the direct translation results line by line, pointing out existing issues
2. Provide detailed modification suggestions
3. Perform free translation based on your analysis
4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read
5. Do not leave empty lines in the free translation, as the subtitles are for the audience to read
{shared_prompt}
Please use a two-step thinking process to handle the text line by line:
1. Direct Translation Reflection:
- Evaluate language fluency
- Check if the language style is consistent with the original text
- Check the conciseness of the subtitles, point out where the translation is too wordy
2. {TARGET_LANGUAGE} Free Translation:
- Aim for contextual smoothness and naturalness, conforming to {TARGET_LANGUAGE} expression habits
- Ensure it's easy for {TARGET_LANGUAGE} audience to understand and accept
- Adapt the language style to match the theme (e.g., use casual language for tutorials, professional terminology for technical content, formal language for documentaries)
## INPUT
{lines}
## Output in only JSON format and no other text
```json
{json_block}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
'''
    return prompt_expressiveness.strip()
## ================================================================
# @ step6_splitforsub.py
def get_align_prompt(src_sub, tr_sub, src_part):
    """Build the prompt that asks the LLM to split a translation like its source.

    Args:
        src_sub: the full source-language subtitle.
        tr_sub: the full translated subtitle.
        src_part: the source subtitle already split, parts separated by ``\\n``.

    Returns:
        str: the stripped prompt. Its output template contains one
        ``src_part_N`` / ``target_part_N`` pair per source part.
    """
    targ_lang = load_key("target_language")
    src_lang = load_key("whisper.detected_language")
    src_splits = src_part.split('\n')
    # In the prompt body the split points are shown inline as [br] markers.
    src_part = src_part.replace('\n', ' [br] ')
    # json.dumps escapes quotes/backslashes in the source text so the JSON
    # example shown to the model stays well-formed; plain text is unchanged.
    align_parts_json = ','.join(
        f'''
{{
"src_part_{idx}": {json.dumps(part, ensure_ascii=False)},
"target_part_{idx}": "Corresponding aligned {targ_lang} subtitle part"
}}''' for idx, part in enumerate(src_splits, 1)
    )
    align_prompt = f'''
## Role
You are a Netflix subtitle alignment expert fluent in both {src_lang} and {targ_lang}.
## Task
We have {src_lang} and {targ_lang} original subtitles for a Netflix program, as well as a pre-processed split version of {src_lang} subtitles.
Your task is to create the best splitting scheme for the {targ_lang} subtitles based on this information.
1. Analyze the word order and structural correspondence between {src_lang} and {targ_lang} subtitles
2. Split the {targ_lang} subtitles according to the pre-processed {src_lang} split version
3. Never leave empty lines. If it's difficult to split based on meaning, you may appropriately rewrite the sentences that need to be aligned
4. Do not add comments or explanations in the translation, as the subtitles are for the audience to read
## INPUT
{src_lang} Original: "{src_sub}"
{targ_lang} Original: "{tr_sub}"
Pre-processed {src_lang} Subtitles ([br] indicates split points): {src_part}
## Output in only JSON format and no other text
```json
{{
"analysis": "Brief analysis of word order, structure, and semantic correspondence between two subtitles",
"align": [
{align_parts_json}
]
}}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
'''.strip()
    return align_prompt
## ================================================================
# @ step8_gen_audio_task.py @ step10_gen_audio.py
def get_subtitle_trim_prompt(text, duration):
    """Build the prompt that asks the LLM to shorten a subtitle for dubbing.

    Args:
        text: the subtitle to shorten.
        duration: the available time, printed in the prompt as seconds.

    Returns:
        str: the stripped prompt. It asks for a JSON object with the keys
        ``analysis`` and ``result``.
    """
    # Shortening guidelines with worked examples, embedded verbatim in the prompt.
    rule = '''Consider a. Reducing filler words without modifying meaningful content. b. Omitting unnecessary modifiers or pronouns, for example:
- "Please explain your thought process" can be shortened to "Please explain thought process"
- "We need to carefully analyze this complex problem" can be shortened to "We need to analyze this problem"
- "Let's discuss the various different perspectives on this topic" can be shortened to "Let's discuss different perspectives on this topic"
- "Can you describe in detail your experience from yesterday" can be shortened to "Can you describe yesterday's experience" '''
    trim_prompt = f'''
## Role
You are a professional subtitle editor, editing and optimizing lengthy subtitles that exceed voiceover time before handing them to voice actors.
Your expertise lies in cleverly shortening subtitles slightly while ensuring the original meaning and structure remain unchanged.
## INPUT
Subtitle: "{text}"
Duration: {duration} seconds
## Processing Rules
{rule}
## Processing Steps
Please follow these steps and provide the results in the JSON output:
1. Analysis: Briefly analyze the subtitle's structure, key information, and filler words that can be omitted.
2. Trimming: Based on the rules and analysis, optimize the subtitle by making it more concise according to the processing rules.
## Output in only JSON format and no other text
```json
{{
"analysis": "Brief analysis of the subtitle, including structure, key information, and potential processing locations",
"result": "Optimized and shortened subtitle in the original subtitle language"
}}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
'''.strip()
    return trim_prompt
## ================================================================
# @ tts_main
def get_correct_text_prompt(text):
    """Build the prompt that asks the LLM to clean a text for TTS input.

    Args:
        text: the text to clean.

    Returns:
        str: the stripped prompt. It asks for a JSON object with a single
        ``text`` key.
    """
    return f'''
## Role
You are a text cleaning expert for TTS (Text-to-Speech) systems.
## Task
Clean the given text by:
1. Keep only basic punctuation (.,?!)
2. Preserve the original meaning
## INPUT
{text}
## Output in only JSON format and no other text
```json
{{
"text": "cleaned text here"
}}
```
Note: Start you answer with ```json and end with ```, do not add any other text.
'''.strip()
================================================
FILE: core/spacy_utils/__init__.py
================================================
from .split_by_comma import split_by_comma_main
from .split_by_connector import split_sentences_main
from .split_by_mark import split_by_mark
from .split_long_by_root import split_long_by_root_main
from .load_nlp_model import init_nlp
__all__ = [
"split_by_comma_main",
"split_sentences_main",
"split_by_mark",
"split_long_by_root_main",
"init_nlp"
]
================================================
FILE: core/spacy_utils/load_nlp_model.py
================================================
import spacy
from spacy.cli import download
from core.utils import rprint, load_key, except_handler
SPACY_MODEL_MAP = load_key("spacy_model_map")
def get_spacy_model(language: str):
    """Return the spaCy model name configured for ``language``.

    The lookup is case-insensitive. When the language has no entry in
    ``SPACY_MODEL_MAP`` a warning is printed and ``en_core_web_md`` is returned.

    Args:
        language: language code, e.g. ``"en"``.

    Returns:
        str: the spaCy model name.
    """
    # Normalize once so the lookup and the fallback warning agree; previously
    # "EN" resolved correctly but still printed the fallback warning.
    key = language.lower()
    if key not in SPACY_MODEL_MAP:
        rprint(f"[yellow]Spacy model does not support '{language}', using en_core_web_md model as fallback...[/yellow]")
        return "en_core_web_md"
    return SPACY_MODEL_MAP[key]
@except_handler("Failed to load NLP Spacy model")
def init_nlp():
    """Load the spaCy pipeline for the current source language.

    The language is ``"en"`` when ``whisper.language`` is configured as
    ``"en"``; otherwise ``whisper.detected_language`` is used. If loading the
    model fails, it is downloaded and loaded again.

    Returns:
        The loaded spaCy pipeline.
    """
    language = "en" if load_key("whisper.language") == "en" else load_key("whisper.detected_language")
    model = get_spacy_model(language)
    rprint(f"[blue]⏳ Loading NLP Spacy model: <{model}> ...[/blue]")
    try:
        nlp = spacy.load(model)
    except Exception:
        # Not a bare except: Ctrl-C / SystemExit must not trigger a download.
        rprint(f"[yellow]Downloading {model} model...[/yellow]")
        rprint("[yellow]If download failed, please check your network and try again.[/yellow]")
        download(model)
        nlp = spacy.load(model)
    rprint("[green]✅ NLP Spacy model loaded successfully![/green]")
    return nlp
# --------------------
# define the intermediate files
# --------------------
SPLIT_BY_COMMA_FILE = "output/log/split_by_comma.txt"
SPLIT_BY_CONNECTOR_FILE = "output/log/split_by_connector.txt"
SPLIT_BY_MARK_FILE = "output/log/split_by_mark.txt"
================================================
FILE: core/spacy_utils/split_by_comma.py
================================================
import itertools
import os
import warnings
from core.utils import *
from core.spacy_utils.load_nlp_model import init_nlp, SPLIT_BY_COMMA_FILE, SPLIT_BY_MARK_FILE
warnings.filterwarnings("ignore", category=FutureWarning)
def is_valid_phrase(phrase):
    """Return True when ``phrase`` contains both a subject-like token and a verb.

    A token counts as a subject when its dependency label is ``nsubj`` or
    ``nsubjpass`` or when it is a pronoun; a verb is any ``VERB`` or ``AUX``.
    """
    subject_deps = ("nsubj", "nsubjpass")
    verb_tags = ("VERB", "AUX")

    # 🔍 No subject means the phrase cannot qualify, whatever else it holds.
    if not any(tok.dep_ in subject_deps or tok.pos_ == "PRON" for tok in phrase):
        return False
    return any(tok.pos_ in verb_tags for tok in phrase)
def analyze_comma(start, doc, token):
    """Decide whether the sentence may be split at the comma ``token``.

    A split is allowed when the text after the comma forms a valid phrase and
    both sides are long enough: more than 3 non-punctuation tokens among the
    up-to-9 tokens on the left (not reaching before ``start``), and more than
    3 tokens on the right before the next punctuation mark.

    Returns:
        bool: True if the comma is a suitable split point.
    """
    comma_at = token.i
    left_phrase = doc[max(start, comma_at - 9):comma_at]
    right_phrase = doc[comma_at + 1:min(len(doc), comma_at + 10)]

    # Only the right-hand side has to look like a clause of its own.
    right_is_clause = is_valid_phrase(right_phrase)

    # 🚫 Count words while ignoring punctuation on the left ...
    left_count = sum(1 for tok in left_phrase if not tok.is_punct)
    # ... and only up to the first punctuation mark on the right.
    right_count = 0
    for tok in right_phrase:
        if tok.is_punct:
            break
        right_count += 1

    if left_count <= 3 or right_count <= 3:
        return False
    return right_is_clause
def split_by_comma(text, nlp):
    """Split ``text`` at commas that ``analyze_comma`` accepts as split points.

    Both the ASCII comma and the full-width comma (U+FF0C) are considered.

    Args:
        text: the sentence to split.
        nlp: a spaCy pipeline (callable returning a Doc).

    Returns:
        list[str]: the stripped pieces in order; a single-element list when no
        split was made. The comma at a split point is dropped.
    """
    doc = nlp(text)
    sentences = []
    start = 0
    for token in doc:
        # The full-width comma is written as an escape so it cannot be
        # silently normalized into a second ASCII comma.
        if token.text in (",", "\uff0c"):
            if analyze_comma(start, doc, token):
                sentences.append(doc[start:token.i].text.strip())
                rprint(f"[yellow]✂️ Split at comma: {doc[start:token.i][-4:]},| {doc[token.i + 1:][:4]}[/yellow]")
                start = token.i + 1
    # Whatever follows the last split (or the whole text) is the final piece.
    sentences.append(doc[start:].text.strip())
    return sentences
def split_by_comma_main(nlp):
    """Run comma splitting over the mark-split file and write the result.

    Reads ``SPLIT_BY_MARK_FILE`` line by line, splits every line with
    ``split_by_comma``, writes one piece per line to ``SPLIT_BY_COMMA_FILE``
    and then deletes the input file.
    """
    with open(SPLIT_BY_MARK_FILE, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()

    pieces = [
        piece
        for raw_line in raw_lines
        for piece in split_by_comma(raw_line.strip(), nlp)
    ]

    with open(SPLIT_BY_COMMA_FILE, "w", encoding="utf-8") as target:
        target.writelines(piece + "\n" for piece in pieces)

    # The intermediate input is no longer needed once the output is written.
    os.remove(SPLIT_BY_MARK_FILE)
    rprint(f"[green]💾 Sentences split by commas saved to → `{SPLIT_BY_COMMA_FILE}`[/green]")
# Running this module directly performs the comma-splitting step on its own.
if __name__ == "__main__":
    nlp = init_nlp()
    split_by_comma_main(nlp)
    # Disabled ad-hoc check on a single example sentence:
    # nlp = init_nlp()
    # test = "So in the same frame, right there, almost in the exact same spot on the ice, Brown has committed himself, whereas McDavid has not."
    # print(split_by_comma(test, nlp))
================================================
FILE: core/spacy_utils/split_by_connector.py
================================================
import os
import warnings
from core.spacy_utils.load_nlp_model import init_nlp, SPLIT_BY_COMMA_FILE, SPLIT_BY_CONNECTOR_FILE
from core.utils import rprint
warnings.filterwarnings("ignore", category=FutureWarning)
def analyze_connectors(doc, token):
    """Decide whether a sentence should be split in front of ``token``.

    Rules, applied in order:
    1. Unsupported document languages and tokens that are not in the
       language's connector list never split.
    2. English "that" splits only when it is a ``mark`` dependent of a verb.
    3. Any other connector does not split when it acts as a determiner/pronoun
       (per-language dependency labels) attached to a noun or proper noun.
    4. Every remaining connector splits.

    Returns:
        tuple[bool, bool]: ``(split_before, split_after)``; the second element
        is always False.
    """
    # language -> (connector words, dependency labels that mean "modifies a noun")
    rules_by_language = {
        "en": (["that", "which", "where", "when", "because", "but", "and", "or"], ["det", "pron"]),
        "zh": (["因为", "所以", "但是", "而且", "虽然", "如果", "即使", "尽管"], ["det", "pron"]),
        "ja": (["けれども", "しかし", "だから", "それで", "ので", "のに", "ため"], ["case"]),
        "fr": (["que", "qui", "où", "quand", "parce que", "mais", "et", "ou"], ["det", "pron"]),
        "ru": (["что", "который", "где", "когда", "потому что", "но", "и", "или"], ["det"]),
        "es": (["que", "cual", "donde", "cuando", "porque", "pero", "y", "o"], ["det", "pron"]),
        "de": (["dass", "welche", "wo", "wann", "weil", "aber", "und", "oder"], ["det", "pron"]),
        "it": (["che", "quale", "dove", "quando", "perché", "ma", "e", "o"], ["det", "pron"]),
    }
    # These labels are the same for every supported language.
    mark_dep = "mark"
    verb_pos = "VERB"
    noun_pos = ["NOUN", "PROPN"]

    lang = doc.lang_
    if lang not in rules_by_language:
        return False, False
    connectors, det_pron_deps = rules_by_language[lang]

    word = token.text.lower()
    if word not in connectors:
        return False, False

    if lang == "en" and word == "that":
        introduces_clause = token.dep_ == mark_dep and token.head.pos_ == verb_pos
        return introduces_clause, False

    if token.dep_ in det_pron_deps and token.head.pos_ in noun_pos:
        return False, False
    return True, False
def split_by_connectors(text, context_words=5, nlp=None):
    """Cut ``text`` in front of connector words, at most one cut per sentence per pass.

    Every pass re-parses each current sentence and cuts it before the first
    token that ``analyze_connectors`` flags, provided that token has enough
    context on both sides. Passes repeat until one of them makes no cut.

    Args:
        text: Sentence to split.
        context_words: Size of the token window inspected on each side of a
            candidate connector. A cut is only made when, after dropping
            punctuation tokens, each window still holds this many words.
        nlp: Callable that parses a string into a doc (presumably a spaCy
            pipeline). It is called unconditionally, so the ``None`` default
            must be overridden by the caller.

    Returns:
        List of sentence strings in their original order.
    """
    contraction_suffixes = ["'s", "'re", "'ve", "'ll", "'d"]
    pieces = [nlp(text).text]  # start with the whole text as a single piece

    while True:
        # One cut per piece per pass, so a sentence is never shredded into
        # several fragments in a single step.
        made_cut = False
        next_pieces = []

        for piece in pieces:
            parsed = nlp(piece)
            cut_from = 0

            for idx, tok in enumerate(parsed):
                should_cut, _ = analyze_connectors(parsed, tok)

                # Never cut in front of a token that is followed by a contraction suffix.
                if idx + 1 < len(parsed) and parsed[idx + 1].text in contraction_suffixes:
                    continue

                window_before = parsed[max(0, tok.i - context_words):tok.i]
                window_after = parsed[tok.i+1:min(len(parsed), tok.i + context_words + 1)]
                before = [w.text for w in window_before if not w.is_punct]
                after = [w.text for w in window_after if not w.is_punct]

                if len(before) < context_words or len(after) < context_words or not should_cut:
                    continue

                rprint(f"[yellow]✂️ Split before '{tok.text}': {' '.join(before)}| {tok.text} {' '.join(after)}[/yellow]")
                next_pieces.append(parsed[cut_from:tok.i].text.strip())
                cut_from = tok.i
                made_cut = True
                break

            # Keep whatever follows the cut (or the whole piece when no cut was made).
            if cut_from < len(parsed):
                next_pieces.append(parsed[cut_from:].text.strip())

        if not made_cut:
            return pieces

        pieces = next_pieces
def split_sentences_main(nlp):
    """Split every comma-split sentence at connectors and persist the result.

    Reads ``SPLIT_BY_COMMA_FILE`` line by line, runs ``split_by_connectors`` on
    each stripped line, writes the resulting pieces to
    ``SPLIT_BY_CONNECTOR_FILE`` (one per line, no trailing newline) and finally
    deletes ``SPLIT_BY_COMMA_FILE``.

    Args:
        nlp: Parser callable forwarded to ``split_by_connectors``.
    """
    with open(SPLIT_BY_COMMA_FILE, "r", encoding="utf-8") as input_file:
        sentences = input_file.readlines()

    all_split_sentences = []
    for sentence in sentences:
        all_split_sentences.extend(split_by_connectors(sentence.strip(), nlp=nlp))

    # Joining in memory yields "no newline at the end of the file" directly.
    # The previous seek(tell() - 1) + truncate() approach is not a supported
    # operation on text-mode files, leaves a stray "\r" when newlines are
    # translated to "\r\n", and raised on an empty input file.
    with open(SPLIT_BY_CONNECTOR_FILE, "w", encoding="utf-8") as output_file:
        output_file.write("\n".join(all_split_sentences))

    # delete the original file
    os.remove(SPLIT_BY_COMMA_FILE)

    rprint(f"[green]💾 Sentences split by connectors saved to → `{SPLIT_BY_CONNECTOR_FILE}`[/green]")
if __name__ == "__main__":
    # Script entry point: build the NLP pipeline and run the connector split.
    split_sentences_main(init_nlp())
# nlp = init_nlp()
# a = "and show the specific differences that make a difference between a breakaway that results in a goal in the NHL versus one that doesn't."
# print(split_by_connectors(a, nlp))
================================================
FILE: core/spacy_utils/split_by_mark.py
================================================
import os
import pandas as pd
import warnings
from core.spacy_utils.load_nlp_model import init_nlp, SPLIT_BY_MARK_FILE
from core.utils.config_utils import load_key, get_joiner
from rich import print as rprint
warnings.filterwarnings("ignore", category=FutureWarning)
def split_by_mark(nlp):
    """Split the transcribed text into sentences and save them to SPLIT_BY_MARK_FILE.

    The chunk texts from ``output/log/cleaned_chunks.xlsx`` are joined with the
    language-specific joiner and parsed with ``nlp``; the parser's sentence
    boundaries are then post-processed:

    * a sentence that starts with ``-`` or ``...``, or follows one that ends
      with ``-`` or ``...``, is merged into the preceding sentence;
    * a sentence consisting of a single punctuation mark is appended to the
      previous output line instead of getting its own line.

    Args:
        nlp: Parser callable; the returned doc must carry sentence boundaries.
    """
    whisper_language = load_key("whisper.language")
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language  # consider force english case
    joiner = get_joiner(language)
    rprint(f"[blue]🔍 Using {language} language joiner: '{joiner}'[/blue]")

    chunks = pd.read_excel("output/log/cleaned_chunks.xlsx")
    # Drop the double quotes wrapped around each cell.
    chunks.text = chunks.text.apply(lambda x: x.strip('"'))

    # join with joiner
    input_text = joiner.join(chunks.text.to_list())

    doc = nlp(input_text)
    assert doc.has_annotation("SENT_START")

    # Do not treat "-" and "..." as sentence boundaries.
    sentences_by_mark = []
    current_sentence = []

    for sent in doc.sents:
        text = sent.text.strip()

        continues_previous = bool(current_sentence) and (
            text.startswith('-') or
            text.startswith('...') or
            current_sentence[-1].endswith('-') or
            current_sentence[-1].endswith('...')
        )
        if not continues_previous and current_sentence:
            sentences_by_mark.append(' '.join(current_sentence))
            current_sentence = []
        current_sentence.append(text)

    # add the last sentence
    if current_sentence:
        sentences_by_mark.append(' '.join(current_sentence))

    # Lines made of one punctuation mark show up in Chinese, Japanese, etc.;
    # both the ASCII and the full-width forms are recognised.
    punctuation_only = {',', '.', '?', '!', '，', '。', '？', '！'}

    # Merge in memory rather than seeking backwards in a text-mode file, and
    # always terminate the merged line so the next sentence starts on its own line.
    lines = []
    for sentence in sentences_by_mark:
        if lines and sentence.strip() in punctuation_only:
            lines[-1] += sentence
        else:
            lines.append(sentence)

    with open(SPLIT_BY_MARK_FILE, "w", encoding="utf-8") as output_file:
        for line in lines:
            output_file.write(line + "\n")

    rprint(f"[green]💾 Sentences split by punctuation marks saved to → `{SPLIT_BY_MARK_FILE}`[/green]")
if __name__ == "__main__":
    # Script entry point: build the NLP pipeline and run the punctuation split.
    split_by_mark(init_nlp())
================================================
FILE: core/spacy_utils/split_long_by_root.py
================================================
import os
import string
import warnings
from core.spacy_utils.load_nlp_model import init_nlp, SPLIT_BY_CONNECTOR_FILE
from core.utils import *
from core.utils.models import _3_1_SPLIT_BY_NLP
warnings.filterwarnings("ignore", category=FutureWarning)
def split_long_sentence(doc):
    """Split a parsed sentence into as few segments as the constraints allow.

    A dynamic program over token positions: ``fewest[k]`` is the minimum number
    of segments covering the first ``k`` tokens and ``origin[k]`` is where the
    last of those segments starts. A segment must span at least 30 tokens, its
    start is searched at most 100 tokens back, and unless it starts at token 0
    its final token must be a sentence end, a VERB/AUX, or the dependency ROOT.

    Args:
        doc: Parsed sentence; iterated and indexed for tokens exposing
            ``text``, ``is_sent_end``, ``pos_`` and ``dep_``.

    Returns:
        Segment strings in original order, each built by joining token texts
        with the language joiner and stripping the result.
    """
    words = [tok.text for tok in doc]
    total = len(words)

    fewest = [0] + [float('inf')] * total
    origin = [0] * (total + 1)

    for end in range(1, total + 1):
        # Candidate starts lie within the last 100 tokens and leave >= 30 tokens.
        for begin in range(max(0, end - 100), end - 29):
            last = doc[end - 1]
            allowed = begin == 0 or last.is_sent_end or last.pos_ in ['VERB', 'AUX'] or last.dep_ == 'ROOT'
            if allowed and fewest[begin] + 1 < fewest[end]:
                fewest[end] = fewest[begin] + 1
                origin[end] = begin

    whisper_language = load_key("whisper.language")
    if whisper_language == 'auto':
        language = load_key("whisper.detected_language")
    else:
        language = whisper_language  # consider force english case
    joiner = get_joiner(language)

    # Walk the recorded split points backwards from the end of the sentence.
    segments = []
    end = total
    while end > 0:
        begin = origin[end]
        segments.append(joiner.join(words[begin:end]).strip())
        end = begin

    segments.reverse()  # restore original order
    return segments
def split_extremely_long_sentence(doc):
    """Chop a parsed sentence into roughly equal parts of about 60 tokens or fewer.

    The number of parts is ``ceil(n / 60)``; every part gets ``n // parts``
    tokens and the last part also takes the remainder.

    Args:
        doc: Iterable of tokens exposing ``text``.

    Returns:
        List of part strings, each built by joining token texts with the
        language joiner. An empty ``doc`` yields an empty list.
    """
    tokens = [token.text for token in doc]
    n = len(tokens)
    if n == 0:
        # Without this guard num_parts would be 0 and the division below would fail.
        return []

    num_parts = (n + 59) // 60  # round up
    part_length = n // num_parts

    whisper_language = load_key("whisper.language")
    language = load_key("whisper.detected_language") if whisper_language == 'auto' else whisper_language  # consider force english case
    joiner = get_joiner(language)

    sentences = []
    for i in range(num_parts):
        start = i * part_length
        # The final part absorbs the tokens left over by the integer division.
        end = start + part_length if i < num_parts - 1 else n
        sentences.append(joiner.join(tokens[start:end]))

    return sentences
def split_long_by_root_main(nlp):
    """Break over-long sentences and write the final NLP split result.

    Reads ``SPLIT_BY_CONNECTOR_FILE``; every line longer than 60 tokens is
    split with ``split_long_sentence`` and, if any piece is still longer than
    60 tokens, all pieces are split again with
    ``split_extremely_long_sentence``. Empty or punctuation-only lines are
    folded into the previous output line. The result is written to
    ``_3_1_SPLIT_BY_NLP`` and ``SPLIT_BY_CONNECTOR_FILE`` is deleted.

    Args:
        nlp: Parser callable used to tokenise each sentence.
    """
    with open(SPLIT_BY_CONNECTOR_FILE, "r", encoding="utf-8") as input_file:
        sentences = input_file.readlines()

    all_split_sentences = []
    for sentence in sentences:
        doc = nlp(sentence.strip())
        if len(doc) > 60:
            split_sentences = split_long_sentence(doc)
            if any(len(nlp(sent)) > 60 for sent in split_sentences):
                split_sentences = [subsent for sent in split_sentences for subsent in split_extremely_long_sentence(nlp(sent))]
            all_split_sentences.extend(split_sentences)
            rprint(f"[yellow]✂️ Splitting long sentences by root: {sentence[:30]}...[/yellow]")
        else:
            all_split_sentences.append(sentence.strip())

    punctuation = string.punctuation + "'" + '"'  # include all punctuation and apostrophe ' and "

    # Resolve the merges before anything is written: appending to an entry
    # that has already been written to the file would silently lose the text.
    output_lines = []
    for i, sentence in enumerate(all_split_sentences):
        stripped_sentence = sentence.strip()
        if not stripped_sentence or all(char in punctuation for char in stripped_sentence):
            rprint(f"[yellow]⚠️ Warning: Empty or punctuation-only line detected at index {i}[/yellow]")
            if output_lines:
                output_lines[-1] += stripped_sentence
            continue
        output_lines.append(sentence)

    with open(_3_1_SPLIT_BY_NLP, "w", encoding="utf-8") as output_file:
        for line in output_lines:
            output_file.write(line + "\n")

    # delete the original file
    os.remove(SPLIT_BY_CONNECTOR_FILE)

    rprint(f"[green]💾 Long sentences split by root saved to → {_3_1_SPLIT_BY_NLP}[/green]")
if __name__ == "__main__":
    # Script entry point: build the NLP pipeline and run the long-sentence split.
    split_long_by_root_main(init_nlp())
# raw = "平口さんの盛り上げごまが初めて売れました本当に嬉しいです本当にやっぱり見た瞬間いいって言ってくれるそういうコマを作るのがやっぱりいいですよねその2ヶ月後チコさんが何やらそわそわしていましたなんか気持ち悪いやってきたのは平口さんの駒の評判を聞きつけた愛知県の収集家ですこの男性師匠大沢さんの駒も持っているといいますちょっと褒めすぎかなでも確実にファンは広がっているようです自信がない部分をすごく感じてたのでこれで自信を持って進んでくれるなっていう本当に始まったばっかりこれからいろいろ挑戦していってくれるといいなと思って今月平口さんはある場所を訪れましたこれまで数々のタイトル戦でコマを提供してきた老舗5番手平口さんのコマを扱いたいと言いますいいですねぇ困ってだんだん成長しますので大切に使ってそういう長く良い駒になる駒ですね商談が終わった後店主があるものを取り出しましたこの前の名人戦で使った駒があるんですけど去年、名人銭で使われた盛り上げごま低く盛り上げて品良くするというのは難しい素晴らしいですね平口さんが目指す高みですこういった感じで作れればまだまだですけどただ、多分、咲く。"
# nlp = init_nlp()
# doc = nlp(raw.strip())
# for sent in split_still_long_sentence(doc):
# print(sent, '\n==========')
================================================
FILE: core/st_utils/__init__.py
================================================
================================================
FILE: core/st_utils/download_video_section.py
================================================
import os
import re
import shutil
import subprocess
from time import sleep
import streamlit as st
from core._1_ytdlp import download_video_ytdlp, find_video_files
from core.utils import *
from translations.translations import translate as t
OUTPUT_DIR = "output"
def download_video_section():
    """Render the "download or upload video" panel.

    When ``find_video_files`` returns a video it is previewed together with a
    "Delete and Reselect" button. Otherwise the panel offers a URL download
    (with resolution picker) and a file uploader; uploaded audio files are
    converted to a black-screen video.

    Returns:
        True when a video is already available, False when the panel is
        waiting for a download or an upload.
    """
    st.header(t("a. Download or Upload Video"))
    with st.container(border=True):
        try:
            video_file = find_video_files()
            st.video(video_file)
            if st.button(t("Delete and Reselect"), key="delete_video_button"):
                os.remove(video_file)
                if os.path.exists(OUTPUT_DIR):
                    shutil.rmtree(OUTPUT_DIR)
                sleep(1)
                st.rerun()
            return True
        # Catch Exception only: a bare `except:` also intercepts the
        # control-flow exception st.rerun() raises to restart the script.
        except Exception:
            col1, col2 = st.columns([3, 1])
            with col1:
                url = st.text_input(t("Enter YouTube link:"))
            with col2:
                res_dict = {
                    "360p": "360",
                    "1080p": "1080",
                    "Best": "best"
                }
                target_res = load_key("ytb_resolution")
                res_options = list(res_dict.keys())
                # Fall back to the first option when the configured value is unknown.
                default_idx = list(res_dict.values()).index(target_res) if target_res in res_dict.values() else 0
                res_display = st.selectbox(t("Resolution"), options=res_options, index=default_idx)
                res = res_dict[res_display]
            if st.button(t("Download Video"), key="download_button", width="stretch"):
                if url:
                    with st.spinner("Downloading video..."):
                        download_video_ytdlp(url, resolution=res)
                    st.rerun()

            uploaded_file = st.file_uploader(t("Or upload video"), type=load_key("allowed_video_formats") + load_key("allowed_audio_formats"))
            if uploaded_file:
                # Start from a clean output directory for the new upload.
                if os.path.exists(OUTPUT_DIR):
                    shutil.rmtree(OUTPUT_DIR)
                os.makedirs(OUTPUT_DIR, exist_ok=True)

                # Sanitise the file name: spaces -> "_", drop anything that is
                # not a word character, "-", "_" or ".".
                raw_name = uploaded_file.name.replace(' ', '_')
                name, ext = os.path.splitext(raw_name)
                clean_name = re.sub(r'[^\w\-_\.]', '', name) + ext.lower()

                with open(os.path.join(OUTPUT_DIR, clean_name), "wb") as f:
                    f.write(uploaded_file.getbuffer())

                # os.path.splitext keeps the leading dot while the configured
                # formats may be listed with or without it, so compare both
                # sides without the dot.
                audio_formats = {fmt.lower().lstrip('.') for fmt in load_key("allowed_audio_formats")}
                if ext.lower().lstrip('.') in audio_formats:
                    convert_audio_to_video(os.path.join(OUTPUT_DIR, clean_name))
                st.rerun()
            else:
                return False
def convert_audio_to_video(audio_file: str) -> str:
    """Wrap an audio file in a black 640x360 video using FFmpeg.

    Nothing is done when ``black_screen.mp4`` already exists in the output
    directory. Otherwise FFmpeg is run and, once it succeeds, the source audio
    file is deleted.

    Args:
        audio_file: Path of the audio file to convert.

    Returns:
        Path of the black-screen video.

    Raises:
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    output_video = os.path.join(OUTPUT_DIR, 'black_screen.mp4')
    if os.path.exists(output_video):
        return output_video

    print(f"🎵➡️🎬 Converting audio to video with FFmpeg ......")

    black_frame_source = ['-f', 'lavfi', '-i', 'color=c=black:s=640x360']
    codec_options = ['-c:v', 'libx264', '-c:a', 'aac', '-pix_fmt', 'yuv420p']
    # -shortest ends the output when the shorter input (the audio) ends.
    ffmpeg_cmd = ['ffmpeg', '-y', *black_frame_source, '-i', audio_file, '-shortest', *codec_options, output_video]
    subprocess.run(ffmpeg_cmd, check=True, capture_output=True, text=True, encoding='utf-8')

    print(f"🎵➡️🎬 Converted <{audio_file}> to <{output_video}> with FFmpeg\n")

    # The audio is no longer needed once the video exists.
    os.remove(audio_file)
    return output_video
================================================
FILE: core/st_utils/imports_and_utils.py
================================================
import os
import streamlit as st
import io, zipfile
from core.st_utils.download_video_section import download_video_section
from core.st_utils.sidebar_setting import page_setting
from translations.translations import translate as t
def download_subtitle_zip_button(text: str):
    """Show a download button that serves every ``.srt`` file in ``output`` as one zip.

    Args:
        text: Label displayed on the button.
    """
    output_dir = "output"
    archive = io.BytesIO()

    with zipfile.ZipFile(archive, "w") as bundle:
        for srt_name in (entry for entry in os.listdir(output_dir) if entry.endswith(".srt")):
            with open(os.path.join(output_dir, srt_name), "rb") as srt_file:
                bundle.writestr(srt_name, srt_file.read())

    # Rewind so the button reads the archive from its first byte.
    archive.seek(0)

    st.download_button(label=text, data=archive, file_name="subtitles.zip", mime="application/zip")
# st.markdown
give_star_button = """
### Language Support
**Input Language Support(more to come):**
🇺🇸 English 🤩 | 🇷🇺 Russian 😊 | 🇫🇷 French 🤩 | 🇩🇪 German 🤩 | 🇮🇹 Italian 🤩 | 🇪🇸 Spanish 🤩 | 🇯🇵 Japanese 😐 | 🇨🇳 Chinese* 😊
> *Chinese uses a separate punctuation-enhanced whisper model, for now...
**Translation supports all languages, while dubbing language depends on the chosen TTS method.**
## Installation
> **Note:** To use NVIDIA GPU acceleration on Windows, please complete the following steps first:
> 1. Install [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe) or newer (12.8 / 13.x all work — the install script auto-adapts)
> 2. Install [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. Add `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` to your system PATH
> 4. Restart your computer
> **Note:** For Windows and macOS users, it's recommended to install FFmpeg via package managers (Chocolatey/Homebrew):
> ```choco install ffmpeg``` (Windows) or ```brew install ffmpeg``` (macOS).
> ⚠️ Do NOT use conda-forge ffmpeg (lacks libmp3lame encoder). Use the system package manager to install a full build.
1. Clone the repository
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. Install dependencies(requires `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. Start the application
```bash
streamlit run st.py
```
### Docker
Alternatively, you can use Docker (requires CUDA 12.4 and NVIDIA Driver version >550), see [Docker docs](/docs/pages/docs/docker.en-US.md):
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## API
The project supports OpenAI-Like API format and various dubbing interfaces:
- `claude-sonnet-4-6`, `gpt-5.2`, `gemini-3-flash`, `deepseek-v3`, `minimax-m2.5`, `kimi-k2.5`, ... (sorted by performance)
- `azure-tts`, `openai-tts`, `siliconflow-fishtts`, `fish-tts`, `GPT-SoVITS`
For detailed installation, API configuration, and batch mode instructions, please refer to the documentation: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md)
## Current Limitations
1. WhisperX transcription performance may be affected by video background noise, as it uses the wav2vec model for alignment. For videos with loud background music, please enable Voice Separation Enhancement. Additionally, subtitles ending with numbers or special characters may be truncated early due to wav2vec's inability to map numeric characters (e.g., "1") to their spoken form ("one").
2. Using weaker models can lead to errors during intermediate processes due to strict JSON format requirements for responses. If this error occurs, please delete the `output` folder and retry with a different LLM, otherwise repeated execution will read the previous erroneous response causing the same error.
3. The dubbing feature may not be 100% perfect due to differences in speech rates and intonation between languages, as well as the impact of the translation step. However, this project has implemented extensive engineering processing for speech rates to ensure the best possible dubbing results.
4. **Multilingual video transcription recognition will only retain the main language**. This is because whisperX uses a specialized model for a single language when forcibly aligning word-level subtitles, and will delete unrecognized languages.
5. **Cannot dub multiple characters separately**, as whisperX's speaker distinction capability is not sufficiently reliable.
## 📄 License
This project is licensed under the Apache 2.0 License. Special thanks to the following open source projects for their contributions:
[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE)
## 📬 Contact Us
- Join our Discord: https://discord.gg/9F2G92CWPp
- Submit [Issues](https://github.com/Huanshere/VideoLingo/issues) or [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) on GitHub
- Follow me on Twitter: [@Huanshere](https://twitter.com/Huanshere)
- Email me at: team@videolingo.io
## ⭐ Star History
[Star History Chart](https://star-history.com/#Huanshere/VideoLingo&Timeline)
---
If you find VideoLingo helpful, please give us a ⭐️!
1. {t("WhisperX word-level transcription")}
2. {t("Sentence segmentation using NLP and LLM")}
3. {t("Summarization and multi-step translation")}
4. {t("Cutting and aligning long subtitles")}
5. {t("Generating timeline and subtitles")}
6. {t("Merging subtitles into the video")}
""", unsafe_allow_html=True)
if not os.path.exists(SUB_VIDEO):
if st.button(t("Start Processing Subtitles"), key="text_processing_button"):
process_text()
st.rerun()
else:
if load_key("burn_subtitles"):
st.video(SUB_VIDEO)
download_subtitle_zip_button(text=t("Download All Srt Files"))
if st.button(t("Archive to 'history'"), key="cleanup_in_text_processing"):
cleanup()
st.rerun()
return True
def process_text():
    """Run the subtitle pipeline stage by stage, each stage under its own spinner.

    Order: transcription, sentence splitting (NLP, then by meaning), summary
    and translation, subtitle splitting and timestamp alignment, and finally
    merging the subtitles into the video. A success message and balloons are
    shown at the end.
    """
    def run_stage(message, *steps):
        # Show the translated message while the stage's steps run in order.
        with st.spinner(t(message)):
            for step in steps:
                step()

    def summarize_then_translate():
        _4_1_summarize.get_summary()
        if load_key("pause_before_translate"):
            # input() blocks on the process's standard input until ENTER is pressed.
            input(t("⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue..."))
        _4_2_translate.translate_all()

    run_stage("Using Whisper for transcription...", _2_asr.transcribe)
    run_stage("Splitting long sentences...", _3_1_split_nlp.split_by_spacy, _3_2_split_meaning.split_sentences_by_meaning)
    run_stage("Summarizing and translating...", summarize_then_translate)
    run_stage("Processing and aligning subtitles...", _5_split_sub.split_for_sub_main, _6_gen_sub.align_timestamp_main)
    run_stage("Merging subtitles to video...", _7_sub_into_vid.merge_subtitles_to_video)

    st.success(t("Subtitle processing complete! 🎉"))
    st.balloons()
def audio_processing_section():
    """Render the dubbing panel.

    Lists the dubbing steps, then either offers a button that starts audio
    processing (when ``DUB_VIDEO`` does not exist yet) or shows the finished
    result together with buttons to delete the dubbing files or archive the run.
    """
    st.header(t("c. Dubbing"))
    with st.container(border=True):
        # Same text as a multi-line template: leading and trailing newline,
        # intro line, then the four numbered steps.
        step_list = "\n".join([
            "",
            f'{t("This stage includes the following steps:")}',
            f'1. {t("Generate audio tasks and chunks")}',
            f'2. {t("Extract reference audio")}',
            f'3. {t("Generate and merge audio files")}',
            f'4. {t("Merge final audio into video")}',
            "",
        ])
        st.markdown(step_list, unsafe_allow_html=True)

        if os.path.exists(DUB_VIDEO):
            st.success(t("Audio processing is complete! You can check the audio files in the `output` folder."))
            if load_key("burn_subtitles"):
                st.video(DUB_VIDEO)
            if st.button(t("Delete dubbing files"), key="delete_dubbing_files"):
                delete_dubbing_files()
                st.rerun()
            if st.button(t("Archive to 'history'"), key="cleanup_in_audio_processing"):
                cleanup()
                st.rerun()
        elif st.button(t("Start Audio Processing"), key="audio_processing_button"):
            process_audio()
            st.rerun()
def process_audio():
    """Run the dubbing pipeline stage by stage, each stage under its own spinner.

    Order: audio tasks and dub chunks, reference-audio extraction, audio
    generation, full-audio merge, and merging the dubbing into the video.
    A success message and balloons are shown at the end.
    """
    def run_stage(message, *steps):
        # Show the translated message while the stage's steps run in order.
        with st.spinner(t(message)):
            for step in steps:
                step()

    run_stage("Generate audio tasks", _8_1_audio_task.gen_audio_task_main, _8_2_dub_chunks.gen_dub_chunks)
    run_stage("Extract refer audio", _9_refer_audio.extract_refer_audio_main)
    run_stage("Generate all audio", _10_gen_audio.gen_audio)
    run_stage("Merge full audio", _11_merge_audio.merge_full_audio)
    run_stage("Merge dubbing to the video", _12_dub_to_vid.merge_video_audio)

    st.success(t("Audio processing complete! 🎇"))
    st.balloons()
def main():
logo_col, _ = st.columns([1,1])
with logo_col:
st.image("docs/logo.png", width="stretch")
st.markdown(button_style, unsafe_allow_html=True)
welcome_text = t("Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!")
st.markdown(f"
# Conectando el Mundo, Cuadro por Cuadro
[**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Descripción General ([¡Prueba VL Gratis!](https://videolingo.io))
VideoLingo es una herramienta todo en uno para traducción, localización y doblaje de videos, diseñada para generar subtítulos de calidad Netflix. Elimina las traducciones mecánicas y los subtítulos de múltiples líneas mientras agrega doblaje de alta calidad, permitiendo compartir conocimiento globalmente a través de las barreras del idioma.
Características principales:
- 🎥 Descarga de videos de YouTube mediante yt-dlp
- **🎙️ Reconocimiento de subtítulos a nivel de palabra y con pocas alucinaciones mediante WhisperX**
- **📝 Segmentación de subtítulos impulsada por NLP e IA**
- **📚 Terminología personalizada + generada por IA para una traducción coherente**
- **🔄 Proceso de 3 pasos Traducción-Reflexión-Adaptación para calidad cinematográfica**
- **✅ Solo subtítulos de una línea, estándar Netflix**
- **🗣️ Doblaje con GPT-SoVITS, Azure, OpenAI y más**
- 🚀 Inicio y procesamiento con un clic en Streamlit
- 🌍 Soporte multilingüe en la interfaz de Streamlit
- 📝 Registro detallado con reanudación de progreso
Diferencia con proyectos similares: **Solo subtítulos de una línea, calidad superior de traducción, experiencia de doblaje perfecta**
## 🎥 Demo
### Clonación de Voz Cosy2
---
https://github.com/user-attachments/assets/e065fe4c-3694-477f-b4d6-316917df7c0a
### GPT-SoVITS con mi voz
---
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Soporte de Idiomas
**Soporte de idiomas de entrada (más por venir):**
🇺🇸 Inglés 🤩 | 🇷🇺 Ruso 😊 | 🇫🇷 Francés 🤩 | 🇩🇪 Alemán 🤩 | 🇮🇹 Italiano 🤩 | 🇪🇸 Español 🤩 | 🇯🇵 Japonés 😐 | 🇨🇳 Chino* 😊
> *El chino utiliza un modelo whisper mejorado con puntuación por ahora...
**La traducción admite todos los idiomas, mientras que el idioma del doblaje depende del método TTS elegido.**
## Instalación
¿Tienes algún problema? Chatea con nuestro agente de IA en línea gratuito [**aquí**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) para ayudarte.
> **Nota:** Para usuarios de Windows con GPU NVIDIA, sigue estos pasos antes de la instalación:
> 1. Instala [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
> 2. Instala [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. Agrega `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` a tu PATH del sistema
> 4. Reinicia tu computadora
> **Nota:** Se requiere FFmpeg. Por favor, instálalo a través de gestores de paquetes:
> - Windows: ```choco install ffmpeg``` (vía [Chocolatey](https://chocolatey.org/))
> - macOS: ```brew install ffmpeg``` (vía [Homebrew](https://brew.sh/))
> - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu)
1. Clona el repositorio
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. Instala las dependencias (requiere `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. Inicia la aplicación
```bash
streamlit run st.py
```
### Docker
Alternativamente, puedes usar Docker (requiere CUDA 12.4 y versión del controlador NVIDIA >550), consulta la [documentación de Docker](/docs/pages/docs/docker.en-US.md):
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## APIs
VideoLingo admite formato de API similar a OpenAI y varias interfaces TTS:
- LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (ordenados por rendimiento, ten cuidado con gemini-2.5-flash...)
- WhisperX: Ejecuta whisperX localmente o usa la API de 302.ai
- TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(¡Puedes modificar tu propio TTS en custom_tts.py!)
> **Nota:** VideoLingo funciona con **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - una clave API para todos los servicios (LLM, WhisperX, TTS). ¡O ejecútalo localmente con Ollama y Edge-TTS gratis, sin necesidad de API!
Para instrucciones detalladas de instalación, configuración de API y modo por lotes, consulta la documentación: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md)
## Limitaciones Actuales
1. El rendimiento de transcripción de WhisperX puede verse afectado por el ruido de fondo del video, ya que utiliza el modelo wav2vec para la alineación. Para videos con música de fondo fuerte, activa la Mejora de Separación de Voz. Además, los subtítulos que terminan con números o caracteres especiales pueden truncarse temprano debido a la incapacidad de wav2vec para mapear caracteres numéricos (por ejemplo, "1") a su forma hablada ("uno").
2. El uso de modelos más débiles puede provocar errores durante los procesos intermedios debido a los estrictos requisitos de formato JSON para las respuestas. Si ocurre este error, elimina la carpeta `output` y vuelve a intentarlo con un LLM diferente, de lo contrario, la ejecución repetida leerá la respuesta errónea anterior causando el mismo error.
3. La función de doblaje puede no ser 100% perfecta debido a las diferencias en las velocidades de habla y entonación entre idiomas, así como al impacto del paso de traducción. Sin embargo, este proyecto ha implementado un extenso procesamiento de ingeniería para las velocidades de habla para garantizar los mejores resultados posibles de doblaje.
4. **El reconocimiento de transcripción de video multilingüe solo mantendrá el idioma principal**. Esto se debe a que whisperX utiliza un modelo especializado para un solo idioma al alinear forzosamente los subtítulos a nivel de palabra, y eliminará los idiomas no reconocidos.
5. **No se pueden doblar múltiples personajes por separado**, ya que la capacidad de distinción de hablantes de whisperX no es suficientemente confiable.
## 📄 Licencia
Este proyecto está licenciado bajo la Licencia Apache 2.0. Un agradecimiento especial a los siguientes proyectos de código abierto por sus contribuciones:
[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE)
## 📬 Contáctame
- Envía [Issues](https://github.com/Huanshere/VideoLingo/issues) o [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) en GitHub
- Envíame un DM en Twitter: [@Huanshere](https://twitter.com/Huanshere)
- Envíame un correo a: team@videolingo.io
## ⭐ Historial de Estrellas
[Gráfico del historial de estrellas](https://star-history.com/#Huanshere/VideoLingo&Timeline)
---
Si encuentras útil VideoLingo, ¡por favor dame una ⭐️!
# Connecter le Monde, Image par Image
[**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Aperçu ([Essayez VL maintenant !](https://videolingo.io))
VideoLingo est un outil tout-en-un de traduction, de localisation et de doublage vidéo visant à générer des sous-titres de qualité Netflix. Il élimine les traductions automatiques rigides et les sous-titres multi-lignes tout en ajoutant un doublage de haute qualité, permettant le partage des connaissances à l'échelle mondiale au-delà des barrières linguistiques.
Fonctionnalités principales :
- 🎥 Téléchargement de vidéos YouTube via yt-dlp
- **🎙️ Reconnaissance de sous-titres au niveau des mots et à faible taux d'hallucination avec WhisperX**
- **📝 Segmentation des sous-titres basée sur le NLP et l'IA**
- **📚 Terminologie personnalisée + générée par IA pour une traduction cohérente**
- **🔄 Processus en 3 étapes : Traduction-Réflexion-Adaptation pour une qualité cinématographique**
- **✅ Sous-titres uniquement sur une ligne, aux normes Netflix**
- **🗣️ Doublage avec GPT-SoVITS, Azure, OpenAI et plus**
- 🚀 Démarrage et traitement en un clic dans Streamlit
- 🌍 Support multi-langues dans l'interface utilisateur Streamlit
- 📝 Journalisation détaillée avec reprise de la progression
Différence par rapport aux projets similaires : **Sous-titres sur une seule ligne uniquement, qualité de traduction supérieure, expérience de doublage transparente**
## 🎥 Démo
### GPT-SoVITS avec ma voix
---
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Support des langues
**Support des langues d'entrée (d'autres à venir) :**
🇺🇸 Anglais 🤩 | 🇷🇺 Russe 😊 | 🇫🇷 Français 🤩 | 🇩🇪 Allemand 🤩 | 🇮🇹 Italien 🤩 | 🇪🇸 Espagnol 🤩 | 🇯🇵 Japonais 😐 | 🇨🇳 Chinois* 😊
> *Le chinois utilise un modèle whisper séparé amélioré par la ponctuation, pour l'instant...
**La traduction prend en charge toutes les langues, tandis que la langue de doublage dépend de la méthode TTS choisie.**
## Installation
Vous rencontrez un problème ? Discutez avec notre agent IA gratuit en ligne [**ici**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh) pour vous aider.
> **Note :** Pour les utilisateurs Windows avec un GPU NVIDIA, suivez ces étapes avant l'installation :
> 1. Installez [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
> 2. Installez [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. Ajoutez `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` à votre PATH système
> 4. Redémarrez votre ordinateur
> **Note :** FFmpeg est requis. Veuillez l'installer via les gestionnaires de paquets :
> - Windows : ```choco install ffmpeg``` (via [Chocolatey](https://chocolatey.org/))
> - macOS : ```brew install ffmpeg``` (via [Homebrew](https://brew.sh/))
> - Linux : ```sudo apt install ffmpeg``` (Debian/Ubuntu)
1. Clonez le dépôt
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. Installez les dépendances (nécessite `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. Démarrez l'application
```bash
streamlit run st.py
```
### Docker
Alternativement, vous pouvez utiliser Docker (nécessite CUDA 12.4 et NVIDIA Driver version >550), voir [Documentation Docker](/docs/pages/docs/docker.en-US.md) :
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## APIs
VideoLingo prend en charge le format d'API OpenAI et diverses interfaces TTS :
- LLM : `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (triés par performance, soyez prudent avec gemini-2.5-flash...)
- WhisperX : Exécutez whisperX localement ou utilisez l'API 302.ai
- TTS : `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(Vous pouvez modifier votre propre TTS dans custom_tts.py !)
> **Note :** VideoLingo fonctionne avec **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - une seule clé API pour tous les services (LLM, WhisperX, TTS). Ou exécutez localement avec Ollama et Edge-TTS gratuitement, sans API nécessaire !
Pour des instructions détaillées sur l'installation, la configuration de l'API et le mode batch, veuillez consulter la documentation : [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md)
## Limitations actuelles
1. Les performances de transcription de WhisperX peuvent être affectées par le bruit de fond de la vidéo, car il utilise le modèle wav2vec pour l'alignement. Pour les vidéos avec une musique de fond forte, veuillez activer l'amélioration de la séparation vocale. De plus, les sous-titres se terminant par des chiffres ou des caractères spéciaux peuvent être tronqués prématurément en raison de l'incapacité de wav2vec à mapper les caractères numériques (par exemple, "1") à leur forme parlée ("un").
2. L'utilisation de modèles plus faibles peut entraîner des erreurs lors des processus intermédiaires en raison des exigences strictes de format JSON pour les réponses. Si cette erreur se produit, veuillez supprimer le dossier `output` et réessayer avec un LLM différent, sinon l'exécution répétée lira la réponse erronée précédente causant la même erreur.
3. La fonction de doublage peut ne pas être parfaite à 100% en raison des différences de débit et d'intonation entre les langues, ainsi que de l'impact de l'étape de traduction. Cependant, ce projet a mis en œuvre un traitement d'ingénierie extensif pour les débits de parole afin d'assurer les meilleurs résultats de doublage possibles.
4. **La reconnaissance de transcription vidéo multilingue ne conservera que la langue principale**. C'est parce que whisperX utilise un modèle spécialisé pour une seule langue lors de l'alignement forcé des sous-titres au niveau des mots, et supprimera les langues non reconnues.
5. **Impossible de doubler séparément plusieurs personnages**, car la capacité de distinction des locuteurs de whisperX n'est pas suffisamment fiable.
## 📄 Licence
Ce projet est sous licence Apache 2.0. Remerciements spéciaux aux projets open source suivants pour leurs contributions :
[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE)
## 📬 Contactez-moi
- Soumettez des [Issues](https://github.com/Huanshere/VideoLingo/issues) ou des [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls) sur GitHub
- Envoyez-moi un message privé sur Twitter : [@Huanshere](https://twitter.com/Huanshere)
- Envoyez-moi un email à : team@videolingo.io
## ⭐ Historique des étoiles
[![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline)
---
Si vous trouvez VideoLingo utile, donnez-moi une ⭐️ !
# Объединяя Мир, Кадр за Кадром
[**English**](/README.md)|[**简体中文**](/translations/README.zh.md)|[**繁體中文**](/translations/README.zh-TW.md)|[**日本語**](/translations/README.ja.md)|[**Español**](/translations/README.es.md)|[**Русский**](/translations/README.ru.md)|[**Français**](/translations/README.fr.md)
## 🌟 Обзор ([Попробуйте VL бесплатно!](https://videolingo.io))
VideoLingo - это универсальный инструмент для перевода, локализации и дубляжа видео, направленный на создание субтитров качества Netflix. Он устраняет механические переводы и многострочные субтитры, добавляя высококачественный дубляж, что позволяет делиться знаниями по всему миру, преодолевая языковые барьеры.
Ключевые особенности:
- 🎥 Загрузка видео с YouTube через yt-dlp
- **🎙️ Пословное распознавание субтитров с низким уровнем искажений с помощью WhisperX**
- **📝 Сегментация субтитров на основе NLP и ИИ**
- **📚 Пользовательская + ИИ-генерируемая терминология для согласованного перевода**
- **🔄 3-этапный процесс Перевод-Осмысление-Адаптация для кинематографического качества**
- **✅ Только однострочные субтитры стандарта Netflix**
- **🗣️ Дубляж с помощью GPT-SoVITS, Azure, OpenAI и других**
- 🚀 Запуск и обработка в один клик в Streamlit
- 🌍 Многоязычная поддержка в интерфейсе Streamlit
- 📝 Подробное логирование с возможностью возобновления прогресса
Отличие от похожих проектов: **Только однострочные субтитры, превосходное качество перевода, безупречный опыт дубляжа**
## 🎥 Демонстрация
### GPT-SoVITS с моим голосом
---
https://github.com/user-attachments/assets/47d965b2-b4ab-4a0b-9d08-b49a7bf3508c
### Поддержка языков
**Поддержка входных языков (будет добавлено больше):**
🇺🇸 Английский 🤩 | 🇷🇺 Русский 😊 | 🇫🇷 Французский 🤩 | 🇩🇪 Немецкий 🤩 | 🇮🇹 Итальянский 🤩 | 🇪🇸 Испанский 🤩 | 🇯🇵 Японский 😐 | 🇨🇳 Китайский* 😊
> *Китайский пока использует отдельную модель whisper с улучшенной пунктуацией...
**Перевод поддерживает все языки, в то время как язык дубляжа зависит от выбранного метода TTS.**
## Установка
Возникли проблемы? Общайтесь с нашим бесплатным онлайн ИИ-агентом [**здесь**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh), который поможет вам.
> **Примечание:** Для пользователей Windows с GPU NVIDIA выполните следующие шаги перед установкой:
> 1. Установите [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
> 2. Установите [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. Добавьте `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` в системный PATH
> 4. Перезагрузите компьютер
> **Примечание:** Требуется FFmpeg. Установите его через менеджеры пакетов:
> - Windows: ```choco install ffmpeg``` (через [Chocolatey](https://chocolatey.org/))
> - macOS: ```brew install ffmpeg``` (через [Homebrew](https://brew.sh/))
> - Linux: ```sudo apt install ffmpeg``` (Debian/Ubuntu)
1. Клонируйте репозиторий
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. Установите зависимости (требуется `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. Запустите приложение
```bash
streamlit run st.py
```
### Docker
Альтернативно, вы можете использовать Docker (требуется CUDA 12.4 и версия драйвера NVIDIA >550), см. [документацию Docker](/docs/pages/docs/docker.en-US.md):
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## API
VideoLingo поддерживает формат API, подобный OpenAI, и различные интерфейсы TTS:
- LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ... (отсортировано по производительности, будьте осторожны с gemini-2.5-flash...)
- WhisperX: Запускайте whisperX локально или используйте API 302.ai
- TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(Вы можете модифицировать свой собственный TTS в custom_tts.py!)
> **Примечание:** VideoLingo работает с **[302.ai](https://gpt302.saaslink.net/C2oHR9)** - один API-ключ для всех сервисов (LLM, WhisperX, TTS). Или запускайте локально с Ollama и Edge-TTS бесплатно, без необходимости в API!
Для подробных инструкций по установке, настройке API и пакетному режиму обратитесь к документации: [English](/docs/pages/docs/start.en-US.md) | [中文](/docs/pages/docs/start.zh-CN.md)
## Текущие ограничения
1. Качество транскрипции WhisperX может снижаться из-за фонового шума в видео, так как для выравнивания используется модель wav2vec. Для видео с громкой фоновой музыкой включите Улучшение разделения голоса. Кроме того, субтитры, заканчивающиеся цифрами или специальными символами, могут быть обрезаны преждевременно из-за неспособности wav2vec сопоставлять цифровые символы (например, "1") с их произносимой формой ("один").
2. Использование более слабых моделей может привести к ошибкам во время промежуточных процессов из-за строгих требований к формату JSON для ответов. Если возникает эта ошибка, удалите пап
================================================
FILE: translations/README.zh-TW.md
================================================
### 语言支持
**输入语言支持:**
🇺🇸 英语 🤩 | 🇷🇺 俄语 😊 | 🇫🇷 法语 🤩 | 🇩🇪 德语 🤩 | 🇮🇹 意大利语 🤩 | 🇪🇸 西班牙语 🤩 | 🇯🇵 日语 😐 | 🇨🇳 中文* 😊
> *中文使用单独的标点增强后的 whisper 模型
**翻译语言支持所有语言,配音语言取决于选取的TTS。**
## 安装
遇到问题?在[**这里**](https://share.fastgpt.in/chat/share?shareId=066w11n3r9aq6879r4z0v9rh)与我们的免费在线AI助手交流获取帮助。
> **注意:** 在 Windows 上使用 NVIDIA GPU 加速需要先完成以下步骤:
> 1. 安装 [CUDA Toolkit 12.6](https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.76_windows.exe)
> 2. 安装 [CUDNN 9.3.0](https://developer.download.nvidia.com/compute/cudnn/9.3.0/local_installers/cudnn_9.3.0_windows.exe)
> 3. 将 `C:\Program Files\NVIDIA\CUDNN\v9.3\bin\12.6` 添加到系统环境变量 PATH 中
> 4. 重启电脑
> **注意:** FFmpeg 是必需的,请通过包管理器安装:
> - Windows:```choco install ffmpeg```(通过 [Chocolatey](https://chocolatey.org/))
> - macOS:```brew install ffmpeg```(通过 [Homebrew](https://brew.sh/))
> - Linux:```sudo apt install ffmpeg```(Debian/Ubuntu)
1. 克隆仓库
```bash
git clone https://github.com/Huanshere/VideoLingo.git
cd VideoLingo
```
2. 安装依赖(需要 `python=3.10`)
```bash
conda create -n videolingo python=3.10.0 -y
conda activate videolingo
python install.py
```
3. 启动应用
```bash
streamlit run st.py
```
### Docker
还可以选择使用 Docker(要求 CUDA 12.4 和 NVIDIA Driver 版本 >550),详见[Docker文档](/docs/pages/docs/docker.zh-CN.md):
```bash
docker build -t videolingo .
docker run -d -p 8501:8501 --gpus all videolingo
```
## API
本项目支持 OpenAI-Like 格式的 api 和多种配音接口:
- LLM: `claude-3-5-sonnet`, `gpt-4.1`, `deepseek-v3`, `gemini-2.0-flash`, ...(按效果排序,使用 gemini-2.5-flash 时需谨慎...)
- WhisperX: 本地运行 WhisperX 或使用 302.ai API
- TTS: `azure-tts`, `openai-tts`, `siliconflow-fishtts`, **`fish-tts`**, `GPT-SoVITS`, `edge-tts`, `*custom-tts`(你可以在 custom_tts.py 中自定义 TTS!)
> **注意:** VideoLingo 现已与 **[302.ai](https://gpt302.saaslink.net/C2oHR9)** 集成,**一个 API KEY** 即可同时支持 LLM、WhisperX 和 TTS!同时也支持完全本地部署,使用 Ollama 作为 LLM 和 Edge-TTS 作为配音,无需云端 API!
详细的安装、API 配置、批量说明可以参见文档:[English](/docs/pages/docs/start.en-US.md) | [简体中文](/docs/pages/docs/start.zh-CN.md)
## 当前限制
1. WhisperX 转录效果可能受到视频背景声影响,因为使用了 wav2vec 模型进行对齐。对于背景音乐较大的视频,请开启人声分离增强。另外,如果字幕以数字或特殊符号结尾,可能会导致提前截断,这是因为 wav2vec 无法将数字字符(如"1")映射到其发音形式("one")。
2. 使用较弱模型时容易在中间过程报错,这是因为对响应的 json 格式要求较为严格。如果出现此错误,请删除 `output` 文件夹后更换 llm 重试,否则重复执行会读取上次错误的响应导致同样错误。
3. 配音功能由于不同语言的语速和语调差异,还受到翻译步骤的影响,可能不能 100% 完美,但本项目做了非常多的语速上的工程处理,尽可能保证配音效果。
4. **多语言视频转录识别只会保留主要语言**,这是由于 whisperX 在强制对齐单词级字幕时使用的是针对单个语言的特化模型,无法识别的其他语言内容会被删去。
5. **无法多角色分别配音**,whisperX 的说话人区分效果不够好用。
## 📄 许可证
本项目采用 Apache 2.0 许可证,衷心感谢以下开源项目的贡献:
[whisperX](https://github.com/m-bain/whisperX), [yt-dlp](https://github.com/yt-dlp/yt-dlp), [json_repair](https://github.com/mangiucugna/json_repair), [BELLE](https://github.com/LianjiaTech/BELLE)
## 📬 联系
- 加入 QQ 群寻求解答:875297969
- 在 GitHub 上提交 [Issues](https://github.com/Huanshere/VideoLingo/issues) 或 [Pull Requests](https://github.com/Huanshere/VideoLingo/pulls)
- 关注我的 Twitter:[@Huanshere](https://twitter.com/Huanshere)
- 联系邮箱:team@videolingo.io
## ⭐ Star History
[![Star History Chart](https://api.star-history.com/svg?repos=Huanshere/VideoLingo&type=Timeline)](https://star-history.com/#Huanshere/VideoLingo&Timeline)
================================================
FILE: translations/en.json
================================================
{
"a. Download or Upload Video": "a. Download or Upload Video",
"Delete and Reselect": "Delete and Reselect",
"Enter YouTube link:": "Enter YouTube link:",
"Resolution": "Resolution",
"Download Video": "Download Video",
"Or upload video": "Or upload video",
"Youtube Settings": "Youtube Settings",
"Cookies Path": "Cookies Path",
"LLM Configuration": "LLM Configuration",
"API_KEY": "API_KEY",
"BASE_URL": "BASE_URL",
"MODEL": "MODEL",
"Openai format, will add /v1/chat/completions automatically": "Openai format, will add /v1/chat/completions automatically",
"click to check API validity": "click to check API validity",
"API Key is valid": "API Key is valid",
"API Key is invalid": "API Key is invalid",
"Recog Lang": "Recog Lang",
"Subtitles Settings": "Subtitles Settings",
"Target Lang": "Target Lang",
"Input any language in natural language, as long as llm can understand": "Input any language in natural language, as long as llm can understand",
"Vocal separation enhance": "Vocal separation enhance",
"Burn-in Subtitles": "Burn-in Subtitles",
"Whether to burn subtitles into the video, will increase processing time": "Whether to burn subtitles into the video, will increase processing time",
"Video Resolution": "Video Resolution",
"Recommended for videos with loud background noise, but will increase processing time": "Recommended for videos with loud background noise, but will increase processing time",
"Dubbing Settings": "Dubbing Settings",
"TTS Method": "TTS Method",
"SiliconFlow API Key": "SiliconFlow API Key",
"Mode Selection": "Mode Selection",
"Preset": "Preset",
"Refer_stable": "Refer_stable",
"Refer_dynamic": "Refer_dynamic",
"OpenAI Voice": "OpenAI Voice",
"Fish TTS Character": "Fish TTS Character",
"Azure Voice": "Azure Voice",
"Please refer to Github homepage for GPT_SoVITS configuration": "Please refer to Github homepage for GPT_SoVITS configuration",
"SoVITS Character": "SoVITS Character",
"Refer Mode": "Refer Mode",
"Mode 1: Use provided reference audio only": "Mode 1: Use provided reference audio only",
"Mode 2: Use first audio from video as reference": "Mode 2: Use first audio from video as reference",
"Mode 3: Use each audio from video as reference": "Mode 3: Use each audio from video as reference",
"Configure reference audio mode for GPT-SoVITS": "Configure reference audio mode for GPT-SoVITS",
"Edge TTS Voice": "Edge TTS Voice",
"=====NOTE=====": "BELOW IS in st.py",
"b. Translate and Generate Subtitles": "b. Translate and Generate Subtitles",
"This stage includes the following steps:": "This stage includes the following steps:",
"WhisperX word-level transcription": "WhisperX word-level transcription",
"Sentence segmentation using NLP and LLM": "Sentence segmentation using NLP and LLM",
"Summarization and multi-step translation": "Summarization and multi-step translation",
"Cutting and aligning long subtitles": "Cutting and aligning long subtitles",
"Generating timeline and subtitles": "Generating timeline and subtitles",
"Merging subtitles into the video": "Merging subtitles into the video",
"Start Processing Subtitles": "Start Processing Subtitles",
"Download All Srt Files": "Download All Srt Files",
"Archive to 'history'": "Archive to 'history'",
"Using Whisper for transcription...": "Using Whisper for transcription...",
"Splitting long sentences...": "Splitting long sentences...",
"Summarizing and translating...": "Summarizing and translating...",
"Processing and aligning subtitles...": "Processing and aligning subtitles...",
"Merging subtitles to video...": "Merging subtitles to video...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...",
"Subtitle processing complete! 🎉": "Subtitle processing complete! 🎉",
"c. Dubbing": "c. Dubbing",
"Generate audio tasks and chunks": "Generate audio tasks and chunks",
"Extract reference audio": "Extract reference audio",
"Generate and merge audio files": "Generate and merge audio files",
"Merge final audio into video": "Merge final audio into video",
"Start Audio Processing": "Start Audio Processing",
"Audio processing is complete! You can check the audio files in the `output` folder.": "Audio processing is complete! You can check the audio files in the `output` folder.",
"Delete dubbing files": "Delete dubbing files",
"Generate audio tasks": "Generate audio tasks",
"Extract refer audio": "Extract refer audio",
"Generate all audio": "Generate all audio",
"Merge full audio": "Merge full audio",
"Merge dubbing to the video": "Merge dubbing to the video",
"Audio processing complete! 🎇": "Audio processing complete! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!",
"WhisperX Runtime": "WhisperX Runtime",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key",
"WhisperX 302ai API": "WhisperX 302ai API",
"=====NOTE2=====": "BELOW IS in install.py",
"🚀 Starting Installation": "🚀 Starting Installation",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.",
"❌ Failed to install requirements:": "❌ Failed to install requirements:",
"✅ FFmpeg is already installed": "✅ FFmpeg is already installed",
"❌ FFmpeg not found\n\n": "❌ FFmpeg not found\n\n",
"🛠️ Install using:": "🛠️ Install using:",
"💡 Note:": "💡 Note:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 After installing FFmpeg, please run this installer again:",
"Install Chocolatey first (https://chocolatey.org/)": "Install Chocolatey first (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "Install Homebrew first (https://brew.sh/)",
"Use your distribution's package manager": "Use your distribution's package manager",
"FFmpeg is required. Please install it and run the installer again.": "FFmpeg is required. Please install it and run the installer again.",
"Installing requirements using `pip install -r requirements.txt`": "Installing requirements using `pip install -r requirements.txt`",
"Installation completed": "Installation completed",
"Now I will run this command to start the application:": "Now I will run this command to start the application:",
"Note: First startup may take up to 1 minute": "Note: First startup may take up to 1 minute",
"If the application fails to start:": "If the application fails to start:",
"Check your network connection": "Check your network connection",
"Re-run the installer: [bold]python install.py[/bold]": "Re-run the installer: [bold]python install.py[/bold]",
"Detected NVIDIA GPU(s)": "Detected NVIDIA GPU(s)",
"No NVIDIA GPU detected": "No NVIDIA GPU detected",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "No NVIDIA GPU detected or NVIDIA drivers not properly installed",
"LLM JSON Format Support": "LLM JSON Format Support",
"Enable if your LLM supports JSON mode output": "Enable if your LLM supports JSON mode output"
}
================================================
FILE: translations/es.json
================================================
{
"a. Download or Upload Video": "a. Descargar o subir video",
"Delete and Reselect": "Eliminar y volver a seleccionar",
"Enter YouTube link:": "Ingrese el enlace de YouTube:",
"Resolution": "Resolución",
"Download Video": "Descargar video",
"Or upload video": "O subir video",
"Youtube Settings": "Configuración de Youtube",
"Cookies Path": "Ruta del archivo de Cookies",
"LLM Configuration": "Configuración de LLM",
"API_KEY": "Clave API",
"BASE_URL": "URL base",
"MODEL": "Modelo",
"Openai format, will add /v1/chat/completions automatically": "Formato OpenAI, se agregará /v1/chat/completions automáticamente",
"click to check API validity": "haga clic para verificar la validez de la API",
"API Key is valid": "La clave API es válida",
"API Key is invalid": "La clave API no es válida",
"Recog Lang": "Idioma de reconocimiento",
"Subtitles Settings": "Configuración de subtítulos",
"Target Lang": "Idioma objetivo",
"Input any language in natural language, as long as llm can understand": "Ingrese cualquier idioma en lenguaje natural, siempre que LLM pueda entenderlo",
"Vocal separation enhance": "Mejora de separación vocal",
"Burn-in Subtitles": "Incrustar subtítulos",
"Whether to burn subtitles into the video, will increase processing time": "Si se deben incrustar los subtítulos en el video, aumentará el tiempo de procesamiento",
"Video Resolution": "Resolución de video",
"Recommended for videos with loud background noise, but will increase processing time": "Recomendado para videos con ruido de fondo fuerte, pero aumentará el tiempo de procesamiento",
"Dubbing Settings": "Configuración de doblaje",
"TTS Method": "Método TTS",
"SiliconFlow API Key": "Clave API de SiliconFlow",
"Mode Selection": "Selección de modo",
"Preset": "Preestablecido",
"Refer_stable": "Referencia estable",
"Refer_dynamic": "Referencia dinámica",
"OpenAI Voice": "Voz de OpenAI",
"Fish TTS Character": "Personaje Fish TTS",
"Azure Voice": "Voz de Azure",
"Please refer to Github homepage for GPT_SoVITS configuration": "Consulte la página principal de Github para la configuración de GPT_SoVITS",
"SoVITS Character": "Personaje SoVITS",
"Refer Mode": "Modo de referencia",
"Mode 1: Use provided reference audio only": "Modo 1: Usar solo el audio de referencia proporcionado",
"Mode 2: Use first audio from video as reference": "Modo 2: Usar el primer audio del video como referencia",
"Mode 3: Use each audio from video as reference": "Modo 3: Usar cada audio del video como referencia",
"Configure reference audio mode for GPT-SoVITS": "Configurar modo de audio de referencia para GPT-SoVITS",
"Edge TTS Voice": "Voz Edge TTS",
"=====NOTE=====": "Lo siguiente es el contenido de st.py",
"b. Translate and Generate Subtitles": "b. Traducir y generar subtítulos",
"This stage includes the following steps:": "Esta etapa incluye los siguientes pasos:",
"WhisperX word-level transcription": "Transcripción a nivel de palabra WhisperX",
"Sentence segmentation using NLP and LLM": "Segmentación de oraciones usando NLP y LLM",
"Summarization and multi-step translation": "Resumen y traducción en múltiples pasos",
"Cutting and aligning long subtitles": "Cortar y alinear subtítulos largos",
"Generating timeline and subtitles": "Generar línea de tiempo y subtítulos",
"Merging subtitles into the video": "Fusionar subtítulos en el video",
"Start Processing Subtitles": "Comenzar procesamiento de subtítulos",
"Download All Srt Files": "Descargar todos los archivos Srt",
"Archive to 'history'": "Archivar en 'history'",
"Using Whisper for transcription...": "Usando Whisper para transcripción...",
"Splitting long sentences...": "Dividiendo oraciones largas...",
"Summarizing and translating...": "Resumiendo y traduciendo...",
"Processing and aligning subtitles...": "Procesando y alineando subtítulos...",
"Merging subtitles to video...": "Fusionando subtítulos al video...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSA_ANTES_DE_TRADUCIR. Vaya a `output/log/terminology.json` para editar la terminología. Luego presione ENTER para continuar...",
"Subtitle processing complete! 🎉": "¡Procesamiento de subtítulos completado! 🎉",
"c. Dubbing": "c. Doblaje",
"Generate audio tasks and chunks": "Generar tareas y fragmentos de audio",
"Extract reference audio": "Extraer audio de referencia",
"Generate and merge audio files": "Generar y fusionar archivos de audio",
"Merge final audio into video": "Fusionar audio final en el video",
"Start Audio Processing": "Comenzar procesamiento de audio",
"Audio processing is complete! You can check the audio files in the `output` folder.": "¡El procesamiento de audio está completo! Puede verificar los archivos de audio en la carpeta `output`.",
"Delete dubbing files": "Eliminar archivos de doblaje",
"Generate audio tasks": "Generar tareas de audio",
"Extract refer audio": "Extraer audio de referencia",
"Generate all audio": "Generar todo el audio",
"Merge full audio": "Fusionar audio completo",
"Merge dubbing to the video": "Fusionar doblaje al video",
"Audio processing complete! 🎇": "¡Procesamiento de audio completado! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Hola, bienvenido a VideoLingo. Si encuentra algún problema, no dude en obtener respuestas instantáneas con nuestro Agente de preguntas y respuestas gratuito aquí. ¡También puede probar gratis nuestro sitio web SaaS en videolingo.io!",
"WhisperX Runtime": "Entorno de WhisperX",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "El entorno local requiere GPU >8GB, el entorno en la nube requiere clave API 302ai, el entorno elevenlabs requiere clave API ElevenLabs",
"WhisperX 302ai API": "API 302ai de WhisperX",
"=====NOTE2=====": "A continuación está en install.py",
"🚀 Starting Installation": "🚀 Iniciando instalación",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "¿Necesita configurar automáticamente los espejos PyPI? (Recomendado si tiene dificultades para acceder a pypi.org)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 GPU NVIDIA detectada, instalando versión CUDA de PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS detectado, instalando versión CPU de PyTorch... Nota: puede ser lento durante la transcripción de whisperX.",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 No se detectó GPU NVIDIA, instalando versión CPU de PyTorch... Nota: puede ser lento durante la transcripción de whisperX.",
"❌ Failed to install requirements:": "❌ Error al instalar los requisitos:",
"✅ FFmpeg is already installed": "✅ FFmpeg ya está instalado",
"❌ FFmpeg not found\n\n": "❌ FFmpeg no encontrado\n\n",
"🛠️ Install using:": "🛠️ Instalar usando:",
"💡 Note:": "💡 Nota:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 Después de instalar FFmpeg, ejecute este instalador nuevamente:",
"Install Chocolatey first (https://chocolatey.org/)": "Instale Chocolatey primero (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "Instale Homebrew primero (https://brew.sh/)",
"Use your distribution's package manager": "Use el gestor de paquetes de su distribución",
"FFmpeg is required. Please install it and run the installer again.": "Se requiere FFmpeg. Por favor instálelo y ejecute el instalador nuevamente.",
"Installation completed": "Instalación completada",
"Now I will run this command to start the application:": "Ahora ejecutaré este comando para iniciar la aplicación:",
"Note: First startup may take up to 1 minute": "Nota: El primer inicio puede tardar hasta 1 minuto",
"If the application fails to start:": "Si la aplicación no se inicia:",
"Check your network connection": "Compruebe su conexión de red",
"Re-run the installer: [bold]python install.py[/bold]": "Vuelva a ejecutar el instalador: [bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "Instalando dependencias usando `pip install -r requirements.txt`",
"Detected NVIDIA GPU(s)": "GPU(s) NVIDIA detectada(s)",
"No NVIDIA GPU detected": "No se detectó GPU NVIDIA",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "No se detectó GPU NVIDIA o los controladores NVIDIA no están instalados correctamente",
"LLM JSON Format Support": "Soporte de formato JSON para LLM",
"Enable if your LLM supports JSON mode output": "Activar si su LLM admite salida en modo JSON"
}
================================================
FILE: translations/fr.json
================================================
{
"a. Download or Upload Video": "a. Télécharger ou importer une vidéo",
"Delete and Reselect": "Supprimer et resélectionner",
"Enter YouTube link:": "Entrez le lien YouTube :",
"Resolution": "Résolution",
"Download Video": "Télécharger la vidéo",
"Or upload video": "Ou importer une vidéo",
"Youtube Settings": "Paramètres Youtube",
"Cookies Path": "Chemin du fichier Cookies",
"LLM Configuration": "Configuration LLM",
"API_KEY": "Clé API",
"BASE_URL": "URL de base",
"MODEL": "Modèle",
"Openai format, will add /v1/chat/completions automatically": "Format OpenAI, /v1/chat/completions sera ajouté automatiquement",
"click to check API validity": "Cliquez pour vérifier la validité de l'API",
"API Key is valid": "La clé API est valide",
"API Key is invalid": "La clé API n'est pas valide",
"Recog Lang": "Langue de reconnaissance",
"Subtitles Settings": "Paramètres des sous-titres",
"Target Lang": "Langue cible",
"Input any language in natural language, as long as llm can understand": "Saisissez n'importe quelle langue en langage naturel, tant que le LLM peut la comprendre",
"Vocal separation enhance": "Amélioration de la séparation vocale",
"Burn-in Subtitles": "Incruster les sous-titres",
"Whether to burn subtitles into the video, will increase processing time": "Incruster ou non les sous-titres dans la vidéo ; cela augmentera le temps de traitement",
"Video Resolution": "Résolution vidéo",
"Recommended for videos with loud background noise, but will increase processing time": "Recommandé pour les vidéos avec beaucoup de bruit de fond, mais augmente le temps de traitement",
"Dubbing Settings": "Paramètres de doublage",
"TTS Method": "Méthode TTS",
"SiliconFlow API Key": "Clé API SiliconFlow",
"Mode Selection": "Sélection du mode",
"Preset": "Préréglage",
"Refer_stable": "Référence stable",
"Refer_dynamic": "Référence dynamique",
"OpenAI Voice": "Voix OpenAI",
"Fish TTS Character": "Personnage Fish TTS",
"Azure Voice": "Voix Azure",
"Please refer to Github homepage for GPT_SoVITS configuration": "Veuillez consulter la page Github pour la configuration GPT_SoVITS",
"SoVITS Character": "Personnage SoVITS",
"Refer Mode": "Mode de référence",
"Mode 1: Use provided reference audio only": "Mode 1 : Utiliser uniquement l'audio de référence fourni",
"Mode 2: Use first audio from video as reference": "Mode 2 : Utiliser le premier audio de la vidéo comme référence",
"Mode 3: Use each audio from video as reference": "Mode 3 : Utiliser chaque audio de la vidéo comme référence",
"Configure reference audio mode for GPT-SoVITS": "Configurer le mode audio de référence pour GPT-SoVITS",
"Edge TTS Voice": "Voix Edge TTS",
"=====NOTE=====": "Ce qui suit est dans st.py",
"=====NOTE2=====": "Ce qui suit est dans install.py",
"🚀 Starting Installation": "🚀 Démarrage de l'installation",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Voulez-vous configurer automatiquement les miroirs PyPI ? (Recommandé si vous avez des difficultés à accéder à pypi.org)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 GPU NVIDIA détecté, installation de la version CUDA de PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOS détecté, installation de la version CPU de PyTorch... Note : la transcription whisperX peut être lente.",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 Aucun GPU NVIDIA détecté, installation de la version CPU de PyTorch... Note : la transcription whisperX peut être lente.",
"❌ Failed to install requirements:": "❌ Échec de l'installation des prérequis :",
"✅ FFmpeg is already installed": "✅ FFmpeg est déjà installé",
"❌ FFmpeg not found\n\n": "❌ FFmpeg non trouvé\n\n",
"🛠️ Install using:": "🛠️ Installer avec :",
"💡 Note:": "💡 Note :",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 Après l'installation de FFmpeg, veuillez relancer cet installateur :",
"Install Chocolatey first (https://chocolatey.org/)": "Installez d'abord Chocolatey (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "Installez d'abord Homebrew (https://brew.sh/)",
"Use your distribution's package manager": "Utilisez le gestionnaire de paquets de votre distribution",
"FFmpeg is required. Please install it and run the installer again.": "FFmpeg est requis. Veuillez l'installer et relancer l'installateur.",
"Installation completed": "Installation terminée",
"Now I will run this command to start the application:": "Je vais maintenant exécuter cette commande pour démarrer l'application :",
"Note: First startup may take up to 1 minute": "Note : Le premier démarrage peut prendre jusqu'à 1 minute",
"If the application fails to start:": "Si l'application ne démarre pas :",
"Check your network connection": "Vérifiez votre connexion réseau",
"Re-run the installer: [bold]python install.py[/bold]": "Relancez l'installateur : [bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "Installation des dépendances avec `pip install -r requirements.txt`",
"b. Translate and Generate Subtitles": "b. Traduire et générer les sous-titres",
"This stage includes the following steps:": "Cette étape comprend les étapes suivantes :",
"WhisperX word-level transcription": "Transcription WhisperX au niveau des mots",
"Sentence segmentation using NLP and LLM": "Segmentation des phrases utilisant NLP et LLM",
"Summarization and multi-step translation": "Résumé et traduction en plusieurs étapes",
"Cutting and aligning long subtitles": "Découpage et alignement des longs sous-titres",
"Generating timeline and subtitles": "Génération de la chronologie et des sous-titres",
"Merging subtitles into the video": "Fusion des sous-titres dans la vidéo",
"Start Processing Subtitles": "Démarrer le traitement des sous-titres",
"Download All Srt Files": "Télécharger tous les fichiers Srt",
"Archive to 'history'": "Archiver dans 'history'",
"Using Whisper for transcription...": "Utilisation de Whisper pour la transcription...",
"Splitting long sentences...": "Division des longues phrases...",
"Summarizing and translating...": "Résumé et traduction en cours...",
"Processing and aligning subtitles...": "Traitement et alignement des sous-titres...",
"Merging subtitles to video...": "Fusion des sous-titres dans la vidéo...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ PAUSE_AVANT_TRADUCTION. Allez dans `output/log/terminology.json` pour éditer la terminologie. Puis appuyez sur ENTRÉE pour continuer...",
"Subtitle processing complete! 🎉": "Traitement des sous-titres terminé ! 🎉",
"c. Dubbing": "c. Doublage",
"Generate audio tasks and chunks": "Générer les tâches audio et les segments",
"Extract reference audio": "Extraire l'audio de référence",
"Generate and merge audio files": "Générer et fusionner les fichiers audio",
"Merge final audio into video": "Fusionner l'audio final dans la vidéo",
"Start Audio Processing": "Démarrer le traitement audio",
"Audio processing is complete! You can check the audio files in the `output` folder.": "Le traitement audio est terminé ! Vous pouvez vérifier les fichiers audio dans le dossier `output`.",
"Delete dubbing files": "Supprimer les fichiers de doublage",
"Generate audio tasks": "Générer les tâches audio",
"Extract refer audio": "Extraire l'audio de référence",
"Generate all audio": "Générer tout l'audio",
"Merge full audio": "Fusionner l'audio complet",
"Merge dubbing to the video": "Fusionner le doublage dans la vidéo",
"Audio processing complete! 🎇": "Traitement audio terminé ! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Bonjour, bienvenue sur VideoLingo. Si vous rencontrez des problèmes, n'hésitez pas à obtenir des réponses instantanées avec notre Agent QA gratuit ici ! Vous pouvez également essayer gratuitement notre site web SaaS sur videolingo.io !",
"WhisperX Runtime": "Environnement WhisperX",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "L'environnement local nécessite un GPU >8GB, l'environnement cloud nécessite une clé API 302ai, l'environnement elevenlabs nécessite une clé API ElevenLabs",
"WhisperX 302ai API": "API 302ai WhisperX",
"Detected NVIDIA GPU(s)": "GPU(s) NVIDIA détecté(s)",
"No NVIDIA GPU detected": "Aucun GPU NVIDIA détecté",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "Aucun GPU NVIDIA détecté ou pilotes NVIDIA mal installés",
"LLM JSON Format Support": "Support du format JSON pour LLM",
"Enable if your LLM supports JSON mode output": "Activer si votre LLM prend en charge la sortie en mode JSON"
}
================================================
FILE: translations/ja.json
================================================
{
"a. Download or Upload Video": "a. 動画のダウンロードまたはアップロード",
"Delete and Reselect": "削除して再選択",
"Enter YouTube link:": "YouTubeリンクを入力:",
"Resolution": "解像度",
"Download Video": "動画をダウンロード",
"Or upload video": "または動画をアップロード",
"Youtube Settings": "Youtube 設定",
"Cookies Path": "Cookieファイルのパス",
"LLM Configuration": "LLM設定",
"API_KEY": "APIキー",
"BASE_URL": "ベースURL",
"MODEL": "モデル",
"Openai format, will add /v1/chat/completions automatically": "OpenAI形式、/v1/chat/completionsが自動的に追加されます",
"click to check API validity": "クリックしてAPIの有効性を確認",
"API Key is valid": "APIキーは有効です",
"API Key is invalid": "APIキーが無効です",
"Recog Lang": "認識言語",
"Subtitles Settings": "字幕設定",
"Target Lang": "目標言語",
"Input any language in natural language, as long as llm can understand": "LLMが理解できる限り、自然言語で任意の言語を入力してください",
"Vocal separation enhance": "音声分離強化",
"Burn-in Subtitles": "字幕を焼き付け",
"Whether to burn subtitles into the video, will increase processing time": "字幕を動画に焼き付けるかどうか、処理時間が増加します",
"Video Resolution": "動画解像度",
"Recommended for videos with loud background noise, but will increase processing time": "背景ノイズの大きい動画に推奨されますが、処理時間が増加します",
"Dubbing Settings": "吹き替え設定",
"TTS Method": "TTS方式",
"SiliconFlow API Key": "SiliconFlow APIキー",
"Mode Selection": "モード選択",
"Preset": "プリセット",
"Refer_stable": "安定参照",
"Refer_dynamic": "動的参照",
"OpenAI Voice": "OpenAI音声",
"Fish TTS Character": "Fish TTSキャラクター",
"Azure Voice": "Azure音声",
"Please refer to Github homepage for GPT_SoVITS configuration": "GPT_SoVITSの設定についてはGithubホームページを参照してください",
"SoVITS Character": "SoVITSキャラクター",
"Refer Mode": "参照モード",
"Mode 1: Use provided reference audio only": "モード1:提供された参照音声のみを使用",
"Mode 2: Use first audio from video as reference": "モード2:動画の最初の音声を参照として使用",
"Mode 3: Use each audio from video as reference": "モード3:動画の各音声を参照として使用",
"Configure reference audio mode for GPT-SoVITS": "GPT-SoVITSの参照音声モードを設定",
"Edge TTS Voice": "Edge TTS音声",
"=====NOTE=====": "以下はst.pyの内容です",
"b. Translate and Generate Subtitles": "b. 翻訳と字幕生成",
"This stage includes the following steps:": "このステージには以下の手順が含まれます:",
"WhisperX word-level transcription": "WhisperX単語レベル文字起こし",
"Sentence segmentation using NLP and LLM": "NLPとLLMを使用した文章分割",
"Summarization and multi-step translation": "要約と多段階翻訳",
"Cutting and aligning long subtitles": "長い字幕の切断と整列",
"Generating timeline and subtitles": "タイムラインと字幕の生成",
"Merging subtitles into the video": "字幕を動画に統合",
"Start Processing Subtitles": "字幕処理を開始",
"Download All Srt Files": "すべてのSrtファイルをダウンロード",
"Archive to 'history'": "'history'にアーカイブ",
"Using Whisper for transcription...": "Whisperで文字起こしを実行中...",
"Splitting long sentences...": "長文を分割中...",
"Summarizing and translating...": "要約と翻訳中...",
"Processing and aligning subtitles...": "字幕の処理と整列中...",
"Merging subtitles to video...": "字幕を動画に統合中...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻訳前に一時停止。`output/log/terminology.json`で用語を編集してください。その後、Enterキーを押して続行...",
"Subtitle processing complete! 🎉": "字幕処理完了!🎉",
"c. Dubbing": "c. 吹き替え",
"Generate audio tasks and chunks": "音声タスクとチャンクの生成",
"Extract reference audio": "参照音声の抽出",
"Generate and merge audio files": "音声ファイルの生成と統合",
"Merge final audio into video": "最終音声を動画に統合",
"Start Audio Processing": "音声処理を開始",
"Audio processing is complete! You can check the audio files in the `output` folder.": "音声処理が完了しました!`output`フォルダで音声ファイルを確認できます。",
"Delete dubbing files": "吹き替えファイルを削除",
"Generate audio tasks": "音声タスクを生成",
"Extract refer audio": "参照音声を抽出",
"Generate all audio": "すべての音声を生成",
"Merge full audio": "完全な音声を統合",
"Merge dubbing to the video": "吹き替えを動画に統合",
"Audio processing complete! 🎇": "音声処理完了!🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "VideoLingoへようこそ。問題が発生した場合は、無料のQAエージェントこちらで即座に回答を得ることができます!また、SaaSウェブサイトvideolingo.ioを無料でお試しいただけます!",
"WhisperX Runtime": "WhisperX ランタイム",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "ローカルランタイムは8GB超のGPUが必要、クラウドランタイムは302ai APIキーが必要、elevenlabsランタイムはElevenLabs APIキーが必要です",
"WhisperX 302ai API": "WhisperX 302ai API",
"=====NOTE2=====": "以下はinstall.pyの内容です",
"🚀 Starting Installation": "🚀 インストールを開始",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "PyPIミラーを自動設定しますか?(pypi.orgへのアクセスが困難な場合は推奨)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 NVIDIA GPUを検出、PyTorchのCUDAバージョンをインストール中...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 MacOSを検出、PyTorchのCPUバージョンをインストール中... 注:whisperX文字起こし時に遅くなる可能性があります。",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 NVIDIA GPUが検出されません、PyTorchのCPUバージョンをインストール中... 注:whisperX文字起こし時に遅くなる可能性があります。",
"❌ Failed to install requirements:": "❌ 要件のインストールに失敗:",
"✅ FFmpeg is already installed": "✅ FFmpegはすでにインストールされています",
"❌ FFmpeg not found\n\n": "❌ FFmpegが見つかりません\n\n",
"🛠️ Install using:": "🛠️ インストール方法:",
"💡 Note:": "💡 注意:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 FFmpegをインストールした後、このインストーラーを再度実行してください:",
"Install Chocolatey first (https://chocolatey.org/)": "最初にChocolateyをインストールしてください (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "最初にHomebrewをインストールしてください (https://brew.sh/)",
"Use your distribution's package manager": "お使いのディストリビューションのパッケージマネージャーを使用してください",
"FFmpeg is required. Please install it and run the installer again.": "FFmpegが必要です。インストールして、インストーラーを再度実行してください。",
"Installation completed": "インストール完了",
"Now I will run this command to start the application:": "次のコマンドでアプリケーションを起動します:",
"Note: First startup may take up to 1 minute": "注:初回起動には最大1分かかる場合があります",
"If the application fails to start:": "アプリケーションが起動しない場合:",
"Check your network connection": "ネットワーク接続を確認してください",
"Re-run the installer: [bold]python install.py[/bold]": "インストーラーを再実行: [bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "依存関係を `pip install -r requirements.txt` でインストール中",
"Detected NVIDIA GPU(s)": "NVIDIA GPUを検出しました",
"No NVIDIA GPU detected": "NVIDIA GPUが検出されません",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "NVIDIA GPUが検出されないか、NVIDIAドライバーが正しくインストールされていません",
"LLM JSON Format Support": "LLM JSON形式サポート",
"Enable if your LLM supports JSON mode output": "LLMがJSON出力モードをサポートしている場合に有効化"
}
================================================
FILE: translations/ru.json
================================================
{
"a. Download or Upload Video": "a. Скачать или загрузить видео",
"Delete and Reselect": "Удалить и выбрать заново",
"Enter YouTube link:": "Введите ссылку YouTube:",
"Resolution": "Разрешение",
"Download Video": "Скачать видео",
"Or upload video": "Или загрузить видео",
"Youtube Settings": "Настройки Youtube",
"Cookies Path": "Путь к файлу Cookies",
"LLM Configuration": "Настройка LLM",
"API_KEY": "API-ключ",
"BASE_URL": "Базовый URL",
"MODEL": "Модель",
"Openai format, will add /v1/chat/completions automatically": "Формат OpenAI, /v1/chat/completions добавится автоматически",
"click to check API validity": "нажмите для проверки API",
"API Key is valid": "API-ключ действителен",
"API Key is invalid": "API-ключ недействителен",
"Recog Lang": "Язык распознавания",
"Subtitles Settings": "Настройки субтитров",
"Target Lang": "Целевой язык",
"Input any language in natural language, as long as llm can understand": "Введите любой язык на естественном языке, главное чтобы LLM мог понять",
"Vocal separation enhance": "Улучшение отделения голоса",
"Burn-in Subtitles": "Встроить субтитры",
"Whether to burn subtitles into the video, will increase processing time": "Встраивать ли субтитры в видео, это увеличит время обработки",
"Video Resolution": "Разрешение видео",
"Recommended for videos with loud background noise, but will increase processing time": "Рекомендуется для видео с громким фоновым шумом, но увеличит время обработки",
"Dubbing Settings": "Настройки дубляжа",
"TTS Method": "Метод TTS",
"SiliconFlow API Key": "API-ключ SiliconFlow",
"Mode Selection": "Выбор режима",
"Preset": "Пресет",
"Refer_stable": "Стабильный эталон",
"Refer_dynamic": "Динамический эталон",
"OpenAI Voice": "Голос OpenAI",
"Fish TTS Character": "Персонаж Fish TTS",
"Azure Voice": "Голос Azure",
"Please refer to Github homepage for GPT_SoVITS configuration": "Обратитесь к домашней странице Github для настройки GPT_SoVITS",
"SoVITS Character": "Персонаж SoVITS",
"Refer Mode": "Режим эталона",
"Mode 1: Use provided reference audio only": "Режим 1: Использовать только предоставленное эталонное аудио",
"Mode 2: Use first audio from video as reference": "Режим 2: Использовать первое аудио из видео как эталон",
"Mode 3: Use each audio from video as reference": "Режим 3: Использовать каждое аудио из видео как эталон",
"Configure reference audio mode for GPT-SoVITS": "Настройка режима эталонного аудио для GPT-SoVITS",
"Edge TTS Voice": "Голос Edge TTS",
"=====NOTE=====": "Содержимое st.py ниже",
"=====NOTE2=====": "Содержимое install.py ниже",
"🚀 Starting Installation": "🚀 Начало установки",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "Нужно ли автоматически настроить зеркала PyPI? (Рекомендуется при проблемах с доступом к pypi.org)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 Обнаружен GPU NVIDIA, установка CUDA версии PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 Обнаружена MacOS, установка CPU версии PyTorch... Примечание: транскрипция whisperX может быть медленной.",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 GPU NVIDIA не обнаружен, установка CPU версии PyTorch... Примечание: транскрипция whisperX может быть медленной.",
"❌ Failed to install requirements:": "❌ Не удалось установить зависимости:",
"✅ FFmpeg is already installed": "✅ FFmpeg уже установлен",
"❌ FFmpeg not found\n\n": "❌ FFmpeg не найден\n\n",
"🛠️ Install using:": "🛠️ Установить используя:",
"💡 Note:": "💡 Примечание:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 После установки FFmpeg, пожалуйста, запустите установщик снова:",
"Install Chocolatey first (https://chocolatey.org/)": "Сначала установите Chocolatey (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "Сначала установите Homebrew (https://brew.sh/)",
"Use your distribution's package manager": "Используйте менеджер пакетов вашего дистрибутива",
"FFmpeg is required. Please install it and run the installer again.": "Требуется FFmpeg. Пожалуйста, установите его и запустите установщик снова.",
"Installation completed": "Установка завершена",
"Now I will run this command to start the application:": "Сейчас я выполню эту команду для запуска приложения:",
"Note: First startup may take up to 1 minute": "Примечание: Первый запуск может занять до 1 минуты",
"If the application fails to start:": "Если приложение не запускается:",
"Check your network connection": "Проверьте подключение к сети",
"Re-run the installer: [bold]python install.py[/bold]": "Перезапустите установщик: [bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "Установка зависимостей с помощью `pip install -r requirements.txt`",
"b. Translate and Generate Subtitles": "b. Перевести и создать субтитры",
"This stage includes the following steps:": "Этот этап включает следующие шаги:",
"WhisperX word-level transcription": "Пословная транскрипция WhisperX",
"Sentence segmentation using NLP and LLM": "Сегментация предложений с помощью NLP и LLM",
"Summarization and multi-step translation": "Обобщение и многоэтапный перевод",
"Cutting and aligning long subtitles": "Разделение и выравнивание длинных субтитров",
"Generating timeline and subtitles": "Создание таймлайна и субтитров",
"Merging subtitles into the video": "Объединение субтитров с видео",
"Start Processing Subtitles": "Начать обработку субтитров",
"Download All Srt Files": "Скачать все Srt файлы",
"Archive to 'history'": "Архивировать в 'history'",
"Using Whisper for transcription...": "Используется Whisper для транскрипции...",
"Splitting long sentences...": "Разделение длинных предложений...",
"Summarizing and translating...": "Обобщение и перевод...",
"Processing and aligning subtitles...": "Обработка и выравнивание субтитров...",
"Merging subtitles to video...": "Объединение субтитров с видео...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ ПАУЗА_ПЕРЕД_ПЕРЕВОДОМ. Перейдите в `output/log/terminology.json` для редактирования терминологии. Затем нажмите ENTER для продолжения...",
"Subtitle processing complete! 🎉": "Обработка субтитров завершена! 🎉",
"c. Dubbing": "c. Дубляж",
"Generate audio tasks and chunks": "Создание аудио задач и фрагментов",
"Extract reference audio": "Извлечение эталонного аудио",
"Generate and merge audio files": "Создание и объединение аудио файлов",
"Merge final audio into video": "Объединение финального аудио с видео",
"Start Audio Processing": "Начать обработку аудио",
"Audio processing is complete! You can check the audio files in the `output` folder.": "Обработка аудио завершена! Вы можете проверить аудио файлы в папке `output`.",
"Delete dubbing files": "Удалить файлы дубляжа",
"Generate audio tasks": "Создать аудио задачи",
"Extract refer audio": "Извлечь эталонное аудио",
"Generate all audio": "Создать все аудио",
"Merge full audio": "Объединить полное аудио",
"Merge dubbing to the video": "Объединить дубляж с видео",
"Audio processing complete! 🎇": "Обработка аудио завершена! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "Здравствуйте, добро пожаловать в VideoLingo. Если у вас возникнут вопросы, вы можете получить мгновенные ответы с помощью нашего бесплатного QA-агента здесь! Вы также можете бесплатно попробовать наш SaaS-сайт videolingo.io!",
"WhisperX Runtime": "Среда выполнения WhisperX",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "Локальная среда требует GPU >8ГБ, облачная среда требует API-ключ 302ai, среда elevenlabs требует API-ключ ElevenLabs",
"WhisperX 302ai API": "API 302ai WhisperX",
"Detected NVIDIA GPU(s)": "Обнаружен(ы) GPU NVIDIA",
"No NVIDIA GPU detected": "GPU NVIDIA не обнаружен",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "GPU NVIDIA не обнаружен или драйверы NVIDIA установлены неправильно",
"LLM JSON Format Support": "Поддержка формата JSON для LLM",
"Enable if your LLM supports JSON mode output": "Включите, если ваш LLM поддерживает вывод в формате JSON"
}
================================================
FILE: translations/translations.py
================================================
import json
# Display label -> language code. Each code matches the base name of a
# translations/<code>.json file, which is how load_translations() finds it.
DISPLAY_LANGUAGES = {
    "🇬🇧 English": "en",
    "🇨🇳 简体中文": "zh-CN",
    # The Traditional Chinese label is written in Traditional characters.
    "🇭🇰 繁體中文": "zh-HK",
    "🇯🇵 日本語": "ja",
    "🇪🇸 Español": "es",
    "🇷🇺 Русский": "ru",
    "🇫🇷 Français": "fr",
}
# Read one translation table from disk.
def load_translations(language="en"):
    """Parse and return the contents of ``translations/<language>.json``.

    The path is relative to the current working directory, so the caller is
    expected to run from the directory that contains ``translations/``.

    Args:
        language: Language code used as the JSON file's base name.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: The file does not exist or cannot be read.
        json.JSONDecodeError: The file is not valid JSON.
    """
    table_path = f'translations/{language}.json'
    with open(table_path, 'r', encoding='utf-8') as handle:
        table = json.load(handle)
    return table
# Function to fetch the translation
def translate(key):
    """Return the translation of ``key`` for the configured display language.

    The language code is read from the ``display_language`` config entry and
    the matching table is loaded through ``load_translations`` on every call.

    Args:
        key: Source string used as the lookup key.

    Returns:
        The translated string. ``key`` is returned unchanged when the table
        has no entry for it (a warning is printed) or when reading the config
        or the translation table fails.
    """
    # Resolved at call time; an import failure propagates to the caller.
    from core.utils.config_utils import load_key

    try:
        display_language = load_key("display_language")
        translations = load_translations(display_language)
        translation = translations.get(key)
        if translation is None:
            print(f"Warning: Translation not found for key '{key}' in language '{display_language}'")
            return key
        return translation
    except Exception:
        # Best-effort lookup: any ordinary failure falls back to the source
        # string. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return key
================================================
FILE: translations/zh-CN.json
================================================
{
"a. Download or Upload Video": "a. 下载或上传视频",
"Delete and Reselect": "删除并重新选择",
"Enter YouTube link:": "输入YouTube链接:",
"Resolution": "分辨率",
"Download Video": "下载视频",
"Or upload video": "或上传视频",
"Youtube Settings": "Youtube设置",
"Cookies Path": "Cookies文件路径",
"LLM Configuration": "LLM配置",
"API_KEY": "API密钥",
"BASE_URL": "BASE_URL",
"MODEL": "模型",
"Openai format, will add /v1/chat/completions automatically": "OpenAI格式,将自动添加/v1/chat/completions",
"click to check API validity": "点击检查API有效性",
"API Key is valid": "API密钥有效",
"API Key is invalid": "API密钥无效",
"Recog Lang": "识别语言",
"Subtitles Settings": "字幕设置",
"Target Lang": "目标语言",
"Input any language in natural language, as long as llm can understand": "用自然语言输入任何语言,只要LLM能理解即可",
"Vocal separation enhance": "人声分离增强",
"Burn-in Subtitles": "烧录字幕",
"Whether to burn subtitles into the video, will increase processing time": "是否将字幕烧录到视频中,会增加处理时间",
"Video Resolution": "视频分辨率",
"Recommended for videos with loud background noise, but will increase processing time": "推荐用于背景噪音较大的视频,但会增加处理时间",
"Dubbing Settings": "配音设置",
"TTS Method": "TTS方法",
"SiliconFlow API Key": "SiliconFlow API密钥",
"Mode Selection": "模式选择",
"Preset": "预设",
"Refer_stable": "稳定参考",
"Refer_dynamic": "动态参考",
"OpenAI Voice": "OpenAI语音",
"Fish TTS Character": "Fish TTS角色",
"Azure Voice": "Azure语音",
"Please refer to Github homepage for GPT_SoVITS configuration": "请参考Github主页了解GPT_SoVITS配置",
"SoVITS Character": "SoVITS角色",
"Refer Mode": "参考模式",
"Mode 1: Use provided reference audio only": "模式1:仅使用提供的参考音频",
"Mode 2: Use first audio from video as reference": "模式2:使用视频中的第一段音频作为参考",
"Mode 3: Use each audio from video as reference": "模式3:使用视频中的每段音频作为参考",
"Configure reference audio mode for GPT-SoVITS": "配置GPT-SoVITS的参考音频模式",
"Edge TTS Voice": "Edge TTS语音",
"=====NOTE=====": "以下是st.py中的内容",
"b. Translate and Generate Subtitles": "b. 翻译并生成字幕",
"This stage includes the following steps:": "此阶段包含以下步骤:",
"WhisperX word-level transcription": "WhisperX词级转录",
"Sentence segmentation using NLP and LLM": "使用NLP和LLM进行句子分段",
"Summarization and multi-step translation": "摘要和多步翻译",
"Cutting and aligning long subtitles": "切割和对齐长字幕",
"Generating timeline and subtitles": "生成时间轴和字幕",
"Merging subtitles into the video": "将字幕合并到视频中",
"Start Processing Subtitles": "开始处理字幕",
"Download All Srt Files": "下载所有Srt文件",
"Archive to 'history'": "归档到'history'",
"Using Whisper for transcription...": "正在使用Whisper进行转录...",
"Splitting long sentences...": "正在分割长句...",
"Summarizing and translating...": "正在总结和翻译...",
"Processing and aligning subtitles...": "正在处理和对齐字幕...",
"Merging subtitles to video...": "正在将字幕合并到视频...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻译前暂停。请前往`output/log/terminology.json`编辑术语。然后按回车键继续...",
"Subtitle processing complete! 🎉": "字幕处理完成! 🎉",
"c. Dubbing": "c. 配音",
"Generate audio tasks and chunks": "生成音频任务和分块",
"Extract reference audio": "提取参考音频",
"Generate and merge audio files": "生成和合并音频文件",
"Merge final audio into video": "将最终音频合并到视频中",
"Start Audio Processing": "开始音频处理",
"Audio processing is complete! You can check the audio files in the `output` folder.": "音频处理完成!您可以在`output`文件夹中查看音频文件。",
"Delete dubbing files": "删除配音文件",
"Generate audio tasks": "生成音频任务",
"Extract refer audio": "提取参考音频",
"Generate all audio": "生成所有音频",
"Merge full audio": "合并完整音频",
"Merge dubbing to the video": "将配音合并到视频中",
"Audio processing complete! 🎇": "音频处理完成! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "欢迎来到VideoLingo。如果遇到任何问题,随时可以通过我们的免费问答助手 here 获取即时解答!还可以免费试用我们的SaaS网站 videolingo.io!",
"WhisperX Runtime": "WhisperX 运行环境",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "本地运行需要>8GB显存GPU,云端运行需要302ai API密钥,elevenlabs运行需要ElevenLabs API密钥",
"WhisperX 302ai API": "WhisperX 302ai API密钥",
"=====NOTE2=====": "以下是install.py中的内容",
"🚀 Starting Installation": "🚀 开始安装",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "是否需要自动配置PyPI镜像?(如果访问pypi.org困难,建议使用)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 检测到NVIDIA GPU,正在安装CUDA版本的PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 检测到MacOS,正在安装CPU版本的PyTorch... 注意:在whisperX转录过程中可能会较慢。",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 未检测到NVIDIA GPU,正在安装CPU版本的PyTorch... 注意:在whisperX转录过程中可能会较慢。",
"❌ Failed to install requirements:": "❌ 安装依赖失败:",
"✅ FFmpeg is already installed": "✅ FFmpeg已安装",
"❌ FFmpeg not found\n\n": "❌ 未找到FFmpeg\n\n",
"🛠️ Install using:": "🛠️ 使用以下命令安装:",
"💡 Note:": "💡 注意:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 安装FFmpeg后,请重新运行此安装程序:",
"Install Chocolatey first (https://chocolatey.org/)": "请先安装Chocolatey (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "请先安装Homebrew (https://brew.sh/)",
"Use your distribution's package manager": "使用您的发行版包管理器",
"FFmpeg is required. Please install it and run the installer again.": "需要安装FFmpeg。请安装后重新运行安装程序。",
"Installation completed": "安装完成",
"Now I will run this command to start the application:": "现在我将运行以下命令启动应用:",
"Note: First startup may take up to 1 minute": "注意:首次启动可能需要最多1分钟",
"If the application fails to start:": "如果应用启动失败:",
"Check your network connection": "检查网络连接",
"Re-run the installer: [bold]python install.py[/bold]": "重新运行安装程序:[bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "正在使用 `pip install -r requirements.txt` 安装依赖",
"Detected NVIDIA GPU(s)": "检测到NVIDIA GPU",
"No NVIDIA GPU detected": "未检测到NVIDIA GPU",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "未检测到NVIDIA GPU或NVIDIA驱动未正确安装",
"LLM JSON Format Support": "LLM JSON格式支持",
"Enable if your LLM supports JSON mode output": "如果选用的LLM支持JSON模式输出,请启用"
}
================================================
FILE: translations/zh-HK.json
================================================
{
"a. Download or Upload Video": "a. 下載或上傳影片",
"Delete and Reselect": "刪除並重新選擇",
"Enter YouTube link:": "輸入YouTube連結:",
"Resolution": "解析度",
"Download Video": "下載影片",
"Or upload video": "或上傳影片",
"Youtube Settings": "Youtube設定",
"Cookies Path": "Cookies檔案路徑",
"LLM Configuration": "LLM設定",
"API_KEY": "API金鑰",
"BASE_URL": "BASE_URL",
"MODEL": "模型",
"Openai format, will add /v1/chat/completions automatically": "OpenAI格式,將自動添加/v1/chat/completions",
"click to check API validity": "點擊檢查API有效性",
"API Key is valid": "API金鑰有效",
"API Key is invalid": "API金鑰無效",
"Recog Lang": "識別語言",
"Subtitles Settings": "字幕設定",
"Target Lang": "目標語言",
"Input any language in natural language, as long as llm can understand": "用自然語言輸入任何語言,只要LLM能理解即可",
"Vocal separation enhance": "人聲分離增強",
"Burn-in Subtitles": "燒錄字幕",
"Whether to burn subtitles into the video, will increase processing time": "是否將字幕燒錄到影片中,會增加處理時間",
"Video Resolution": "影片解析度",
"Recommended for videos with loud background noise, but will increase processing time": "建議用於背景噪音較大的影片,但會增加處理時間",
"Dubbing Settings": "配音設定",
"TTS Method": "TTS方法",
"SiliconFlow API Key": "SiliconFlow API金鑰",
"Mode Selection": "模式選擇",
"Preset": "預設",
"Refer_stable": "穩定參考",
"Refer_dynamic": "動態參考",
"OpenAI Voice": "OpenAI語音",
"Fish TTS Character": "Fish TTS角色",
"Azure Voice": "Azure語音",
"Please refer to Github homepage for GPT_SoVITS configuration": "請參考Github主頁了解GPT_SoVITS設定",
"SoVITS Character": "SoVITS角色",
"Refer Mode": "參考模式",
"Mode 1: Use provided reference audio only": "模式1:僅使用提供的參考音頻",
"Mode 2: Use first audio from video as reference": "模式2:使用影片中的第一段音頻作為參考",
"Mode 3: Use each audio from video as reference": "模式3:使用影片中的每段音頻作為參考",
"Configure reference audio mode for GPT-SoVITS": "配置GPT-SoVITS的參考音頻模式",
"Edge TTS Voice": "Edge TTS語音",
"=====NOTE=====": "以下是st.py中的內容",
"b. Translate and Generate Subtitles": "b. 翻譯並生成字幕",
"This stage includes the following steps:": "此階段包含以下步驟:",
"WhisperX word-level transcription": "WhisperX詞級轉錄",
"Sentence segmentation using NLP and LLM": "使用NLP和LLM進行句子分段",
"Summarization and multi-step translation": "摘要和多步翻譯",
"Cutting and aligning long subtitles": "切割和對齊長字幕",
"Generating timeline and subtitles": "生成時間軸和字幕",
"Merging subtitles into the video": "將字幕合併到影片中",
"Start Processing Subtitles": "開始處理字幕",
"Download All Srt Files": "下載所有Srt檔案",
"Archive to 'history'": "歸檔到'history'",
"Using Whisper for transcription...": "正在使用Whisper進行轉錄...",
"Splitting long sentences...": "正在分割長句...",
"Summarizing and translating...": "正在總結和翻譯...",
"Processing and aligning subtitles...": "正在處理和對齊字幕...",
"Merging subtitles to video...": "正在將字幕合併到影片中...",
"⚠️ PAUSE_BEFORE_TRANSLATE. Go to `output/log/terminology.json` to edit terminology. Then press ENTER to continue...": "⚠️ 翻譯前暫停。請前往`output/log/terminology.json`編輯術語。然後按Enter鍵繼續...",
"Subtitle processing complete! 🎉": "字幕處理完成! 🎉",
"c. Dubbing": "c. 配音",
"Generate audio tasks and chunks": "生成音頻任務和分塊",
"Extract reference audio": "提取參考音頻",
"Generate and merge audio files": "生成和合併音頻檔案",
"Merge final audio into video": "將最終音頻合併到影片中",
"Start Audio Processing": "開始音頻處理",
"Audio processing is complete! You can check the audio files in the `output` folder.": "音頻處理完成!您可以在`output`資料夾中查看音頻檔案。",
"Delete dubbing files": "刪除配音檔案",
"Generate audio tasks": "生成音頻任務",
"Extract refer audio": "提取參考音頻",
"Generate all audio": "生成所有音頻",
"Merge full audio": "合併完整音頻",
"Merge dubbing to the video": "將配音合併到影片中",
"Audio processing complete! 🎇": "音頻處理完成! 🎇",
"Hello, welcome to VideoLingo. If you encounter any issues, feel free to get instant answers with our Free QA Agent here! You can also try out our SaaS website at videolingo.io for free!": "歡迎來到VideoLingo。如果遇到任何問題,隨時可以透過我們的免費問答助手 here 獲取即時解答!還可以免費試用我們的SaaS網站 videolingo.io!",
"WhisperX Runtime": "WhisperX 運行環境",
"Local runtime requires >8GB GPU, cloud runtime requires 302ai API key, elevenlabs runtime requires ElevenLabs API key": "本地運行需要>8GB顯存GPU,雲端運行需要302ai API金鑰,elevenlabs運行需要ElevenLabs API金鑰",
"WhisperX 302ai API": "WhisperX 302ai API金鑰",
"=====NOTE2=====": "以下是install.py中的內容",
"🚀 Starting Installation": "🚀 開始安裝",
"Do you need to auto-configure PyPI mirrors? (Recommended if you have difficulty accessing pypi.org)": "是否需要自動配置PyPI鏡像?(如果訪問pypi.org困難,建議使用)",
"🎮 NVIDIA GPU detected, installing CUDA version of PyTorch...": "🎮 檢測到NVIDIA GPU,正在安裝CUDA版本的PyTorch...",
"🍎 MacOS detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "🍎 檢測到MacOS,正在安裝CPU版本的PyTorch... 注意:在whisperX轉錄過程中可能會較慢。",
"💻 No NVIDIA GPU detected, installing CPU version of PyTorch... Note: it might be slow during whisperX transcription.": "💻 未檢測到NVIDIA GPU,正在安裝CPU版本的PyTorch... 注意:在whisperX轉錄過程中可能會較慢。",
"❌ Failed to install requirements:": "❌ 安裝依賴失敗:",
"✅ FFmpeg is already installed": "✅ FFmpeg已安裝",
"❌ FFmpeg not found\n\n": "❌ 未找到FFmpeg\n\n",
"🛠️ Install using:": "🛠️ 使用以下命令安裝:",
"💡 Note:": "💡 注意:",
"🔄 After installing FFmpeg, please run this installer again:": "🔄 安裝FFmpeg後,請重新運行此安裝程序:",
"Install Chocolatey first (https://chocolatey.org/)": "請先安裝Chocolatey (https://chocolatey.org/)",
"Install Homebrew first (https://brew.sh/)": "請先安裝Homebrew (https://brew.sh/)",
"Use your distribution's package manager": "使用您的發行版套件管理器",
"FFmpeg is required. Please install it and run the installer again.": "需要安裝FFmpeg。請安裝後重新運行安裝程序。",
"Installation completed": "安裝完成",
"Now I will run this command to start the application:": "現在我將運行以下命令啟動應用:",
"Note: First startup may take up to 1 minute": "注意:首次啟動可能需要最多1分鐘",
"If the application fails to start:": "如果應用啟動失敗:",
"Check your network connection": "檢查網絡連接",
"Re-run the installer: [bold]python install.py[/bold]": "重新運行安裝程序:[bold]python install.py[/bold]",
"Installing requirements using `pip install -r requirements.txt`": "正在使用 `pip install -r requirements.txt` 安裝依賴",
"Detected NVIDIA GPU(s)": "檢測到NVIDIA GPU",
"No NVIDIA GPU detected": "未檢測到NVIDIA GPU",
"No NVIDIA GPU detected or NVIDIA drivers not properly installed": "未檢測到NVIDIA GPU或NVIDIA驅動未正確安裝",
"LLM JSON Format Support": "LLM JSON格式支持",
"Enable if your LLM supports JSON mode output": "如果選用的LLM支持JSON模式輸出,請啟用"
}