Repository: linyqh/NarratoAI Branch: main Commit: a6f2e0d815c4 Files: 118 Total size: 844.1 KB Directory structure: gitextract_1wpinv12/ ├── .dockerignore ├── .github/ │ ├── pull_request_template.md │ ├── release-drafter.yml │ └── workflows/ │ ├── auto-release-generator.yml │ ├── codeReview.yml │ └── discord-release-notification.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README-en.md ├── README.md ├── app/ │ ├── __init__.py │ ├── config/ │ │ ├── __init__.py │ │ ├── audio_config.py │ │ ├── config.py │ │ └── ffmpeg_config.py │ ├── models/ │ │ ├── __init__.py │ │ ├── const.py │ │ ├── exception.py │ │ └── schema.py │ ├── services/ │ │ ├── SDE/ │ │ │ └── short_drama_explanation.py │ │ ├── SDP/ │ │ │ ├── generate_script_short.py │ │ │ └── utils/ │ │ │ ├── short_schema.py │ │ │ ├── step1_subtitle_analyzer_openai.py │ │ │ ├── step5_merge_script.py │ │ │ └── utils.py │ │ ├── __init__.py │ │ ├── audio_merger.py │ │ ├── audio_normalizer.py │ │ ├── clip_video.py │ │ ├── generate_narration_script.py │ │ ├── generate_video.py │ │ ├── llm/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── config_validator.py │ │ │ ├── exceptions.py │ │ │ ├── litellm_provider.py │ │ │ ├── manager.py │ │ │ ├── migration_adapter.py │ │ │ ├── providers/ │ │ │ │ └── __init__.py │ │ │ ├── test_litellm_integration.py │ │ │ ├── test_llm_service.py │ │ │ ├── unified_service.py │ │ │ └── validators.py │ │ ├── llm.py │ │ ├── material.py │ │ ├── merger_video.py │ │ ├── prompts/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── documentary/ │ │ │ │ ├── __init__.py │ │ │ │ ├── frame_analysis.py │ │ │ │ └── narration_generation.py │ │ │ ├── exceptions.py │ │ │ ├── manager.py │ │ │ ├── registry.py │ │ │ ├── short_drama_editing/ │ │ │ │ ├── __init__.py │ │ │ │ ├── plot_extraction.py │ │ │ │ └── subtitle_analysis.py │ │ │ ├── short_drama_narration/ │ │ │ │ ├── __init__.py │ │ │ │ ├── plot_analysis.py │ │ │ │ └── script_generation.py │ │ │ ├── template.py │ │ │ └── validators.py │ │ ├── 
script_service.py │ │ ├── state.py │ │ ├── subtitle.py │ │ ├── subtitle_merger.py │ │ ├── subtitle_text.py │ │ ├── task.py │ │ ├── update_script.py │ │ ├── upload_validation.py │ │ ├── video.py │ │ ├── video_service.py │ │ ├── voice.py │ │ └── youtube_service.py │ └── utils/ │ ├── check_script.py │ ├── ffmpeg_utils.py │ ├── gemini_analyzer.py │ ├── gemini_openai_analyzer.py │ ├── qwenvl_analyzer.py │ ├── script_generator.py │ ├── utils.py │ └── video_processor.py ├── config.example.toml ├── docker-compose.yml ├── docker-deploy.sh ├── docker-entrypoint.sh ├── docs/ │ └── voice-list.txt ├── project_version ├── requirements.txt ├── resource/ │ ├── fonts/ │ │ └── fonts_in_here.txt │ ├── public/ │ │ └── index.html │ ├── scripts/ │ │ └── script_in_here.txt │ ├── songs/ │ │ └── song_in_here.txt │ ├── srt/ │ │ └── srt_in_here.txt │ └── videos/ │ └── video_in_here.txt ├── webui/ │ ├── __init__.py │ ├── components/ │ │ ├── __init__.py │ │ ├── audio_settings.py │ │ ├── basic_settings.py │ │ ├── ffmpeg_diagnostics.py │ │ ├── script_settings.py │ │ ├── subtitle_settings.py │ │ ├── system_settings.py │ │ └── video_settings.py │ ├── config/ │ │ └── settings.py │ ├── i18n/ │ │ ├── __init__.py │ │ ├── en.json │ │ └── zh.json │ ├── tools/ │ │ ├── base.py │ │ ├── generate_script_docu.py │ │ ├── generate_script_short.py │ │ └── generate_short_summary.py │ └── utils/ │ ├── cache.py │ ├── file_utils.py │ └── vision_analyzer.py └── webui.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # Git 相关 .git/ .gitignore .gitattributes .svn/ # Python 相关 __pycache__/ *.py[cod] *$py.class *.so .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # 虚拟环境 .env .env.* .venv venv/ ENV/ env.bak/ venv.bak/ # IDE 相关 .vscode/ .idea/ *.swp 
*.swo *~ # 操作系统相关 .DS_Store .DS_Store? ._* .Spotlight-V100 .Trashes ehthumbs.db Thumbs.db # 日志和数据库文件 *.log *.db logs/ # 临时文件 *.tmp *.temp temp/ tmp/ # 存储目录(运行时生成的内容) storage/temp/ storage/tasks/ storage/demo.py # 缓存目录 .cache/ .pytest_cache/ # 文档(保留必要的) docs/ *.md !README.md # Docker 相关文件(避免递归复制) Dockerfile.* docker-compose.*.yml # 配置文件(使用示例配置) config.toml # 资源文件中的大文件 resource/videos/ resource/songs/ # 测试文件 tests/ test_* *_test.py # 其他不必要的文件 *.bak *.orig *.rej ================================================ FILE: .github/pull_request_template.md ================================================ ## PR 类型 请选择一个适当的标签(必选其一): - [ ] 破坏性变更 (breaking) - [ ] 安全修复 (security) - [ ] 新功能 (feature) - [ ] Bug修复 (bug) - [ ] 代码重构 (refactor) - [ ] 依赖升级 (upgrade) - [ ] 文档更新 (docs) - [ ] 翻译相关 (lang-all) - [ ] 内部改进 (internal) ## 描述 ## 相关 Issue ## 更改内容 - xxx - xxx - xxx ## 测试 - [ ] 单元测试 - [ ] 集成测试 - [ ] 手动测试 ## 截图(如果适用) ## 检查清单 - [ ] 我的代码遵循项目的代码风格 - [ ] 我已经添加了必要的测试 - [ ] 我已经更新了相关文档 - [ ] 我的更改不会引入新的警告 - [ ] PR 标题清晰描述了更改内容 ## 补充说明 ================================================ FILE: .github/release-drafter.yml ================================================ name-template: 'v$RESOLVED_VERSION' tag-template: 'v$RESOLVED_VERSION' categories: - title: '🚀 新功能' labels: - 'feature' - 'enhancement' - title: '🐛 Bug 修复' labels: - 'fix' - 'bug' - title: '🧰 维护' labels: - 'chore' - 'maintenance' - title: '📚 文档' labels: - 'docs' - 'documentation' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' version-resolver: major: labels: - 'major' - 'breaking' minor: labels: - 'minor' - 'feature' patch: labels: - 'patch' - 'fix' - 'bug' - 'maintenance' default: patch template: | ## 更新内容 $CHANGES ## 贡献者 $CONTRIBUTORS ================================================ FILE: .github/workflows/auto-release-generator.yml ================================================ name: Auto Release Generator on: push: branches: - main paths: - 'project_version' # 确保路径准确,不使用通配符 jobs: check-version-and-release: runs-on: 
ubuntu-latest permissions: contents: write # 用于创建 releases pull-requests: write # 可能需要的额外权限 steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 # 获取完整历史以检查变更 - name: Debug Environment run: | echo "工作目录内容:" ls -la echo "project_version 文件内容:" cat project_version || echo "文件不存在" - name: Check if version changed id: check-version run: | # 获取当前版本号 if [ -f "project_version" ]; then CURRENT_VERSION=$(cat project_version) echo "Current version: $CURRENT_VERSION" # 获取上一个提交中的版本号 git fetch origin main if git show HEAD~1:project_version &>/dev/null; then PREVIOUS_VERSION=$(git show HEAD~1:project_version) echo "Previous version from commit: $PREVIOUS_VERSION" if [[ "$CURRENT_VERSION" != "$PREVIOUS_VERSION" ]]; then echo "Version changed from $PREVIOUS_VERSION to $CURRENT_VERSION" echo "version_changed=true" >> $GITHUB_OUTPUT echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT else echo "Version unchanged" echo "version_changed=false" >> $GITHUB_OUTPUT fi else echo "Cannot find previous version, assuming first release" echo "version_changed=true" >> $GITHUB_OUTPUT echo "current_version=$CURRENT_VERSION" >> $GITHUB_OUTPUT fi else echo "project_version file not found" echo "version_changed=false" >> $GITHUB_OUTPUT fi - name: Set up Python if: steps.check-version.outputs.version_changed == 'true' uses: actions/setup-python@v4 with: python-version: '3.10' - name: Install OpenAI SDK if: steps.check-version.outputs.version_changed == 'true' run: pip install openai - name: Get commits since last release if: steps.check-version.outputs.version_changed == 'true' id: get-commits run: | # 直接获取最近10个提交 echo "Getting last 13 commits" COMMITS=$(git log -13 --pretty=format:"%s") echo "Commits to be included in release notes:" echo "$COMMITS" echo "commits<> $GITHUB_OUTPUT echo "$COMMITS" >> $GITHUB_OUTPUT echo "EOF" >> $GITHUB_OUTPUT - name: Generate release notes with AI if: steps.check-version.outputs.version_changed == 'true' id: generate-notes env: 
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_BASE_URL: https://api.siliconflow.cn/v1 CURRENT_VERSION: ${{ steps.check-version.outputs.current_version }} run: | cat > generate_release_notes.py << 'EOF' import os import sys from openai import OpenAI # 设置OpenAI客户端 client = OpenAI( api_key=os.environ.get("OPENAI_API_KEY"), base_url=os.environ.get("OPENAI_BASE_URL") ) # 获取提交信息和版本号 commits = sys.stdin.read() version = os.environ.get("CURRENT_VERSION") # 调用API生成发布说明 try: response = client.chat.completions.create( model="deepseek-ai/DeepSeek-V3", messages=[ {"role": "system", "content": "你是一个专业的软件发布说明生成助手。请根据提供的git提交信息,生成一个结构化的发布说明,包括新功能、改进、修复的bug等类别。使用中文回复。"}, {"role": "user", "content": f"请根据以下git提交信息,生成一个版本{version}的发布说明,内容详细且完整,相似的提交信息不要重复出现: \n\n{commits}"} ], temperature=0.7, ) release_notes = response.choices[0].message.content print(f"commits: \n{commits}") print(f"大模型总结的发布说明: \n{release_notes}") except Exception as e: print(f"Error calling OpenAI API: {e}") release_notes = f"# 版本 {version} 发布\n\n## 更新内容\n\n" # 简单处理提交信息 for line in commits.strip().split("\n"): if line: release_notes += f"- {line}\n" # 输出生成的发布说明 print(release_notes) # 保存到GitHub输出 with open(os.environ.get("GITHUB_OUTPUT"), "a") as f: f.write("release_notes< send_discord_notification.py << 'EOF' import os import sys import json from openai import OpenAI import requests from datetime import datetime from discord_webhook import DiscordWebhook, DiscordEmbed # 设置OpenAI客户端 client = OpenAI( api_key=os.environ.get("OPENAI_API_KEY"), base_url=os.environ.get("OPENAI_BASE_URL") ) # 获取GitHub release信息 github_token = os.environ.get("GITHUB_TOKEN") repo = os.environ.get("GITHUB_REPOSITORY") # 直接从GitHub API获取最新release headers = {"Authorization": f"token {github_token}"} response = requests.get(f"https://api.github.com/repos/{repo}/releases/latest", headers=headers) if response.status_code != 200: print(f"Error fetching release info: {response.status_code}") print(response.text) sys.exit(1) release_info = 
response.json() # 提取需要的信息 release_notes = release_info.get("body", "无发布说明") version = release_info.get("tag_name", "未知版本") # 安全地解析发布日期 published_at = release_info.get("published_at") if published_at: try: release_date = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ").strftime("%Y年%m月%d日") except ValueError: release_date = "未知日期" else: release_date = "未知日期" # 使用大模型润色发布说明 try: response = client.chat.completions.create( model="deepseek-ai/DeepSeek-V3", messages=[ {"role": "system", "content": "你是一个专业的软件发布公告优化助手。请优化以下发布说明,使其更加生动、专业,并明确区分新功能、优化内容、修复内容和移除内容等类别。保持原有信息的完整性,同时增强可读性和专业性。使用中文回复。\n\n重要:Discord不支持复杂的Markdown格式,因此请使用简单的格式化:\n1. 使用**粗体**和*斜体*而不是Markdown标题\n2. 使用简单的列表符号(•)而不是Markdown列表\n3. 避免使用#、##等标题格式\n4. 不要使用表格、代码块等复杂格式\n5. 确保段落之间有空行\n6. 使用简单的分隔符(如 ------)来分隔不同部分"}, {"role": "user", "content": f"请优化以下版本{version}的发布说明,使其更适合在Discord社区发布。请记住Discord不支持复杂的Markdown格式,所以使用简单的格式化方式:\n\n{release_notes}"} ], temperature=0.7, ) enhanced_notes = response.choices[0].message.content print(f"大模型润色后的发布说明: \n{enhanced_notes}") except Exception as e: print(f"Error calling OpenAI API: {e}") enhanced_notes = release_notes # 如果API调用失败,使用原始发布说明 # 创建Discord消息 webhook_url = os.environ.get("DISCORD_WEBHOOK_URL") if not webhook_url: print("Error: DISCORD_WEBHOOK_URL not set") sys.exit(1) webhook = DiscordWebhook(url=webhook_url) # 创建嵌入式消息 embed = DiscordEmbed( title=f"🚀 NarratoAI {version} 发布公告", description=f"发布日期: {release_date}", color="5865F2" # Discord蓝色 ) # 处理发布说明,确保不超过Discord的字段限制 # Discord字段值限制为1024个字符 MAX_FIELD_LENGTH = 1024 # 如果内容很短,直接添加 if enhanced_notes and len(enhanced_notes) <= MAX_FIELD_LENGTH: embed.add_embed_field(name="📋 更新内容", value=enhanced_notes) elif enhanced_notes: # 尝试按段落或明显的分隔符分割内容 sections = [] # 检查是否有明显的新功能、优化、修复等部分 if "**新增功能**" in enhanced_notes or "**新功能**" in enhanced_notes: parts = enhanced_notes.split("**新增功能**", 1) if len(parts) > 1: intro = parts[0].strip() if intro: sections.append(("📋 更新概述", intro)) rest = "**新增功能**" + parts[1] # 进一步分割剩余部分 
feature_end = -1 for marker in ["**优化内容**", "**性能优化**", "**修复内容**", "**bug修复**", "**问题修复**"]: pos = rest.lower().find(marker.lower()) if pos != -1 and (feature_end == -1 or pos < feature_end): feature_end = pos if feature_end != -1: sections.append(("✨ 新增功能", rest[:feature_end].strip())) rest = rest[feature_end:] else: sections.append(("✨ 新增功能", rest.strip())) rest = "" # 继续分割剩余部分 if rest: optimize_end = -1 for marker in ["**修复内容**", "**bug修复**", "**问题修复**"]: pos = rest.lower().find(marker.lower()) if pos != -1 and (optimize_end == -1 or pos < optimize_end): optimize_end = pos if optimize_end != -1: sections.append(("⚡ 优化内容", rest[:optimize_end].strip())) sections.append(("🔧 修复内容", rest[optimize_end:].strip())) else: sections.append(("⚡ 优化内容", rest.strip())) else: # 如果没有明显的结构,按长度分割 chunks = [enhanced_notes[i:i+MAX_FIELD_LENGTH] for i in range(0, len(enhanced_notes), MAX_FIELD_LENGTH)] for i, chunk in enumerate(chunks): if i == 0: sections.append(("📋 更新内容", chunk)) else: sections.append((f"📋 更新内容(续{i})", chunk)) # 添加所有部分到embed for name, content in sections: if len(content) > MAX_FIELD_LENGTH: # 如果单个部分仍然过长,进一步分割 sub_chunks = [content[i:i+MAX_FIELD_LENGTH] for i in range(0, len(content), MAX_FIELD_LENGTH)] for i, chunk in enumerate(sub_chunks): if i == 0: embed.add_embed_field(name=name, value=chunk) else: embed.add_embed_field(name=f"{name}(续{i})", value=chunk) else: embed.add_embed_field(name=name, value=content) else: embed.add_embed_field(name="📋 更新内容", value="无详细更新内容") # 添加下载链接 html_url = release_info.get("html_url", "") if html_url: embed.add_embed_field(name="📥 下载链接", value=html_url, inline=False) # 设置页脚 embed.set_footer(text=f"NarratoAI 团队 • {release_date}") embed.set_timestamp() # 添加嵌入式消息到webhook webhook.add_embed(embed) # 发送消息 response = webhook.execute() if response: print(f"Discord notification sent with status code: {response.status_code}") else: print("Failed to send Discord notification") EOF # 执行脚本 python send_discord_notification.py 
================================================ FILE: .gitignore ================================================ .DS_Store /config.toml /storage/ /.idea/ /app/services/__pycache__ /app/__pycache__/ /app/config/__pycache__/ /app/models/__pycache__/ /app/utils/__pycache__/ /*/__pycache__/* .vscode /**/.streamlit __pycache__ logs/ node_modules # VuePress 默认临时文件目录 /sites/docs/.vuepress/.temp # VuePress 默认缓存目录 /sites/docs/.vuepress/.cache # VuePress 默认构建生成的静态文件目录 /sites/docs/.vuepress/dist # 模型目录 /models/ ./models/* resource/scripts/*.json resource/videos/*.mp4 resource/songs/*.mp3 resource/songs/*.flac resource/fonts/*.ttc resource/fonts/*.ttf resource/fonts/*.otf resource/srt/*.srt app/models/faster-whisper-large-v2/* app/models/faster-whisper-large-v3/* app/models/bert/* bug清单.md task.md .claude/* .serena/* # OpenSpec: 忽略活动的变更提案,但保留归档和规范 openspec/* AGENTS.md CLAUDE.md tests/* ================================================ FILE: Dockerfile ================================================ # 多阶段构建 - 构建阶段 FROM python:3.12-slim-bookworm AS builder # 设置构建参数 ARG DEBIAN_FRONTEND=noninteractive # 设置工作目录 WORKDIR /build # 安装构建依赖 RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ git \ git-lfs \ pkg-config \ && rm -rf /var/lib/apt/lists/* # 升级 pip 并创建虚拟环境 RUN python -m pip install --upgrade pip setuptools wheel && \ python -m venv /opt/venv # 激活虚拟环境 ENV PATH="/opt/venv/bin:$PATH" # 复制 requirements.txt 并使用镜像安装 Python 依赖 COPY requirements.txt . 
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt # 运行阶段 FROM python:3.12-slim-bookworm # 设置运行参数 ARG DEBIAN_FRONTEND=noninteractive # 设置工作目录 WORKDIR /NarratoAI # 从构建阶段复制虚拟环境 COPY --from=builder /opt/venv /opt/venv # 设置环境变量 ENV PATH="/opt/venv/bin:$PATH" \ PYTHONPATH="/NarratoAI" \ PYTHONUNBUFFERED=1 \ PYTHONDONTWRITEBYTECODE=1 \ PYTHONIOENCODING=utf-8 \ LANG=C.UTF-8 \ LC_ALL=C.UTF-8 # 一次性安装所有依赖、创建用户、配置系统,减少层级 RUN apt-get update && apt-get install -y --no-install-recommends \ imagemagick \ ffmpeg \ wget \ curl \ git-lfs \ ca-certificates \ dos2unix \ && sed -i 's/

NarratoAI 😎📽️

An all-in-one AI-powered tool for film commentary and automated video editing.🎬🎞️

📖 English | 简体中文 | 日本語

[//]: # ( harry0703%2FNarratoAI | Trendshift)

NarratoAI is an automated video narration tool that provides an all-in-one solution for script writing, automated video editing, voice-over, and subtitle generation, powered by LLMs to make content creation more efficient.
> **🔥 Highly Recommended: The new paradigm of VibeCut —— [Speclip](https://speclip.com) , a true editing Agent! [👉 Click to download for free](https://speclip.com)** [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI) [![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE) [![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues) [![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers) 💬 Join the open source community to get project updates and the latest news.

🎉🎉🎉 Official Documentation 🎉🎉🎉

Home

![](docs/index-en.png)

Video Review Interface

![](docs/check-en.png) ## Latest News - 2025.05.11 Released new version 0.6.0, supports **short drama commentary** and optimized editing process - 2025.03.06 Released new version 0.5.2, supports DeepSeek R1 and DeepSeek V3 models for short drama mixing - 2024.12.16 Released new version 0.3.9, supports Alibaba Qwen2-VL model for video understanding; supports short drama mixing - 2024.11.24 Opened Discord community: https://discord.com/invite/V2pbAqqQNb - 2024.11.11 Migrated open source community, welcome to join! [Join the official community](https://github.com/linyqh/NarratoAI/wiki) - 2024.11.10 Released official documentation, details refer to [Official Documentation](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg) - 2024.11.10 Released new version v0.3.5; optimized video editing process, ## Major Benefits 🎉 From now on, fully support DeepSeek model! Register to enjoy 20 million free tokens (worth 14 yuan platform quota), editing a 10-minute video only costs 0.1 yuan! 🔥 Quick benefits: 1️⃣ Click the link to register: https://cloud.siliconflow.cn/i/pyOKqFCV 2️⃣ Log in with your phone number, **be sure to fill in the invitation code: pyOKqFCV** 3️⃣ Receive a 14 yuan quota, experience high cost-effective AI editing quickly! 💡 Low cost, high creativity: Silicon Flow API Key can be integrated with one click, doubling intelligent editing efficiency! (Note: The invitation code is the only proof for benefit collection, automatically credited after registration) Immediately take action to unlock your AI productivity with "pyOKqFCV"! 😊 Update Steps: Integration Package: Click update.bat one-click update script Code Build: Use git pull to fetch the latest code ## Announcement 📢 _**Note⚠️: Recently, someone has been impersonating the author on x (Twitter) to issue tokens on the pump.fun platform! This is a scam!!! Do not be deceived! 
Currently, NarratoAI has not made any official promotions on x (Twitter), please be cautious**_ Below is a screenshot of this person's x (Twitter) homepage Screenshot_20250109_114131_Samsung Internet ## Future Plans 🥳 - [x] Windows Integration Pack Release - [x] Optimized the story generation process and improved the generation effect - [x] Released version 0.3.5 integration package - [x] Support Alibaba Qwen2-VL large model for video understanding - [x] Support short drama mixing - [x] One-click merge materials - [x] One-click transcription - [x] One-click clear cache - [ ] Support exporting to Jianying drafts - [X] Support short drama commentary - [ ] Character face matching - [ ] Support automatic matching based on voiceover, script, and video materials - [ ] Support more TTS engines - [ ] ... ## System Requirements 📦 - Recommended minimum: CPU with 4 cores or more, 8GB RAM or more, GPU is not required - Windows 10/11 or MacOS 11.0 or above - [Python 3.12+](https://www.python.org/downloads/) ## Feedback & Suggestions 📢 👏 1. You can submit [issue](https://github.com/linyqh/NarratoAI/issues) or [pull request](https://github.com/linyqh/NarratoAI/pulls) 💬 2. [Join the open source community exchange group](https://github.com/linyqh/NarratoAI/wiki) 📷 3. Follow the official account [NarratoAI助手] to grasp the latest news ## Reference Projects 📚 - https://github.com/FujiwaraChoki/MoneyPrinter - https://github.com/harry0703/MoneyPrinterTurbo This project was refactored based on the above projects with the addition of video narration features. Thanks to the original authors for their open-source spirit 🥳🥳🥳 ## Buy the Author a Cup of Coffee ☕️
Image 1 Image 2
## License 📝 Click to view [`LICENSE`](LICENSE) file ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date) ================================================ FILE: README.md ================================================

NarratoAI 😎📽️

一站式 AI 影视解说+自动化剪辑工具🎬🎞️

📖 English | 简体中文

[//]: # ( harry0703%2FNarratoAI | Trendshift)

NarratoAI 是一个自动化影视解说工具,基于LLM实现文案撰写、自动化视频剪辑、配音和字幕生成的一站式流程,助力高效内容创作。
> **🔥 隆重推荐:VibeCut 的新范式 —— [Speclip](https://speclip.com) ,一个真正意义上的剪辑 Agent![👉 点击免费下载](https://speclip.com)** [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/linyqh/NarratoAI) [![GitHub license](https://img.shields.io/github/license/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/blob/main/LICENSE) [![GitHub issues](https://img.shields.io/github/issues/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/issues) [![GitHub stars](https://img.shields.io/github/stars/linyqh/NarratoAI?style=for-the-badge)](https://github.com/linyqh/NarratoAI/stargazers) 💬 加入 discord 开源社区,获取项目动态和最新资讯。

🎉🎉🎉 官方文档 🎉🎉🎉

首页

![](docs/index-zh.png)
## 许可证 本项目仅供学习和研究使用,不得商用。如需商业授权,请联系作者。 ## 最新资讯 - 2025.11.20 发布新版本 0.7.5, 新增 [IndexTTS2](https://github.com/index-tts/index-tts) 语音克隆支持 - 2025.10.15 发布新版本 0.7.3, 使用 [LiteLLM](https://github.com/BerriAI/litellm) 管理模型供应商 - 2025.09.10 发布新版本 0.7.2, 新增腾讯云tts - 2025.08.18 发布新版本 0.7.1,支持 **语音克隆** 和 最新大模型 - 2025.05.11 发布新版本 0.6.0,支持 **短剧解说** 和 优化剪辑流程 - 2025.03.06 发布新版本 0.5.2,支持 DeepSeek R1 和 DeepSeek V3 模型进行短剧混剪 - 2024.12.16 发布新版本 0.3.9,支持阿里 Qwen2-VL 模型理解视频;支持短剧混剪 - 2024.11.24 开通 discord 社群:https://discord.com/invite/V2pbAqqQNb - 2024.11.11 迁移开源社群,欢迎加入! [加入官方社群](https://github.com/linyqh/NarratoAI/wiki) - 2024.11.10 发布官方文档,详情参见 [官方文档](https://p9mf6rjv3c.feishu.cn/wiki/SP8swLLZki5WRWkhuFvc2CyInDg) - 2024.11.10 发布新版本 v0.3.5;优化视频剪辑流程, ## 重磅福利 🎉 > 1️⃣ > **开发者专属福利:一站式AI平台,注册即送体验金!** > > 还在为接入各种AI模型烦恼吗?向您推荐 302.AI,一个企业级的AI资源中心。一次接入,即可调用上百种AI模型,涵盖语言、图像、音视频等,按量付费,极大降低开发成本。 > > 通过下方我的专属链接注册,**立获1美元免费体验金**,助您轻松开启AI开发之旅。 > > **立即注册领取:** [https://share.302.ai/I9P6mP](https://share.302.ai/I9P6mP) --- > 2️⃣ > 即日起全面支持硅基流动!注册即享2000万免费Token(价值16元平台配额),剪辑10分钟视频仅需0.1元! > > 🔥 快速领福利: > 1️⃣ 点击链接注册:https://cloud.siliconflow.cn/i/MI9PgHwB > 2️⃣ 使用手机号登录,**务必填写邀请码:MI9PgHwB** > 3️⃣ 领取16元配额,极速体验高性价比AI剪辑 > > 💡 小成本大创作: > 硅基流动API Key一键接入,智能剪辑效率翻倍! > (注:邀请码为福利领取唯一凭证,注册后自动到账) > > 立即行动,用「MI9PgHwB」解锁你的AI生产力! ## ⚠️谨防被骗 📢 _**1. NarratoAI 是一款完全免费的软件,近期在社交媒体(抖音,B站等)上发现,有人将 NarratoAI 改名后售卖,下面是部分截图,请大家务必提高警惕,切勿上当受骗**_ ---
诈骗截图 1 诈骗截图 2 诈骗截图 3 诈骗截图 4
--- ## 未来计划 🥳 - [x] windows 整合包发布 - [x] 优化剧情生成流程,提升生成效果 - [x] 发布 0.3.5 整合包 - [x] 支持阿里 Qwen2-VL 大模型理解视频 - [x] 支持短剧混剪 - [x] 一键合并素材 - [x] 一键转录 - [x] 一键清理缓存 - [ ] 支持导出剪映草稿 - [X] 支持短剧解说 - [ ] 主角人脸匹配 - [ ] 支持根据口播,文案,视频素材自动匹配 - [ ] 支持更多 TTS 引擎 - [ ] ... ## 快速启动 🚀 ### 方式一:macos Docker 部署(macos 推荐) ```bash # 1. 克隆项目 git clone https://github.com/linyqh/NarratoAI.git cd NarratoAI # 2. 一键部署 docker compose up -d # 3. 访问应用 # 浏览器打开 http://localhost:8501 ``` ### 方式二:整合包(Windows 推荐) > *关注微信公众号 **NarratoAI 助手** 右下角菜单栏获取下载链接* ### 方式三:本地运行 ```bash # 1. 克隆项目 git clone https://github.com/linyqh/NarratoAI.git cd NarratoAI # 2. 安装依赖 pip install -r requirements.txt # 3. 复制配置文件 cp config.example.toml config.toml # 4. 编辑 config.toml,配置你的 API 密钥 # 5. 启动应用 streamlit run webui.py --server.maxUploadSize=2048 # 6. 访问应用 # 浏览器打开 http://localhost:8501 ``` ## 配置要求 📦 - 建议最低 CPU 4核或以上,内存 8G 或以上,显卡非必须 - Windows 10/11 或 MacOS 11.0 以上系统 - [Python 3.12+](https://www.python.org/downloads/) ## 反馈建议 📢 👏 1. 可以提交 [issue](https://github.com/linyqh/NarratoAI/issues)或者 [pull request](https://github.com/linyqh/NarratoAI/pulls) 💬 2. [加入开源社区交流群](https://github.com/linyqh/NarratoAI/wiki) 📷 3. 关注公众号【NarratoAI助手】,掌握最新资讯 ## 参考项目 📚 - https://github.com/FujiwaraChoki/MoneyPrinter - https://github.com/harry0703/MoneyPrinterTurbo 该项目基于以上项目重构而来,增加了影视解说功能,感谢大佬的开源精神 🥳🥳🥳 ## 请作者喝一杯咖啡 ☕️
Image 1 Image 2
## 赞助 [![Powered by DartNode](https://dartnode.com/branding/DN-Open-Source-sm.png)](https://dartnode.com "Powered by DartNode - Free VPS for Open Source") ## 许可证 📝 点击查看 [`LICENSE`](LICENSE) 文件 ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=linyqh/NarratoAI&type=Date)](https://star-history.com/#linyqh/NarratoAI&Date) ================================================ FILE: app/__init__.py ================================================ ================================================ FILE: app/config/__init__.py ================================================ import os import sys from loguru import logger from app.config import config from app.utils import utils def __init_logger(): # _log_file = utils.storage_dir("logs/server.log") _lvl = config.log_level root_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__))) ) def format_record(record): # 获取日志记录中的文件全路径 file_path = record["file"].path # 将绝对路径转换为相对于项目根目录的路径 relative_path = os.path.relpath(file_path, root_dir) # 更新记录中的文件路径 record["file"].path = f"./{relative_path}" # 返回修改后的格式字符串 # 您可以根据需要调整这里的格式 _format = ( "{time:%Y-%m-%d %H:%M:%S} | " + "{level} | " + '"{file.path}:{line}": {function} ' + "- {message}" + "\n" ) return _format def log_filter(record): """过滤不必要的日志消息""" # 过滤掉模板注册等 DEBUG 级别的噪音日志 ignore_patterns = [ "已注册模板过滤器", "已注册提示词", "注册视觉模型提供商", "注册文本模型提供商", "LLM服务提供商注册", "FFmpeg支持的硬件加速器", "硬件加速测试优先级", "硬件加速方法", ] # 如果是 DEBUG 级别且包含过滤模式,则不显示 if record["level"].name == "DEBUG": return not any(pattern in record["message"] for pattern in ignore_patterns) return True logger.remove() logger.add( sys.stdout, level=_lvl, format=format_record, colorize=True, filter=log_filter ) # logger.add( # _log_file, # level=_lvl, # format=format_record, # rotation="00:00", # retention="3 days", # backtrace=True, # diagnose=True, # enqueue=True, # ) __init_logger() ================================================ FILE: app/config/audio_config.py 
================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- ''' @Project: NarratoAI @File : audio_config @Author : Viccy同学 @Date : 2025/1/7 @Description: 音频配置管理 ''' from typing import Dict, Any from loguru import logger class AudioConfig: """音频配置管理类""" # 默认音量配置 DEFAULT_VOLUMES = { 'tts_volume': 0.8, # TTS音量稍微降低 'original_volume': 1.3, # 原声音量提高 'bgm_volume': 0.3, # 背景音乐保持较低 } # 音频质量配置 AUDIO_QUALITY = { 'sample_rate': 44100, # 采样率 'channels': 2, # 声道数(立体声) 'bitrate': '128k', # 比特率 } # 音频处理配置 PROCESSING_CONFIG = { 'enable_smart_volume': True, # 启用智能音量调整 'enable_audio_normalization': True, # 启用音频标准化 'target_lufs': -20.0, # 目标响度 (LUFS) 'max_peak': -1.0, # 最大峰值 (dBFS) 'volume_analysis_method': 'lufs', # 音量分析方法: 'lufs' 或 'rms' } # 音频混合配置 MIXING_CONFIG = { 'crossfade_duration': 0.1, # 交叉淡化时长(秒) 'bgm_fade_out': 3.0, # BGM淡出时长(秒) 'dynamic_range_compression': False, # 动态范围压缩 } @classmethod def get_optimized_volumes(cls, video_type: str = 'default') -> Dict[str, float]: """ 根据视频类型获取优化的音量配置 Args: video_type: 视频类型 ('default', 'educational', 'entertainment', 'news') Returns: Dict[str, float]: 音量配置字典 """ base_volumes = cls.DEFAULT_VOLUMES.copy() # 根据视频类型调整音量 if video_type == 'educational': # 教育类视频:突出解说,降低原声 base_volumes.update({ 'tts_volume': 0.9, 'original_volume': 0.8, 'bgm_volume': 0.2, }) elif video_type == 'entertainment': # 娱乐类视频:平衡解说和原声 base_volumes.update({ 'tts_volume': 0.8, 'original_volume': 1.2, 'bgm_volume': 0.4, }) elif video_type == 'news': # 新闻类视频:突出解说,最小化背景音 base_volumes.update({ 'tts_volume': 1.0, 'original_volume': 0.6, 'bgm_volume': 0.1, }) logger.info(f"使用 {video_type} 类型的音量配置: {base_volumes}") return base_volumes @classmethod def get_audio_processing_config(cls) -> Dict[str, Any]: """获取音频处理配置""" return cls.PROCESSING_CONFIG.copy() @classmethod def get_mixing_config(cls) -> Dict[str, Any]: """获取音频混合配置""" return cls.MIXING_CONFIG.copy() @classmethod def validate_volume(cls, volume: float, name: str) -> float: """ 验证和限制音量值 
Args: volume: 音量值 name: 音量名称(用于日志) Returns: float: 验证后的音量值 """ min_volume = 0.0 max_volume = 2.0 # 允许原声超过1.0 if volume < min_volume: logger.warning(f"{name}音量 {volume} 低于最小值 {min_volume},已调整") return min_volume elif volume > max_volume: logger.warning(f"{name}音量 {volume} 超过最大值 {max_volume},已调整") return max_volume return volume @classmethod def apply_volume_profile(cls, profile_name: str) -> Dict[str, float]: """ 应用预设的音量配置文件 Args: profile_name: 配置文件名称 Returns: Dict[str, float]: 音量配置 """ profiles = { 'balanced': { 'tts_volume': 0.8, 'original_volume': 1.2, 'bgm_volume': 0.3, }, 'voice_focused': { 'tts_volume': 1.0, 'original_volume': 0.7, 'bgm_volume': 0.2, }, 'original_focused': { 'tts_volume': 0.7, 'original_volume': 1.5, 'bgm_volume': 0.2, }, 'quiet_background': { 'tts_volume': 0.8, 'original_volume': 1.3, 'bgm_volume': 0.1, } } if profile_name in profiles: logger.info(f"应用音量配置文件: {profile_name}") return profiles[profile_name] else: logger.warning(f"未找到配置文件 {profile_name},使用默认配置") return cls.DEFAULT_VOLUMES.copy() # 全局音频配置实例 audio_config = AudioConfig() def get_recommended_volumes_for_content(content_type: str = 'mixed') -> Dict[str, float]: """ 根据内容类型推荐音量设置 Args: content_type: 内容类型 - 'mixed': 混合内容(默认) - 'voice_only': 纯解说 - 'original_heavy': 原声为主 - 'music_video': 音乐视频 Returns: Dict[str, float]: 推荐的音量配置 """ recommendations = { 'mixed': { 'tts_volume': 0.8, 'original_volume': 1.3, 'bgm_volume': 0.3, }, 'voice_only': { 'tts_volume': 1.0, 'original_volume': 0.5, 'bgm_volume': 0.2, }, 'original_heavy': { 'tts_volume': 0.6, 'original_volume': 1.6, 'bgm_volume': 0.1, }, 'music_video': { 'tts_volume': 0.7, 'original_volume': 1.8, 'bgm_volume': 0.0, # 不添加额外BGM } } return recommendations.get(content_type, recommendations['mixed']) if __name__ == "__main__": # 测试配置 config = AudioConfig() # 测试不同类型的音量配置 for video_type in ['default', 'educational', 'entertainment', 'news']: volumes = config.get_optimized_volumes(video_type) print(f"{video_type}: {volumes}") # 测试配置文件 for profile 
in ['balanced', 'voice_focused', 'original_focused']: volumes = config.apply_volume_profile(profile) print(f"{profile}: {volumes}") ================================================ FILE: app/config/config.py ================================================ import os import socket import toml import shutil from loguru import logger root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) config_file = f"{root_dir}/config.toml" version_file = f"{root_dir}/project_version" def get_version_from_file(): """从project_version文件中读取版本号""" try: if os.path.isfile(version_file): with open(version_file, "r", encoding="utf-8") as f: return f.read().strip() return "0.1.0" # 默认版本号 except Exception as e: logger.error(f"读取版本号文件失败: {str(e)}") return "0.1.0" # 默认版本号 def load_config(): # fix: IsADirectoryError: [Errno 21] Is a directory: '/NarratoAI/config.toml' if os.path.isdir(config_file): shutil.rmtree(config_file) if not os.path.isfile(config_file): example_file = f"{root_dir}/config.example.toml" if os.path.isfile(example_file): shutil.copyfile(example_file, config_file) logger.info(f"copy config.example.toml to config.toml") logger.info(f"load config from file: {config_file}") try: _config_ = toml.load(config_file) except Exception as e: logger.warning(f"load config failed: {str(e)}, try to load as utf-8-sig") with open(config_file, mode="r", encoding="utf-8-sig") as fp: _cfg_content = fp.read() _config_ = toml.loads(_cfg_content) return _config_ def save_config(): with open(config_file, "w", encoding="utf-8") as f: _cfg["app"] = app _cfg["proxy"] = proxy _cfg["azure"] = azure _cfg["tencent"] = tencent _cfg["soulvoice"] = soulvoice _cfg["ui"] = ui _cfg["tts_qwen"] = tts_qwen _cfg["indextts2"] = indextts2 f.write(toml.dumps(_cfg)) _cfg = load_config() app = _cfg.get("app", {}) whisper = _cfg.get("whisper", {}) proxy = _cfg.get("proxy", {}) azure = _cfg.get("azure", {}) tencent = _cfg.get("tencent", {}) soulvoice = _cfg.get("soulvoice", {}) ui = 
@dataclass
class FFmpegProfile:
    """A named bundle of FFmpeg settings that is selected as a unit."""
    name: str  # profile key; matches its key in FFmpegConfigManager.PROFILES
    description: str  # human-readable description returned by list_profiles()
    hwaccel_enabled: bool  # whether hardware acceleration may be used
    hwaccel_type: Optional[str]  # "auto", "nvenc_pure", a literal -hwaccel value, or None
    encoder: str  # encoder name; selects the quality flag in get_extraction_command()
    quality_preset: str
    pixel_format: str  # value passed to -pix_fmt
    additional_args: List[str]  # extra arguments appended verbatim to the command
    compatibility_level: int  # 1-5, where 5 is the most compatible


class FFmpegConfigManager:
    """Selects FFmpeg profiles and builds FFmpeg commands from them."""

    # Predefined profiles, keyed by profile name.
    PROFILES = {
        # High-performance profile (for modern hardware)
        "high_performance": FFmpegProfile(
            name="high_performance",
            description="高性能配置(NVIDIA/AMD 独立显卡)",
            hwaccel_enabled=True,
            hwaccel_type="auto",
            encoder="auto",
            quality_preset="fast",
            pixel_format="yuv420p",
            additional_args=["-preset", "fast"],
            compatibility_level=2
        ),

        # Compatibility profile (for problematic hardware)
        "compatibility": FFmpegProfile(
            name="compatibility",
            description="兼容性配置(解决滤镜链问题)",
            hwaccel_enabled=False,
            hwaccel_type=None,
            encoder="libx264",
            quality_preset="medium",
            pixel_format="yuv420p",
            additional_args=["-preset", "medium", "-crf", "23"],
            compatibility_level=5
        ),

        # Windows + NVIDIA profile
        "windows_nvidia": FFmpegProfile(
            name="windows_nvidia",
            description="Windows NVIDIA 显卡优化配置",
            hwaccel_enabled=True,
            hwaccel_type="nvenc_pure",  # encoder only, avoids hardware-decoding problems
            encoder="h264_nvenc",
            quality_preset="medium",
            pixel_format="yuv420p",
            additional_args=["-preset", "medium", "-cq", "23"],
            compatibility_level=3
        ),

        # macOS profile
        "macos_videotoolbox": FFmpegProfile(
            name="macos_videotoolbox",
            description="macOS VideoToolbox 优化配置",
            hwaccel_enabled=True,
            hwaccel_type="videotoolbox",
            encoder="h264_videotoolbox",
            quality_preset="medium",
            pixel_format="yuv420p",
            additional_args=["-q:v", "65"],
            compatibility_level=3
        ),

        # Generic software profile
        "universal_software": FFmpegProfile(
            name="universal_software",
            description="通用软件编码配置(最高兼容性)",
            hwaccel_enabled=False,
            hwaccel_type=None,
            encoder="libx264",
            quality_preset="medium",
            pixel_format="yuv420p",
            additional_args=["-preset", "medium", "-crf", "23"],
            compatibility_level=5
        )
    }

    @classmethod
    def get_recommended_profile(cls) -> str:
        """
        Recommend a profile name for the current platform and hardware.

        Hardware information comes from
        ``app.utils.ffmpeg_utils.get_ffmpeg_hwaccel_info()``; if that call
        fails, software-only values are assumed and a warning is logged.

        Returns:
            str: A key of ``PROFILES``.
        """
        system = platform.system().lower()

        # Probe hardware acceleration availability.
        try:
            from app.utils import ffmpeg_utils
            hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info()
            hwaccel_available = hwaccel_info.get("available", False)
            hwaccel_type = hwaccel_info.get("type", "software")
            gpu_vendor = hwaccel_info.get("gpu_vendor", "unknown")
        except Exception as e:
            logger.warning(f"无法检测硬件加速信息: {e}")
            hwaccel_available = False
            hwaccel_type = "software"
            gpu_vendor = "unknown"

        # Normalise so "NVIDIA", "Nvidia Corporation" or a None value are
        # handled the same way _get_suggestions() handles them.
        hwaccel_type = str(hwaccel_type or "software").lower()
        gpu_vendor = str(gpu_vendor or "unknown").lower()

        if system == "windows":
            if hwaccel_available and "nvidia" in gpu_vendor:
                return "windows_nvidia"
            if hwaccel_available:
                return "high_performance"
            return "compatibility"

        if system == "darwin":
            if hwaccel_available and hwaccel_type == "videotoolbox":
                return "macos_videotoolbox"
            return "universal_software"

        if system == "linux" and hwaccel_available:
            return "high_performance"

        return "universal_software"

    @classmethod
    def get_profile(cls, profile_name: str) -> FFmpegProfile:
        """
        Look up a profile by name.

        Args:
            profile_name: Key of ``PROFILES``.

        Returns:
            FFmpegProfile: The requested profile, or ``universal_software``
            (with a logged warning) when the name is unknown.
        """
        if profile_name not in cls.PROFILES:
            logger.warning(f"未知的配置文件: {profile_name},使用默认配置")
            profile_name = "universal_software"

        return cls.PROFILES[profile_name]

    @classmethod
    def get_extraction_command(cls,
                               input_path: str,
                               output_path: str,
                               timestamp: float,
                               profile_name: Optional[str] = None) -> List[str]:
        """
        Build the FFmpeg command that extracts one frame at ``timestamp``.

        Args:
            input_path: Input video path.
            output_path: Output image path.
            timestamp: Seek position passed to ``-ss``.
            profile_name: Profile to use; ``None`` selects the recommended one.

        Returns:
            List[str]: The FFmpeg argument list, starting with ``"ffmpeg"``.
        """
        if profile_name is None:
            profile_name = cls.get_recommended_profile()

        profile = cls.get_profile(profile_name)

        cmd = [
            "ffmpeg",
            "-hide_banner",
            "-loglevel", "error",
        ]

        # Hardware acceleration arguments (must precede -i).
        if profile.hwaccel_enabled and profile.hwaccel_type:
            if profile.hwaccel_type == "auto":
                # Best effort: ask ffmpeg_utils for the arguments and fall back
                # to software decoding if detection is unavailable or fails.
                try:
                    from app.utils import ffmpeg_utils
                    cmd.extend(ffmpeg_utils.get_ffmpeg_hwaccel_args())
                except Exception as e:
                    logger.debug(f"自动检测硬件加速参数失败,回退到软件解码: {e}")
            elif profile.hwaccel_type == "nvenc_pure":
                # Pure NVENC encoder: deliberately no hardware decoding.
                pass
            else:
                cmd.extend(["-hwaccel", profile.hwaccel_type])

        # Input: seek first, then read a single frame.
        cmd.extend([
            "-ss", str(timestamp),
            "-i", input_path,
            "-vframes", "1",
        ])

        # Quality flag chosen by encoder; anything unlisted uses "-q:v 2".
        quality_args = {
            "h264_nvenc": ["-cq", "23"],
            "h264_videotoolbox": ["-q:v", "65"],
        }
        cmd.extend(quality_args.get(profile.encoder, ["-q:v", "2"]))

        cmd.extend(["-pix_fmt", profile.pixel_format])

        # NOTE(review): additional_args repeats the quality flag for the
        # nvenc/videotoolbox profiles and carries video-encoder options
        # (-preset/-crf); kept as-is to preserve the generated command —
        # confirm whether they are wanted for single-frame extraction.
        cmd.extend(profile.additional_args)

        cmd.extend(["-y", output_path])

        return cmd

    @classmethod
    def list_profiles(cls) -> Dict[str, str]:
        """
        List every available profile.

        Returns:
            Dict[str, str]: Mapping of profile name to its description.
        """
        return {name: profile.description for name, profile in cls.PROFILES.items()}

    @classmethod
    def get_compatibility_report(cls) -> Dict[str, object]:
        """
        Build a compatibility report for the current machine.

        Returns:
            Dict: Keys ``system``, ``recommended_profile``,
            ``profile_description``, ``compatibility_level``,
            ``hardware_acceleration`` and ``suggestions``.
        """
        recommended_profile = cls.get_recommended_profile()
        profile = cls.get_profile(recommended_profile)

        try:
            from app.utils import ffmpeg_utils
            hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info()
        except Exception:
            # Detection is optional for the report; record the failure instead.
            hwaccel_info = {"available": False, "message": "检测失败"}

        return {
            "system": platform.system(),
            "recommended_profile": recommended_profile,
            "profile_description": profile.description,
            "compatibility_level": profile.compatibility_level,
            "hardware_acceleration": hwaccel_info,
            "suggestions": cls._get_suggestions(profile, hwaccel_info)
        }

    @classmethod
    def _get_suggestions(cls, profile: FFmpegProfile, hwaccel_info: Dict) -> List[str]:
        """Return optimisation hints for ``profile`` given the detected hardware info."""
        suggestions = []

        if not hwaccel_info.get("available", False):
            suggestions.append("建议更新显卡驱动以启用硬件加速")

        if profile.compatibility_level >= 4:
            suggestions.append("当前使用高兼容性配置,性能可能较低")

        # The vendor may be missing or None; treat both as "no vendor".
        gpu_vendor = str(hwaccel_info.get("gpu_vendor") or "").lower()
        if platform.system().lower() == "windows" and "nvidia" in gpu_vendor:
            suggestions.append("Windows NVIDIA 用户建议使用纯编码器模式避免滤镜链问题")

        return suggestions
class VideoAspect(str, Enum):
    """Supported output aspect ratios; each value is the ratio as ``"W:H"``."""
    landscape = "16:9"
    landscape_2 = "4:3"
    portrait = "9:16"
    portrait_2 = "3:4"
    square = "1:1"

    def to_resolution(self):
        """Return the ``(width, height)`` in pixels for this aspect ratio.

        Every ratio keeps 1080 px on its short side. Previously 4:3 and 3:4
        fell through to the 9:16 default, so a landscape ratio produced a
        portrait frame; they now map to 1440x1080 and 1080x1440.

        Returns:
            tuple: ``(width, height)``; an unrecognised value falls back to
            ``(1080, 1920)``.
        """
        resolutions = {
            "16:9": (1920, 1080),
            "4:3": (1440, 1080),
            "9:16": (1080, 1920),
            "3:4": (1080, 1440),
            "1:1": (1080, 1080),
        }
        # Look up by the plain string value so a raw "16:9" works as well as
        # an enum member.
        ratio = getattr(self, "value", self)
        return resolutions.get(ratio, (1080, 1920))
class VideoClipParams(BaseModel):
    """ NarratoAI 数据模型 """
    # NarratoAI data model: the parameters for turning an LLM-generated clip
    # script into a narrated video. (The docstring above is left untouched.)

    # --- Script and source video ---
    video_clip_json: Optional[list] = Field(default=[], description="LLM 生成的视频剪辑脚本内容")
    video_clip_json_path: Optional[str] = Field(default="", description="LLM 生成的视频剪辑脚本路径")
    video_origin_path: Optional[str] = Field(default="", description="原视频路径")
    # The default is the enum's string value ("9:16"), not the enum member.
    video_aspect: Optional[VideoAspect] = Field(default=VideoAspect.portrait.value, description="视频比例")
    video_language: Optional[str] = Field(default="zh-CN", description="视频语言")

    # Disabled fields, kept for reference:
    # video_clip_duration: Optional[int] = 5  # clip duration
    # video_count: Optional[int] = 1  # number of clips
    # video_source: Optional[str] = "local"
    # video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value

    # --- Narration voice (TTS) ---
    voice_name: Optional[str] = Field(default="zh-CN-YunjianNeural", description="语音名称")
    voice_volume: Optional[float] = Field(default=AudioVolumeDefaults.VOICE_VOLUME, description="解说语音音量")
    voice_rate: Optional[float] = Field(default=1.0, description="语速")
    voice_pitch: Optional[float] = Field(default=1.0, description="语调")
    tts_engine: Optional[str] = Field(default="", description="TTS 引擎")

    # --- Background music ---
    bgm_name: Optional[str] = Field(default="random", description="背景音乐名称")
    bgm_type: Optional[str] = Field(default="random", description="背景音乐类型")
    bgm_file: Optional[str] = Field(default="", description="背景音乐文件")

    # --- Subtitles ---
    subtitle_enabled: bool = True
    font_name: str = "SimHei"  # defaults to the SimHei font
    font_size: int = 36
    text_fore_color: str = "white"  # text foreground colour
    text_back_color: Optional[str] = None  # text background colour
    stroke_color: str = "black"  # outline colour
    stroke_width: float = 1.5  # outline width
    subtitle_position: str = "bottom"  # top, bottom, center, custom
    custom_position: float = 70.0  # custom position; presumably used when subtitle_position is "custom" — TODO confirm unit

    # --- Processing ---
    n_threads: Optional[int] = Field(default=16, description="线程数")  # thread count; more threads help speed up video processing

    # --- Final mix volumes (defaults come from AudioVolumeDefaults) ---
    tts_volume: Optional[float] = Field(default=AudioVolumeDefaults.TTS_VOLUME, description="解说语音音量(后处理)")
    original_volume: Optional[float] = Field(default=AudioVolumeDefaults.ORIGINAL_VOLUME, description="视频原声音量")
    bgm_volume: Optional[float] = Field(default=AudioVolumeDefaults.BGM_VOLUME, description="背景音乐音量")
class SubtitleAnalyzer:
    """Subtitle plot analyzer.

    Sends subtitle text to an LLM to obtain a plot analysis and, from that
    analysis, a narration script. Two wire formats are supported: the native
    Gemini ``generateContent`` API and OpenAI-compatible ``/chat/completions``.
    Every public method returns a result dict whose ``status`` is
    ``"success"`` or ``"error"`` instead of raising.
    """

    # Safety settings sent with every native Gemini request.
    _GEMINI_SAFETY_SETTINGS = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]

    # Timeout, in seconds, for every HTTP request.
    _REQUEST_TIMEOUT = 120

    def __init__(
        self,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        base_url: Optional[str] = None,
        custom_prompt: Optional[str] = None,
        temperature: Optional[float] = 1.0,
        provider: Optional[str] = None,
    ):
        """
        Initialise the analyzer.

        Args:
            api_key: API key.
            model: Model name.
            base_url: API base URL; request paths are appended to it.
            custom_prompt: Prompt that replaces the managed analysis prompt.
            temperature: Sampling temperature for the analysis call.
            provider: Provider type; decides the wire format. When omitted it
                is read from the application config.
        """
        self.api_key = api_key
        self.model = model
        self.base_url = base_url
        self.temperature = temperature
        self.provider = provider or self._detect_provider()
        self.custom_prompt = custom_prompt

        # Only the provider literally named "gemini" uses the native format.
        self.is_native_gemini = self.provider.lower() == 'gemini'

        self._init_headers()

    def _detect_provider(self):
        """Read the provider type from the app config (defaults to 'gemini')."""
        return config.app.get('text_llm_provider', 'gemini').lower()

    def _init_headers(self):
        """Build the headers used for OpenAI-compatible requests."""
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _error(message: str, temperature: Optional[float]) -> Dict[str, Any]:
        """Build the standard error result dict."""
        return {
            "status": "error",
            "message": message,
            "temperature": temperature
        }

    @staticmethod
    def _parse_gemini_response(response_data: Dict[str, Any], result_key: str,
                               model: Optional[str], temperature: Optional[float]) -> Dict[str, Any]:
        """Convert a native Gemini response body into a result dict (no I/O, no logging)."""
        candidates = response_data.get("candidates")
        if not candidates:
            return SubtitleAnalyzer._error("原生Gemini API返回无效响应,可能触发了安全过滤", temperature)

        candidate = candidates[0]

        if candidate.get("finishReason") == "SAFETY":
            return SubtitleAnalyzer._error("内容被Gemini安全过滤器阻止", temperature)

        if "content" not in candidate or "parts" not in candidate["content"]:
            return SubtitleAnalyzer._error("原生Gemini API返回内容格式错误", temperature)

        text = "".join(part["text"] for part in candidate["content"]["parts"] if "text" in part)
        if not text.strip():
            return SubtitleAnalyzer._error("原生Gemini API返回空内容", temperature)

        # Gemini reports usage under usageMetadata.totalTokenCount; the
        # OpenAI-style "usage.total_tokens" is kept as a fallback for proxies.
        usage_metadata = response_data.get("usageMetadata") or {}
        usage = response_data.get("usage") or {}
        tokens_used = usage_metadata.get("totalTokenCount", usage.get("total_tokens", 0))

        return {
            "status": "success",
            result_key: text,
            "tokens_used": tokens_used,
            "model": model,
            "temperature": temperature
        }

    @staticmethod
    def _write_text(output_path: str, text: str) -> None:
        """Write ``text`` to ``output_path``, creating the parent directory if there is one."""
        parent = os.path.dirname(output_path)
        # A bare filename has no parent; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

    def _post_gemini(self, system_text: str, prompt: str, temperature: Optional[float],
                     result_key: str, task_label: str, failure_prefix: str,
                     stop_sequences: Optional[list] = None) -> Dict[str, Any]:
        """Send one native Gemini ``generateContent`` request and return a result dict."""
        try:
            generation_config = {
                "temperature": temperature,
                "topK": 40,
                "topP": 0.95,
                "maxOutputTokens": 64000,
                "candidateCount": 1
            }
            if stop_sequences:
                generation_config["stopSequences"] = stop_sequences

            payload = {
                "systemInstruction": {"parts": [{"text": system_text}]},
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": generation_config,
                "safetySettings": self._GEMINI_SAFETY_SETTINGS
            }

            url = f"{self.base_url}/models/{self.model}:generateContent"
            response = requests.post(
                url,
                json=payload,
                headers={"Content-Type": "application/json", "x-goog-api-key": self.api_key},
                timeout=self._REQUEST_TIMEOUT
            )

            if response.status_code != 200:
                error_msg = f"原生Gemini API请求失败,状态码: {response.status_code}, 响应: {response.text}"
                logger.error(error_msg)
                return self._error(error_msg, temperature)

            result = self._parse_gemini_response(response.json(), result_key, self.model, temperature)
            if result["status"] == "success":
                logger.debug(f"原生Gemini{task_label}完成")
            return result

        except Exception as e:
            message = f"{failure_prefix}: {str(e)}"
            logger.error(message)
            return self._error(message, temperature)

    def _post_openai(self, system_text: str, prompt: str, temperature: Optional[float],
                     result_key: str, task_label: str, failure_prefix: str,
                     force_json: bool = False) -> Dict[str, Any]:
        """Send one OpenAI-compatible ``/chat/completions`` request and return a result dict."""
        try:
            payload = {
                "model": self.model,
                "messages": [
                    {"role": "system", "content": system_text},
                    {"role": "user", "content": prompt}
                ],
                "temperature": temperature
            }

            # JSON mode is skipped for models excluded here.
            if force_json and self.model not in ["deepseek-reasoner"]:
                payload["response_format"] = {"type": "json_object"}

            url = f"{self.base_url}/chat/completions"
            response = requests.post(url, headers=self.headers, json=payload, timeout=self._REQUEST_TIMEOUT)

            if response.status_code != 200:
                error_msg = f"OpenAI兼容API请求失败,状态码: {response.status_code}, 响应: {response.text}"
                logger.error(error_msg)
                return self._error(error_msg, temperature)

            response_data = response.json()
            choices = response_data.get("choices")
            if not choices:
                logger.error(f"OpenAI兼容API{task_label}失败: 未获取到有效响应")
                return self._error("未获取到有效响应", temperature)

            # "usage" may be absent or null depending on the provider.
            tokens_used = (response_data.get("usage") or {}).get("total_tokens", 0)
            logger.debug(f"OpenAI兼容API{task_label}完成,消耗的tokens: {tokens_used}")

            return {
                "status": "success",
                result_key: choices[0]["message"]["content"],
                "tokens_used": tokens_used,
                "model": self.model,
                "temperature": temperature
            }

        except Exception as e:
            message = f"{failure_prefix}: {str(e)}"
            logger.error(message)
            return self._error(message, temperature)

    # ------------------------------------------------------------------
    # Plot analysis
    # ------------------------------------------------------------------

    def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]:
        """
        Analyze subtitle text.

        Args:
            subtitle_content: Subtitle text.

        Returns:
            Dict[str, Any]: On success ``status``, ``analysis``,
            ``tokens_used``, ``model`` and ``temperature``; on failure
            ``status``, ``message`` and ``temperature``.
        """
        try:
            if self.custom_prompt:
                prompt = f"{self.custom_prompt}\n\n{subtitle_content}"
            else:
                prompt = PromptManager.get_prompt(
                    category="short_drama_narration",
                    name="plot_analysis",
                    parameters={"subtitle_content": subtitle_content}
                )

            if self.is_native_gemini:
                return self._call_native_gemini_api(prompt)
            return self._call_openai_compatible_api(prompt)

        except Exception as e:
            logger.error(f"字幕分析过程中发生错误: {str(e)}")
            return self._error(str(e), self.temperature)

    def _call_native_gemini_api(self, prompt: str) -> Dict[str, Any]:
        """Run the plot analysis through the native Gemini API."""
        return self._post_gemini(
            system_text="你是一位专业的剧本分析师和剧情概括助手。请严格按照要求的格式输出分析结果。",
            prompt=prompt,
            temperature=self.temperature,
            result_key="analysis",
            task_label="字幕分析",
            failure_prefix="原生Gemini API调用失败",
        )

    def _call_openai_compatible_api(self, prompt: str) -> Dict[str, Any]:
        """Run the plot analysis through an OpenAI-compatible API."""
        return self._post_openai(
            system_text="你是一位专业的剧本分析师和剧情概括助手。",
            prompt=prompt,
            temperature=self.temperature,
            result_key="analysis",
            task_label="字幕分析",
            failure_prefix="OpenAI兼容API调用失败",
        )

    def analyze_subtitle_from_file(self, subtitle_file_path: str) -> Dict[str, Any]:
        """
        Read a subtitle file and analyze it.

        Args:
            subtitle_file_path: Path of the subtitle file.

        Returns:
            Dict[str, Any]: Same shape as ``analyze_subtitle``; an error
            result when the file is missing or yields no text.
        """
        try:
            if not os.path.exists(subtitle_file_path):
                return self._error(f"字幕文件不存在: {subtitle_file_path}", self.temperature)

            subtitle_content = read_subtitle_text(subtitle_file_path).text
            if not subtitle_content:
                return self._error(f"字幕文件内容为空或无法读取: {subtitle_file_path}", self.temperature)

            return self.analyze_subtitle(subtitle_content)

        except Exception as e:
            logger.error(f"从文件读取字幕并分析过程中发生错误: {str(e)}")
            return self._error(str(e), self.temperature)

    def save_analysis_result(self, analysis_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
        """
        Save an analysis result to a file.

        Args:
            analysis_result: Result dict returned by ``analyze_subtitle``.
            output_path: Target path; generated under the ``drama_analysis``
                storage directory when omitted.

        Returns:
            str: The path written, or ``""`` if saving failed.
        """
        try:
            if not output_path:
                output_dir = storage_dir("drama_analysis", create=True)
                output_path = os.path.join(output_dir, f"analysis_{get_uuid(True)}.txt")

            if analysis_result["status"] == "success":
                text = analysis_result["analysis"]
            else:
                text = f"分析失败: {analysis_result['message']}"
            self._write_text(output_path, text)

            logger.info(f"分析结果已保存到: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"保存分析结果时发生错误: {str(e)}")
            return ""

    # ------------------------------------------------------------------
    # Narration script
    # ------------------------------------------------------------------

    def generate_narration_script(self, short_name: str, plot_analysis: str, subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]:
        """
        Generate a narration script from a plot analysis.

        Args:
            short_name: Name of the short drama.
            plot_analysis: Plot analysis text.
            subtitle_content: Original subtitle text, passed to the prompt.
            temperature: Sampling temperature for this call (default 0.7).

        Returns:
            Dict[str, Any]: On success ``status``, ``narration_script``,
            ``tokens_used``, ``model`` and ``temperature``; on failure
            ``status``, ``message`` and ``temperature``.
        """
        try:
            prompt = PromptManager.get_prompt(
                category="short_drama_narration",
                name="script_generation",
                parameters={
                    "drama_name": short_name,
                    "plot_analysis": plot_analysis,
                    "subtitle_content": subtitle_content
                }
            )

            if self.is_native_gemini:
                return self._generate_narration_with_native_gemini(prompt, temperature)
            return self._generate_narration_with_openai_compatible(prompt, temperature)

        except Exception as e:
            logger.error(f"解说文案生成过程中发生错误: {str(e)}")
            # Report this call's temperature, as every other narration result does.
            return self._error(str(e), temperature)

    def _generate_narration_with_native_gemini(self, prompt: str, temperature: float) -> Dict[str, Any]:
        """Generate the narration script through the native Gemini API."""
        # An extra instruction is appended to push the model towards bare JSON.
        enhanced_prompt = f"{prompt}\n\n请确保输出严格的JSON格式,不要包含任何其他文字或标记。"

        return self._post_gemini(
            system_text="你是一位专业的短视频解说脚本撰写专家。你必须严格按照JSON格式输出,不能包含任何其他文字、说明或代码块标记。",
            prompt=enhanced_prompt,
            temperature=temperature,
            result_key="narration_script",
            task_label="解说文案生成",
            failure_prefix="原生Gemini API解说文案生成失败",
            # NOTE(review): "注意" / "说明" are ordinary words and may also occur
            # inside the narration text, which would stop generation early —
            # confirm these stop sequences are still wanted.
            stop_sequences=["```", "注意", "说明"],
        )

    def _generate_narration_with_openai_compatible(self, prompt: str, temperature: float) -> Dict[str, Any]:
        """Generate the narration script through an OpenAI-compatible API."""
        return self._post_openai(
            system_text="你是一位专业的短视频解说脚本撰写专家。",
            prompt=prompt,
            temperature=temperature,
            result_key="narration_script",
            task_label="解说文案生成",
            failure_prefix="OpenAI兼容API解说文案生成失败",
            force_json=True,
        )

    def save_narration_script(self, narration_result: Dict[str, Any], output_path: Optional[str] = None) -> str:
        """
        Save a narration script to a file.

        Args:
            narration_result: Result dict returned by ``generate_narration_script``.
            output_path: Target path; generated under the
                ``narration_scripts`` storage directory when omitted.

        Returns:
            str: The path written, or ``""`` if saving failed.
        """
        try:
            if not output_path:
                output_dir = storage_dir("narration_scripts", create=True)
                output_path = os.path.join(output_dir, f"narration_{get_uuid(True)}.json")

            if narration_result["status"] == "success":
                text = narration_result["narration_script"]
            else:
                text = f"生成失败: {narration_result['message']}"
            self._write_text(output_path, text)

            logger.info(f"解说文案已保存到: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"保存解说文案时发生错误: {str(e)}")
            return ""
if __name__ == '__main__':
    # Manual smoke test: analyze a subtitle file, then generate a narration
    # script from the analysis. Replace the placeholders before running.
    text_api_key = "skxxxx"
    text_model = "gemini-2.0-flash"
    # Base URL only, without a trailing slash: SubtitleAnalyzer appends
    # "/chat/completions" (or "/models/<model>:generateContent") itself, so
    # including the endpoint here would duplicate the path.
    text_base_url = "https://api.narratoai.cn/v1"
    subtitle_path = "/Users/apple/Desktop/home/NarratoAI/resource/srt/家里家外1-5.srt"

    if subtitle_path:
        # Step 1: summarise the plot from the subtitles.
        analysis_result = analyze_subtitle(
            subtitle_file_path=subtitle_path,
            api_key=text_api_key,
            model=text_model,
            base_url=text_base_url,
            save_result=True
        )

        if analysis_result["status"] == "success":
            print("字幕分析成功!")
            print("分析结果:")
            print(analysis_result["analysis"])

            # Load the subtitles with the same reader the analyzer uses, so
            # both steps read the file the same way (no hard-coded UTF-8).
            subtitle_content = read_subtitle_text(subtitle_path).text

            # Step 2: generate the narration script from the plot analysis.
            narration_result = generate_narration_script(
                short_name="家里家外",
                plot_analysis=analysis_result["analysis"],
                subtitle_content=subtitle_content,
                api_key=text_api_key,
                model=text_model,
                base_url=text_base_url,
                save_result=True
            )

            if narration_result["status"] == "success":
                print("\n解说文案生成成功!")
                print("解说文案:")
                print(narration_result["narration_script"])
            else:
                print(f"\n解说文案生成失败: {narration_result['message']}")
        else:
            print(f"分析失败: {analysis_result['message']}")
custom_clips: int = 5, provider: str = None, *, srt_path: Optional[str] = None, subtitle_content: Optional[str] = None, subtitle_file_path: Optional[str] = None, ) -> Dict[str, Any]: """生成视频混剪脚本(安全版本,返回结果字典) Args: api_key: API密钥 model_name: 模型名称 output_path: 输出文件路径 base_url: API基础URL,可选 custom_clips: 自定义片段数量,默认5 provider: LLM服务提供商,可选 srt_path: 字幕文件路径(向后兼容) subtitle_content: 字幕文本内容 subtitle_file_path: 字幕文件路径(推荐) Returns: Dict[str, Any]: 成功: {"status": "success", "script": [...]} 失败: {"status": "error", "message": "错误信息"} """ try: # 解析字幕输入源(支持内容或文件路径) resolved_content, resolved_path = resolve_subtitle_input( subtitle_content=subtitle_content, subtitle_file_path=subtitle_file_path, srt_path=srt_path, ) logger.info("开始分析字幕内容...") openai_analysis = analyze_subtitle( model_name=model_name, api_key=api_key, base_url=base_url, custom_clips=custom_clips, provider=provider, srt_path=resolved_path, subtitle_content=resolved_content, ) adjusted_results = openai_analysis['plot_points'] final_script = merge_script(adjusted_results, output_path) return {"status": "success", "script": final_script} except InputValidationError as e: logger.error(f"输入验证失败: {e}") return {"status": "error", "message": str(e)} except Exception as e: logger.exception(f"SDP 脚本生成失败: {e}") return {"status": "error", "message": f"生成脚本失败: {str(e)}"} def generate_script( srt_path: Optional[str] = None, api_key: str = None, model_name: str = None, output_path: str = None, base_url: str = None, custom_clips: int = 5, provider: str = None, *, subtitle_content: Optional[str] = None, subtitle_file_path: Optional[str] = None, ): """生成视频混剪脚本(向后兼容版本) Args: srt_path: 字幕文件路径(向后兼容参数,可选) api_key: API密钥 model_name: 模型名称 output_path: 输出文件路径 base_url: API基础URL,可选 custom_clips: 自定义片段数量,默认5 provider: LLM服务提供商,可选 subtitle_content: 字幕文本内容(可选) subtitle_file_path: 字幕文件路径(推荐使用,可选) Returns: str: 生成的脚本内容 Raises: FileNotFoundError: 字幕文件不存在(向后兼容) ValueError: 输入验证失败或脚本生成失败 """ result = generate_script_result( api_key=api_key, 
@dataclass
class PlotPoint:
    """A plot point: a timestamp with its title and picture description."""

    timestamp: str
    title: str
    picture: str


@dataclass
class Commentary:
    """Commentary text (``copywriter``) attached to a titled timestamp."""

    timestamp: str
    title: str
    copywriter: str


@dataclass
class SubtitleSegment:
    """One subtitle entry with numeric start/end times and its text."""

    start_time: float
    end_time: float
    text: str


@dataclass
class ScriptItem:
    """A final script row combining picture description and commentary."""

    timestamp: str
    title: str
    picture: str
    copywriter: str


@dataclass
class PipelineResult:
    """Aggregate output of the pipeline; ``error`` stays None unless set."""

    output_video_path: str
    plot_points: List[PlotPoint]
    subtitle_segments: List[SubtitleSegment]
    commentaries: List[Commentary]
    final_script: List[ScriptItem]
    error: Optional[str] = None


class VideoProcessingError(Exception):
    """Error type for the video processing stage."""


class SubtitleProcessingError(Exception):
    """Error type for the subtitle processing stage."""


class PlotAnalysisError(Exception):
    """Error type for the plot analysis stage."""


class CopywritingError(Exception):
    """Error type for the copywriting stage."""
analyze_subtitle( model_name: str, api_key: str = None, base_url: str = None, custom_clips: int = 5, provider: str = None, srt_path: str = None, subtitle_content: str = None ) -> dict: """分析字幕内容,返回完整的分析结果 Args: model_name (str): 大模型名称 api_key (str, optional): 大模型API密钥. Defaults to None. base_url (str, optional): 大模型API基础URL. Defaults to None. custom_clips (int): 需要提取的片段数量. Defaults to 5. provider (str, optional): LLM服务提供商. Defaults to None. srt_path (str, optional): SRT字幕文件路径(与subtitle_content二选一) subtitle_content (str, optional): SRT字幕文本内容(与srt_path二选一) Returns: dict: 包含剧情梗概和结构化的时间段分析的字典 """ try: # 读取并规范化字幕文本(不依赖结构化 SRT 解析,提升兼容性) if subtitle_content and str(subtitle_content).strip(): normalized_subtitle_text = normalize_subtitle_text(subtitle_content) source_label = "字幕内容(直接传入)" elif srt_path: decoded = read_subtitle_text(srt_path) normalized_subtitle_text = decoded.text source_label = f"字幕文件: {srt_path} (encoding: {decoded.encoding})" else: raise ValueError("必须提供 srt_path 或 subtitle_content 参数") # 基础校验:必须有内容且包含可用于定位的时间码 if not normalized_subtitle_text or len(normalized_subtitle_text.strip()) < 10: error_msg = ( f"字幕来源 [{source_label}] 内容为空或过短。\n" f"请检查:\n" f"1. 文件格式是否为标准 SRT\n" f"2. 文件编码是否为 UTF-8、UTF-16、GBK 或 GB2312\n" f"3. 
文件内容是否为空" ) logger.error(error_msg) raise ValueError(error_msg) if not has_timecodes(normalized_subtitle_text): error_msg = ( f"字幕来源 [{source_label}] 未检测到有效时间码,无法进行时间段定位。\n" f"请确保字幕包含类似以下格式的时间轴:\n" f"00:00:01,000 --> 00:00:02,000\n" f"(若毫秒分隔符为'.',系统会自动规范化为',')" ) logger.error(error_msg) raise ValueError(error_msg) logger.info(f"成功加载字幕来源 [{source_label}],字符数: {len(normalized_subtitle_text)}") subtitle_content = normalized_subtitle_text # 如果没有指定provider,根据model_name推断 if not provider: if "deepseek" in model_name.lower(): provider = "deepseek" elif "gpt" in model_name.lower(): provider = "openai" elif "gemini" in model_name.lower(): provider = "gemini" else: provider = "openai" # 默认使用openai logger.info(f"使用LLM服务分析字幕,提供商: {provider}, 模型: {model_name}") # 使用新的提示词管理系统 subtitle_analysis_prompt = PromptManager.get_prompt( category="short_drama_editing", name="subtitle_analysis", parameters={ "subtitle_content": subtitle_content, "custom_clips": custom_clips } ) # 使用统一LLM服务生成文本 logger.info("开始分析字幕内容...") response = _run_async_safely( UnifiedLLMService.generate_text, prompt=subtitle_analysis_prompt, provider=provider, model=model_name, api_key=api_key, base_url=base_url, temperature=0.1, # 使用较低的温度以获得更稳定的结果 max_tokens=4000 ) # 解析JSON响应 from webui.tools.generate_short_summary import parse_and_fix_json summary_data = parse_and_fix_json(response) if not summary_data: raise Exception("无法解析LLM返回的JSON数据") logger.info(f"字幕分析完成,找到 {len(summary_data.get('plot_titles', []))} 个关键情节") logger.debug(json.dumps(summary_data, indent=4, ensure_ascii=False)) # 构建爆点标题列表 plot_titles_text = "" logger.info(f"找到 {len(summary_data.get('plot_titles', []))} 个片段") for i, point in enumerate(summary_data['plot_titles'], 1): plot_titles_text += f"{i}. 
def merge_script(
    plot_points: List[Dict],
    output_path: str
):
    """Build the final script from plot points and save it as JSON.

    Every plot point becomes one "original footage" entry (``OST`` = 1)
    with a 1-based ``_id`` and a placeholder narration carrying a random
    hex suffix.

    Args:
        plot_points: Reviewed plot points; each needs ``timestamp`` and
            ``picture`` keys.
        output_path: Destination JSON file; parent directories are created.

    Returns:
        list: The script entries that were written to ``output_path``.

    Raises:
        KeyError: A plot point lacks ``timestamp`` or ``picture``.
        ValueError: ``output_path`` is empty or only whitespace.
    """
    entries = [
        {
            "_id": index,
            "timestamp": point["timestamp"],
            "picture": point["picture"],
            "narration": f"播放原生_{os.urandom(4).hex()}",
            "OST": 1,  # OST=0 keeps narration only; OST=2 keeps narration and original audio
        }
        for index, point in enumerate(plot_points, start=1)
    ]

    # The path is validated only after the entries are built, so a malformed
    # plot point is reported before a missing output path.
    if not (output_path and str(output_path).strip()):
        raise ValueError("output_path不能为空")

    destination = str(output_path)
    parent_dir = os.path.dirname(destination) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(entries, handle, ensure_ascii=False, indent=4)

    print(f"脚本生成完成:{destination}")
    return entries
def check_ffmpeg():
    """Report whether an ``ffmpeg`` executable can be launched.

    Runs ``ffmpeg -version`` with its output discarded. The exit status is
    not inspected; only the ability to start the process is checked.

    Returns:
        bool: True if the process could be started, False otherwise.
    """
    try:
        subprocess.run(['ffmpeg', '-version'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # OSError covers FileNotFoundError (not installed) as well as
        # PermissionError and similar launch failures (present but not
        # executable), which previously escaped and crashed the caller.
        return False
    return True
def time_to_seconds(time_str):
    """Convert a time string to seconds.

    Supported layouts:
        1. 'HH:MM:SS,mmm' (hours:minutes:seconds,milliseconds)
        2. 'MM:SS,mmm'    (minutes:seconds,milliseconds)
        3. 'SS,mmm'       (seconds,milliseconds)

    The fractional part is optional. Besides the SRT-style ',' separator
    (digits are read as milliseconds), an FFmpeg-style '.' separator is
    accepted, in which case the digits are read as a decimal fraction of a
    second ('06.5' is 6.5 seconds).

    Args:
        time_str: The time string to parse.

    Returns:
        The number of seconds (an int when there is no fractional part),
        or 0.0 when the string cannot be parsed (the error is logged).
    """
    try:
        if ',' in time_str:
            # SRT style: the digits after the comma are milliseconds.
            clock_part, ms_part = time_str.split(',')
            fraction = float(ms_part) / 1000
        elif '.' in time_str:
            # FFmpeg style: digits after the dot are a decimal fraction.
            clock_part, _, decimals = time_str.partition('.')
            if not decimals.isdigit():
                raise ValueError(f"invalid fractional seconds: {decimals!r}")
            fraction = float(f"0.{decimals}")
        else:
            clock_part = time_str
            fraction = 0

        parts = clock_part.split(':')
        if len(parts) == 3:  # HH:MM:SS
            h, m, s = map(int, parts)
            seconds = h * 3600 + m * 60 + s
        elif len(parts) == 2:  # MM:SS
            m, s = map(int, parts)
            seconds = m * 60 + s
        else:  # SS
            seconds = int(parts[0])

        return seconds + fraction
    except (ValueError, IndexError) as e:
        logger.error(f"Error parsing time {time_str}: {str(e)}")
        return 0.0
class AudioNormalizer:
    """Audio loudness analysis and normalization helper.

    Loudness is measured and normalized with FFmpeg's ``loudnorm`` filter;
    pydub provides a fallback for RMS measurement and simple gain changes.
    """

    def __init__(self):
        self.target_lufs = -23.0  # Target loudness (LUFS), the broadcast standard
        self.max_peak = -1.0  # Maximum peak (dBFS)

    @staticmethod
    def _parse_loudnorm_json(stderr_text):
        """Return the JSON object printed by ``loudnorm`` in FFmpeg's stderr.

        Lines are collected from the first line that is exactly ``{`` up to
        the first line that is exactly ``}``.

        Returns:
            The decoded object, or None when no such block is present.

        Raises:
            ValueError: The collected block is not valid JSON.
        """
        import json

        collected = []
        capturing = False
        for line in (stderr_text or "").split('\n'):
            marker = line.strip()
            if marker == '{':
                capturing = True
            if capturing:
                collected.append(line)
                if marker == '}':
                    break

        if not collected:
            return None
        return json.loads('\n'.join(collected))

    @staticmethod
    def _rms_amplitude(samples, channels: int) -> float:
        """Return the linear RMS amplitude of interleaved PCM samples.

        Samples are converted to float64 before squaring: squaring a 16-bit
        integer array wraps around and yields a meaningless (even NaN) RMS.
        Multi-channel input is averaged per frame before the RMS is taken.
        """
        data = np.asarray(samples, dtype=np.float64)
        if channels > 1:
            data = data.reshape((-1, channels)).mean(axis=1)
        return float(np.sqrt(np.mean(np.square(data))))

    def analyze_audio_lufs(self, audio_path: str) -> Optional[float]:
        """Measure the integrated loudness of an audio file with FFmpeg.

        Args:
            audio_path: Path of the audio file.

        Returns:
            float: The ``input_i`` value reported by ``loudnorm``, or None
            if the file does not exist or the analysis fails.
        """
        if not os.path.exists(audio_path):
            logger.error(f"音频文件不存在: {audio_path}")
            return None

        try:
            # Analysis-only pass: output is discarded, loudnorm prints JSON.
            cmd = [
                'ffmpeg', '-hide_banner', '-nostats',
                '-i', audio_path,
                '-af', 'loudnorm=I=-23:TP=-1:LRA=7:print_format=json',
                '-f', 'null', '-'
            ]
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=False
            )

            try:
                loudness_data = self._parse_loudnorm_json(result.stderr)
                if loudness_data is not None:
                    input_i = float(loudness_data.get('input_i', 0))
                    logger.info(f"音频 {os.path.basename(audio_path)} 的LUFS: {input_i}")
                    return input_i
            except ValueError as e:
                # Covers both malformed JSON and a non-numeric input_i.
                logger.warning(f"解析LUFS数据失败: {e}")

        except Exception as e:
            logger.error(f"分析音频LUFS失败: {e}")

        return None

    def get_audio_rms(self, audio_path: str) -> Optional[float]:
        """Estimate loudness as the RMS level of the audio in dB.

        Args:
            audio_path: Path of the audio file.

        Returns:
            float: RMS level in dB relative to the full scale of the file's
            sample width, -60.0 for silence, or None if loading fails.
        """
        try:
            audio = AudioSegment.from_file(audio_path)
            rms = self._rms_amplitude(audio.get_array_of_samples(), audio.channels)

            if rms > 0:
                # Full scale follows the real sample width (2**15 for 16-bit).
                full_scale = float(2 ** (8 * audio.sample_width - 1))
                rms_db = float(20 * np.log10(rms / full_scale))
                logger.info(f"音频 {os.path.basename(audio_path)} 的RMS: {rms_db:.2f} dB")
                return rms_db
            return -60.0  # Silence

        except Exception as e:
            logger.error(f"计算音频RMS失败: {e}")
            return None

    def normalize_audio_lufs(self, input_path: str, output_path: str,
                             target_lufs: Optional[float] = None) -> bool:
        """Normalize loudness with a two-pass FFmpeg ``loudnorm`` run.

        Falls back to :meth:`_simple_normalize` when the analysis pass
        prints no JSON or when the second FFmpeg pass exits with an error.

        Args:
            input_path: Input audio file path.
            output_path: Output audio file path.
            target_lufs: Target LUFS; defaults to ``self.target_lufs``.

        Returns:
            bool: True on success, False otherwise.
        """
        if target_lufs is None:
            target_lufs = self.target_lufs

        try:
            # Pass 1: measure the input.
            analyze_cmd = [
                'ffmpeg', '-hide_banner', '-nostats',
                '-i', input_path,
                '-af', f'loudnorm=I={target_lufs}:TP={self.max_peak}:LRA=7:print_format=json',
                '-f', 'null', '-'
            ]
            analyze_result = subprocess.run(
                analyze_cmd,
                capture_output=True,
                text=True,
                check=False
            )

            loudness_data = self._parse_loudnorm_json(analyze_result.stderr)
            if loudness_data is None:
                logger.warning("无法获取音频分析数据,使用简单标准化")
                return self._simple_normalize(input_path, output_path)

            # Pass 2: apply normalization using the measured values.
            normalize_cmd = [
                'ffmpeg', '-y', '-hide_banner',
                '-i', input_path,
                '-af', (
                    f'loudnorm=I={target_lufs}:TP={self.max_peak}:LRA=7:'
                    f'measured_I={loudness_data["input_i"]}:'
                    f'measured_LRA={loudness_data["input_lra"]}:'
                    f'measured_TP={loudness_data["input_tp"]}:'
                    f'measured_thresh={loudness_data["input_thresh"]}'
                ),
                '-ar', '44100',  # Uniform sample rate
                '-ac', '2',  # Always stereo
                output_path
            ]
            subprocess.run(
                normalize_cmd,
                capture_output=True,
                text=True,
                check=True
            )

            logger.info(f"音频标准化完成: {output_path}")
            return True

        except subprocess.CalledProcessError as e:
            logger.error(f"FFmpeg标准化失败: {e}")
            return self._simple_normalize(input_path, output_path)
        except Exception as e:
            logger.error(f"音频标准化失败: {e}")
            return False

    def _simple_normalize(self, input_path: str, output_path: str) -> bool:
        """Fallback normalization: apply a flat gain with pydub.

        The gain moves the file's average level to -20 dBFS and the result
        is exported as a 128k MP3.

        Args:
            input_path: Input audio file path.
            output_path: Output audio file path.

        Returns:
            bool: True on success, False otherwise.
        """
        try:
            audio = AudioSegment.from_file(input_path)

            target_dBFS = -20.0
            change_in_dBFS = target_dBFS - audio.dBFS
            normalized_audio = audio.apply_gain(change_in_dBFS)

            normalized_audio.export(output_path, format="mp3", bitrate="128k")
            logger.info(f"简单音频标准化完成: {output_path}")
            return True

        except Exception as e:
            logger.error(f"简单音频标准化失败: {e}")
            return False

    def calculate_volume_adjustment(self, tts_path: str, original_path: str) -> Tuple[float, float]:
        """Compute gain factors that bring TTS and original audio to similar loudness.

        Args:
            tts_path: Path of the TTS audio file.
            original_path: Path of the original-sound audio file.

        Returns:
            Tuple[float, float]: (TTS volume factor, original volume factor).
            ``(0.7, 1.0)`` is returned when neither measurement method
            yields a value for one of the files.
        """
        tts_lufs = self.analyze_audio_lufs(tts_path)
        original_lufs = self.analyze_audio_lufs(original_path)

        # Fall back to the RMS estimate when the LUFS analysis failed.
        if tts_lufs is None:
            tts_lufs = self.get_audio_rms(tts_path)
        if original_lufs is None:
            original_lufs = self.get_audio_rms(original_path)

        if tts_lufs is None or original_lufs is None:
            logger.warning("无法分析音频响度,使用默认音量设置")
            return 0.7, 1.0  # Default settings

        # Convert the dB distance to the common target into a linear factor.
        target_lufs = -20.0
        tts_adjustment = 10 ** ((target_lufs - tts_lufs) / 20)
        original_adjustment = 10 ** ((target_lufs - original_lufs) / 20)

        # Clamp to avoid excessive amplification; the original sound is
        # allowed a larger boost than the TTS track.
        tts_adjustment = max(0.1, min(2.0, tts_adjustment))
        original_adjustment = max(0.1, min(3.0, original_adjustment))

        logger.info(f"音量调整建议 - TTS: {tts_adjustment:.2f}, 原声: {original_adjustment:.2f}")

        return tts_adjustment, original_adjustment
def calculate_end_time(start_time: str, duration: float, extra_seconds: float = 1.0) -> str:
    """Compute an end time from a start time and a duration.

    Args:
        start_time: Start time as 'HH:MM:SS' or 'HH:MM:SS,sss' (with
            milliseconds).
        duration: Duration in seconds.
        extra_seconds: Extra seconds appended to the duration (default 1).

    Returns:
        str: The end time, in the same format as ``start_time``. When the
        input has no millisecond part, the output has none either.
    """
    has_milliseconds = ',' in start_time
    if has_milliseconds:
        clock_part, ms_part = start_time.split(',')
        start_ms = int(ms_part)
    else:
        clock_part, start_ms = start_time, 0
    hours, minutes, seconds = map(int, clock_part.split(':'))

    # Round rather than truncate: 1.001 * 1000 evaluates to
    # 1000.9999999999999 in binary floating point, and int() would drop a
    # whole millisecond from the result.
    offset_ms = int(round((duration + extra_seconds) * 1000))
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + start_ms + offset_ms

    total_seconds, ms_new = divmod(total_ms, 1000)
    total_minutes, s_new = divmod(total_seconds, 60)
    h_new, m_new = divmod(total_minutes, 60)

    if has_milliseconds:
        return f"{h_new:02d}:{m_new:02d}:{s_new:02d},{ms_new:03d}"
    return f"{h_new:02d}:{m_new:02d}:{s_new:02d}"
def build_ffmpeg_command(
    input_path: str,
    output_path: str,
    start_time: str,
    end_time: str,
    encoder_config: Dict[str, str],
    hwaccel_args: List[str] = None
) -> List[str]:
    """Assemble the ffmpeg command that cuts and re-encodes one clip.

    For the ``h264_nvenc`` encoder the hardware-decode arguments are
    deliberately left out: per the original author's finding, CUDA hardware
    decoding breaks the filter chain when trimming ("Impossible to convert
    between the formats"), so only the NVENC encoder is used.

    Args:
        input_path: Input video path.
        output_path: Output video path.
        start_time: Clip start time.
        end_time: Clip end time.
        encoder_config: Encoder settings (``video_codec``, ``audio_codec``,
            ``pixel_format``, ``preset``, ``quality_value``).
        hwaccel_args: Hardware acceleration arguments placed before ``-i``.

    Returns:
        List[str]: The ffmpeg argument list.
    """
    video_codec = encoder_config["video_codec"]
    is_nvenc = video_codec == "h264_nvenc"

    command = ["ffmpeg", "-y"]
    if not is_nvenc and hwaccel_args:
        # Other encoders may use the hardware decoding arguments.
        command += hwaccel_args

    command += [
        "-i", input_path,
        "-ss", start_time, "-to", end_time,
        "-c:v", video_codec,
        "-c:a", encoder_config["audio_codec"],
        "-pix_fmt", encoder_config["pixel_format"],
    ]

    # Encoder-specific quality/preset flags.
    if video_codec == "h264_videotoolbox":
        # macOS VideoToolbox: bitrate-driven, no preset flag.
        tuning = ["-profile:v", "high", "-b:v", encoder_config["quality_value"]]
    else:
        preset = encoder_config["preset"]
        quality = encoder_config["quality_value"]
        if is_nvenc:
            # "main" profile improves compatibility.
            tuning = ["-preset", preset, "-cq", quality, "-profile:v", "main"]
            logger.debug("使用纯NVENC编码器(无硬件解码,避免滤镜链问题)")
        elif video_codec == "h264_amf":
            tuning = ["-quality", preset, "-qp_i", quality]
        elif video_codec == "h264_qsv":
            tuning = ["-preset", preset, "-global_quality", quality]
        else:
            # Software encoder (libx264).
            tuning = ["-preset", preset, "-crf", quality]
    command += tuning

    command += [
        "-ar", "44100", "-ac", "2",
        "-avoid_negative_ts", "make_zero",
        "-movflags", "+faststart",
        output_path,
    ]
    return command
def analyze_ffmpeg_error(error_msg: str) -> str:
    """Classify an ffmpeg error message by keyword.

    The message is lower-cased and checked against the keyword groups in
    order; the first group with a matching substring decides the result.

    Args:
        error_msg: The error text.

    Returns:
        str: One of ``"filter_chain_error"``, ``"hardware_error"``,
        ``"encoder_error"``, ``"file_error"`` or ``"unknown_error"``.
    """
    lowered = error_msg.lower()

    # Order matters: earlier categories win when several keywords match.
    categories = (
        ("filter_chain_error", (
            "impossible to convert", "filter", "format", "scale", "auto_scale",
            "null", "parsed_null", "reinitializing filters",
        )),
        ("hardware_error", (
            "cuda", "nvenc", "amf", "qsv", "d3d11va", "dxva2", "videotoolbox",
            "hardware", "hwaccel", "gpu", "device",
        )),
        ("encoder_error", (
            "encoder", "codec", "h264", "libx264", "bitrate", "preset",
        )),
        ("file_error", (
            "no such file", "permission denied", "access denied", "file not found",
        )),
    )

    for label, keywords in categories:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "unknown_error"
def execute_simple_command(cmd: List[str], timestamp: str, method_name: str) -> bool:
    """Run one ffmpeg command and check that it produced a non-empty file.

    Args:
        cmd: The command as an argument list; its last element must be the
            output path.
        timestamp: Timestamp label used in log messages.
        method_name: Name of the strategy, used in log messages.

    Returns:
        bool: True if the command succeeded and the output file exists
        with a size greater than zero; False otherwise.
    """
    try:
        logger.debug(f"执行{method_name}命令: {' '.join(cmd)}")

        run_options = dict(
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        if os.name == 'nt':
            # Decode ffmpeg's output as UTF-8 on Windows.
            run_options["encoding"] = 'utf-8'

        subprocess.run(cmd, **run_options)

        produced = cmd[-1]  # The output path is always the last argument.
        if not (os.path.exists(produced) and os.path.getsize(produced) > 0):
            logger.error(f"{method_name}失败,输出文件无效: {produced}")
            return False

        logger.info(f"✓ {method_name}成功: {timestamp}")
        return True

    except subprocess.CalledProcessError as exc:
        details = exc.stderr if exc.stderr else str(exc)
        logger.error(f"{method_name}失败: {details}")
        return False
    except Exception as exc:
        logger.error(f"{method_name}异常: {str(exc)}")
        return False
def _process_original_audio_segment(
    video_origin_path: str,
    script_item: Dict,
    output_dir: str,
    encoder_config: Dict,
    hwaccel_args: List[str]
) -> Optional[str]:
    """
    Cut an OST=1 (original audio only) segment.

    The clip is cut exactly at the script timestamp and the original audio
    track is kept.

    Returns:
        Optional[str]: Path of the written clip, or None when every ffmpeg
        attempt failed.
    """
    # The lookup is kept on purpose: an item without "_id" must still raise
    # KeyError here, exactly as before.
    _segment_id = script_item["_id"]
    clip_range = script_item["timestamp"]

    begin, finish = parse_timestamp(clip_range)

    # ffmpeg wants "." as the millisecond separator, file names want neither
    # ":" nor ",".
    ffmpeg_begin, ffmpeg_finish = (t.replace(',', '.') for t in (begin, finish))
    name_begin, name_finish = (
        t.replace(':', '-').replace(',', '-') for t in (begin, finish)
    )

    target_path = os.path.join(output_dir, f"ost1_vid_{name_begin}@{name_finish}.mp4")

    command = _build_ffmpeg_command_with_audio_control(
        video_origin_path,
        target_path,
        ffmpeg_begin,
        ffmpeg_finish,
        encoder_config,
        hwaccel_args,
        remove_audio=False,
    )

    succeeded = execute_ffmpeg_with_fallback(
        command,
        clip_range,
        video_origin_path,
        target_path,
        ffmpeg_begin,
        ffmpeg_finish,
    )
    if succeeded:
        return target_path
    return None
def clip_video_unified(
    video_origin_path: str,
    script_list: List[Dict],
    tts_results: List[Dict],
    output_dir: Optional[str] = None,
    task_id: Optional[str] = None
) -> Dict[str, str]:
    """
    Cut every script segment out of the source video in a single pass,
    dispatching on the segment's OST type.

    - OST=0: narration only, handled by ``_process_narration_only_segment``.
    - OST=1: original audio only, handled by ``_process_original_audio_segment``.
    - OST=2: narration mixed with original audio, handled by
      ``_process_mixed_segment``.

    Any other OST value is logged and skipped.

    Args:
        video_origin_path: Path of the source video.
        script_list: Full script; every item carries ``_id``, ``OST`` and
            ``timestamp``.
        tts_results: TTS results (for OST=0 and OST=2 segments), each with ``_id``.
        output_dir: Directory for the clips; generated under
            ``storage/temp/clip_video_unified/<task_id>`` when None.
        task_id: Used to build the default output directory; derived from an
            MD5 of the inputs when None.

    Returns:
        Dict[str, str]: Mapping from segment ``_id`` to the clip path, for the
        segments that were cut successfully.

    Raises:
        FileNotFoundError: The source video does not exist.
        RuntimeError: Every segment in ``script_list`` failed.
    """
    if not os.path.exists(video_origin_path):
        raise FileNotFoundError(f"视频文件不存在: {video_origin_path}")

    # Without a task id, derive a stable one from the inputs.
    if task_id is None:
        content_for_hash = f"{video_origin_path}_{json.dumps(script_list)}"
        task_id = hashlib.md5(content_for_hash.encode()).hexdigest()

    if output_dir is None:
        output_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "storage", "temp", "clip_video_unified", task_id
        )

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Fast lookup of the TTS result belonging to a segment.
    tts_map = {item['_id']: item for item in tts_results}

    hwaccel_type = check_hardware_acceleration()
    hwaccel_args = []

    if hwaccel_type:
        hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args()
        hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info()
        logger.info(f"🚀 使用硬件加速: {hwaccel_type} ({hwaccel_info.get('message', '')})")
    else:
        logger.info("🔧 使用软件编码")

    encoder_config = get_safe_encoder_config(hwaccel_type)
    logger.debug(f"编码器配置: {encoder_config}")

    total_clips = len(script_list)
    result = {}
    failed_clips = []
    success_count = 0

    logger.info(f"📹 开始统一视频裁剪,总共{total_clips}个片段")

    for i, script_item in enumerate(script_list, 1):
        _id = script_item.get("_id")
        ost = script_item.get("OST", 0)
        # .get() instead of indexing: a script item without "timestamp" must
        # not abort the whole batch here. The per-segment helper raises inside
        # the try block below, so the item is recorded as one failed clip like
        # every other per-item error.
        timestamp = script_item.get("timestamp")

        logger.info(f"📹 [{i}/{total_clips}] 处理片段 ID:{_id}, OST:{ost}, 时间戳:{timestamp}")

        try:
            if ost == 0:  # narration only
                output_path = _process_narration_only_segment(
                    video_origin_path, script_item, tts_map, output_dir,
                    encoder_config, hwaccel_args
                )
            elif ost == 1:  # original audio only
                output_path = _process_original_audio_segment(
                    video_origin_path, script_item, output_dir,
                    encoder_config, hwaccel_args
                )
            elif ost == 2:  # narration + original audio
                output_path = _process_mixed_segment(
                    video_origin_path, script_item, tts_map, output_dir,
                    encoder_config, hwaccel_args
                )
            else:
                logger.warning(f"未知的OST类型: {ost},跳过片段 {_id}")
                continue

            # Trust the file on disk, not only the helper's return value.
            if output_path and os.path.exists(output_path) and os.path.getsize(output_path) > 0:
                result[_id] = output_path
                success_count += 1
                logger.info(f"✅ [{i}/{total_clips}] 片段处理成功: OST={ost}, ID={_id}")
            else:
                failed_clips.append(f"ID:{_id}, OST:{ost}")
                logger.error(f"❌ [{i}/{total_clips}] 片段处理失败: OST={ost}, ID={_id}")

        except Exception as e:
            failed_clips.append(f"ID:{_id}, OST:{ost}")
            logger.error(f"❌ [{i}/{total_clips}] 片段处理异常: OST={ost}, ID={_id}, 错误: {str(e)}")

    logger.info(f"📊 统一视频裁剪完成: 成功 {success_count}/{total_clips}, 失败 {len(failed_clips)}")

    if failed_clips:
        logger.warning(f"⚠️ 以下片段处理失败: {failed_clips}")
        if len(failed_clips) == total_clips:
            raise RuntimeError("所有视频片段处理都失败了,请检查视频文件和ffmpeg配置")
        elif len(failed_clips) > total_clips / 2:
            logger.warning(f"⚠️ 超过一半的片段处理失败 ({len(failed_clips)}/{total_clips}),请检查硬件加速配置")

    if success_count > 0:
        logger.info(f"🎉 统一视频裁剪任务完成! 输出目录: {output_dir}")

    return result
{} failed_clips = [] success_count = 0 logger.info(f"📹 开始裁剪视频,总共{total_clips}个片段") for i, item in enumerate(tts_result, 1): _id = item.get("_id", item.get("timestamp", "unknown")) timestamp = item["timestamp"] start_time, _ = parse_timestamp(timestamp) # 根据持续时间计算真正的结束时间(加上1秒余量) duration = item["duration"] # 时长合理性检查和修正 if duration <= 0 or duration > 300: # 超过5分钟认为不合理 logger.warning(f"检测到异常时长 {duration}秒,片段: {timestamp}") # 尝试从时间戳计算实际时长 try: start_time_str, end_time_str = timestamp.split('-') # 解析开始时间 if ',' in start_time_str: time_part, ms_part = start_time_str.split(',') h1, m1, s1 = map(int, time_part.split(':')) ms1 = int(ms_part) else: h1, m1, s1 = map(int, start_time_str.split(':')) ms1 = 0 # 解析结束时间 if ',' in end_time_str: time_part, ms_part = end_time_str.split(',') h2, m2, s2 = map(int, time_part.split(':')) ms2 = int(ms_part) else: h2, m2, s2 = map(int, end_time_str.split(':')) ms2 = 0 # 计算实际时长 start_total_ms = (h1 * 3600 + m1 * 60 + s1) * 1000 + ms1 end_total_ms = (h2 * 3600 + m2 * 60 + s2) * 1000 + ms2 actual_duration = (end_total_ms - start_total_ms) / 1000.0 if actual_duration > 0 and actual_duration <= 300: duration = actual_duration logger.info(f"使用时间戳计算的实际时长: {duration:.3f}秒") else: duration = 5.0 # 默认5秒 logger.warning(f"时间戳计算也异常,使用默认时长: {duration}秒") except Exception as e: duration = 5.0 # 默认5秒 logger.warning(f"时长修正失败,使用默认时长: {duration}秒, 错误: {str(e)}") calculated_end_time = calculate_end_time(start_time, duration) # 转换为FFmpeg兼容的时间格式(逗号替换为点) ffmpeg_start_time = start_time.replace(',', '.') ffmpeg_end_time = calculated_end_time.replace(',', '.') # 格式化输出文件名(使用连字符替代冒号和逗号) safe_start_time = start_time.replace(':', '-').replace(',', '-') safe_end_time = calculated_end_time.replace(':', '-').replace(',', '-') output_filename = f"vid_{safe_start_time}@{safe_end_time}.mp4" output_path = os.path.join(output_dir, output_filename) # 构建FFmpeg命令 ffmpeg_cmd = build_ffmpeg_command( video_origin_path, output_path, ffmpeg_start_time, ffmpeg_end_time, encoder_config, 
if __name__ == "__main__":
    # Manual smoke test: cut four narrated segments out of a local sample video.
    video_origin_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/qyn2-2无片头片尾.mp4"

    tts_result = [
        {
            'timestamp': '00:00:00-00:01:15',
            'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3',
            'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt',
            'duration': 25.55,
            'text': '好的各位,欢迎回到我的频道!《庆余年 2》刚开播就给了我们一个王炸!范闲在北齐"死"了?这怎么可能!上集片尾那个巨大的悬念,这一集就立刻揭晓了!范闲假死归来,他面临的第一个,也是最大的难关,就是如何面对他最敬爱的,同时也是最可怕的那个人——庆帝!',
        },
        {
            'timestamp': '00:01:15-00:04:40',
            'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3',
            'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt',
            'duration': 13.488,
            'text': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…',
        },
        {
            'timestamp': '00:04:58-00:05:45',
            'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3',
            'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt',
            'duration': 21.363,
            'text': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!',
        },
        {
            'timestamp': '00:05:45-00:06:00',
            'audio_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3',
            'subtitle_file': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt',
            'duration': 7.675,
            'text': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!',
        },
    ]

    # The third positional parameter of clip_video() is output_dir. The old
    # demo passed a dict of previously produced clip paths there, so it always
    # ended in the error branch. Let clip_video() pick its default output
    # directory instead.
    try:
        result = clip_video(video_origin_path, tts_result)
        print("裁剪结果:")
        print(json.dumps(result, indent=4, ensure_ascii=False))
    except Exception as e:
        print(f"发生错误: {e}")
def parse_frame_analysis_to_markdown(json_file_path):
    """
    Render a frame-analysis JSON file as Markdown.

    One section is produced per entry of ``overall_activity_summaries``; the
    entries of ``frame_observations`` that share the section's ``batch_index``
    are listed under it in file order.

    :param json_file_path: Path of the JSON file to read
    :return: The Markdown text, or an error message string when the file is
        missing or cannot be processed (no exception is raised)
    """
    if not os.path.exists(json_file_path):
        return f"错误: 文件 {json_file_path} 不存在"

    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            analysis = json.load(handle)

        section_summaries = analysis.get('overall_activity_summaries', [])
        observations = analysis.get('frame_observations', [])

        # Group the per-frame observations by the batch they belong to.
        frames_by_batch = {}
        for entry in observations:
            frames_by_batch.setdefault(entry.get('batch_index'), []).append(entry)

        pieces = []
        for number, section in enumerate(section_summaries, 1):
            batch_key = section.get('batch_index')
            time_range = section.get('time_range', '')
            description = section.get('summary', '')

            pieces.append(f"## 片段 {number}\n")
            pieces.append(f"- 时间范围:{time_range}\n")
            if description:
                pieces.append(f"- 片段描述:{description}\n")
            else:
                pieces.append("- 片段描述:\n")
            pieces.append("- 详细描述:\n")

            for entry in frames_by_batch.get(batch_key, []):
                stamp = entry.get('timestamp', '')
                note = entry.get('observation', '')
                # The observation text is emitted verbatim, never split.
                if note:
                    pieces.append(f" - {stamp}: {note}\n")
                else:
                    pieces.append(f" - {stamp}: \n")

            pieces.append("\n")

        return "".join(pieces)

    except Exception:
        return f"处理JSON文件时出错: {traceback.format_exc()}"
def _generate_narration_legacy(markdown_content, api_key, base_url, model):
    """
    Legacy narration generator, kept as the fallback for the new LLM service.

    Builds the "documentary / narration_generation" prompt from the frame
    description and sends it through the OpenAI SDK. JSON output mode is
    requested for every model except ``deepseek-reasoner``; for that model the
    Markdown code fences are stripped from the reply instead.

    :param markdown_content: Frame analysis rendered as Markdown
    :param api_key: API key
    :param base_url: API base URL
    :param model: Model name
    :return: The generated narration script, or an error message string when
        the request fails (no exception is raised)
    """
    try:
        prompt = PromptManager.get_prompt(
            category="documentary",
            name="narration_generation",
            parameters={
                "video_frame_description": markdown_content
            }
        )

        client = OpenAI(
            api_key=api_key,
            base_url=base_url
        )

        # deepseek-reasoner does not support JSON output mode.
        supports_json_mode = model not in ["deepseek-reasoner"]

        request_kwargs = {
            "model": model,
            "messages": [
                {"role": "system", "content": "你是一名专业的短视频解说文案撰写专家。"},
                {"role": "user", "content": prompt}
            ],
            "temperature": 1.5,
        }
        if supports_json_mode:
            request_kwargs["response_format"] = {"type": "json_object"}

        response = client.chat.completions.create(**request_kwargs)

        if not response.choices:
            return "生成解说文案失败: 未获取到有效响应"

        narration_script = response.choices[0].message.content

        # Some OpenAI-compatible providers omit `usage`. Logging token counts
        # must never discard a reply that was generated successfully.
        usage = getattr(response, "usage", None)
        if usage is not None:
            if supports_json_mode:
                logger.debug(f"消耗的tokens: {usage.total_tokens}")
            else:
                logger.debug(f"文案消耗的tokens: {usage.total_tokens}")

        if not supports_json_mode:
            # Without JSON mode the reply may be wrapped in ```json ... ``` fences.
            narration_script = narration_script.replace("```json", "").replace("```", "")

        return narration_script

    except Exception:
        return f"调用API生成解说文案时出错: {traceback.format_exc()}"
text_api_key = "sk-xxx" text_model = "deepseek-reasoner" text_base_url = "https://api.deepseek.com" video_frame_description_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_1139.json" # 测试新的JSON文件 test_file_path = "/Users/apple/Desktop/home/NarratoAI/storage/temp/analysis/frame_analysis_20250508_2258.json" markdown_output = parse_frame_analysis_to_markdown(test_file_path) # print(markdown_output) # 输出到文件以便检查格式 output_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/家里家外1-5.md" with open(output_file, 'w', encoding='utf-8') as f: f.write(markdown_output) # print(f"\n已将Markdown输出保存到: {output_file}") # # 生成解说文案 # narration = generate_narration( # markdown_output, # text_api_key, # base_url=text_base_url, # model=text_model # ) # # # 保存解说文案 # print(narration) # print(type(narration)) # narration_file = "/Users/apple/Desktop/home/NarratoAI/storage/temp/final_narration_script.json" # with open(narration_file, 'w', encoding='utf-8') as f: # f.write(narration) # print(f"\n已将解说文案保存到: {narration_file}") ================================================ FILE: app/services/generate_video.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- ''' @Project: NarratoAI @File : generate_video @Author : Viccy同学 @Date : 2025/5/7 上午11:55 ''' import os import traceback import tempfile from typing import Optional, Dict, Any from loguru import logger from moviepy import ( VideoFileClip, AudioFileClip, CompositeAudioClip, CompositeVideoClip, TextClip, afx ) from moviepy.video.tools.subtitles import SubtitlesClip from PIL import ImageFont from app.utils import utils from app.models.schema import AudioVolumeDefaults from app.services.audio_normalizer import AudioNormalizer, normalize_audio_for_mixing def is_valid_subtitle_file(subtitle_path: str) -> bool: """ 检查字幕文件是否有效 参数: subtitle_path: 字幕文件路径 返回: bool: 如果字幕文件存在且包含有效内容则返回True,否则返回False """ if not subtitle_path or not os.path.exists(subtitle_path): 
def merge_materials(
    video_path: str,
    audio_path: str,
    output_path: str,
    subtitle_path: Optional[str] = None,
    bgm_path: Optional[str] = None,
    options: Optional[Dict[str, Any]] = None
) -> str:
    """
    Merge a video with narration audio, optional background music and optional
    subtitles into the final video.

    Args:
        video_path: Path of the video file.
        audio_path: Path of the narration (TTS) audio file.
        output_path: Path of the video to write.
        subtitle_path: Path of an SRT subtitle file, optional.
        bgm_path: Path of a background-music file, optional.
        options: Optional settings. Recognised keys:
            - voice_volume: narration gain, default AudioVolumeDefaults.VOICE_VOLUME
            - bgm_volume: BGM gain, default AudioVolumeDefaults.BGM_VOLUME
            - original_audio_volume: gain of the video's own audio,
              default AudioVolumeDefaults.ORIGINAL_VOLUME
            - keep_original_audio: keep the video's own audio, default True
            - subtitle_font: font file name, resolved inside utils.font_dir(),
              default '' (no explicit font)
            - subtitle_font_size: default 40
            - subtitle_color: default '#FFFFFF'
            - subtitle_bg_color: default 'transparent' (passed on as None)
            - subtitle_position: 'bottom', 'top' or 'custom'; any other value
              centers the subtitle; default 'bottom'
            - custom_position: vertical position in percent, used with
              'custom', default 70
            - stroke_color: default '#000000'
            - stroke_width: default 1
            - threads: default 2
            - fps: default 30
            - subtitle_enabled: default True

    Returns:
        ``output_path``.

    Raises:
        Exception: Whatever loading the video or exporting the result raises
        is logged and re-raised. Failures while adding narration, BGM or
        subtitles are logged and the merge continues without that element.
    """
    if options is None:
        options = {}

    # Defaults come from the shared volume configuration.
    voice_volume = options.get('voice_volume', AudioVolumeDefaults.VOICE_VOLUME)
    bgm_volume = options.get('bgm_volume', AudioVolumeDefaults.BGM_VOLUME)
    original_audio_volume = options.get('original_audio_volume', AudioVolumeDefaults.ORIGINAL_VOLUME)
    keep_original_audio = options.get('keep_original_audio', True)
    subtitle_font = options.get('subtitle_font', '')
    subtitle_font_size = options.get('subtitle_font_size', 40)
    subtitle_color = options.get('subtitle_color', '#FFFFFF')
    subtitle_bg_color = options.get('subtitle_bg_color', 'transparent')
    subtitle_position = options.get('subtitle_position', 'bottom')
    custom_position = options.get('custom_position', 70)
    stroke_color = options.get('stroke_color', '#000000')
    stroke_width = options.get('stroke_width', 1)
    threads = options.get('threads', 2)
    fps = options.get('fps', 30)
    subtitle_enabled = options.get('subtitle_enabled', True)

    logger.info("音量配置详情:")
    logger.info(f" - 配音音量: {voice_volume}")
    logger.info(f" - 背景音乐音量: {bgm_volume}")
    logger.info(f" - 原声音量: {original_audio_volume}")
    logger.info(f" - 是否保留原声: {keep_original_audio}")
    logger.info("字幕配置详情:")
    logger.info(f" - 是否启用字幕: {subtitle_enabled}")
    logger.info(f" - 字幕文件路径: {subtitle_path}")

    def validate_volume(volume, name):
        """Clamp a volume into [MIN_VOLUME, MAX_VOLUME], warning when it had to be changed."""
        if not (AudioVolumeDefaults.MIN_VOLUME <= volume <= AudioVolumeDefaults.MAX_VOLUME):
            logger.warning(f"{name}音量 {volume} 超出有效范围 [{AudioVolumeDefaults.MIN_VOLUME}, {AudioVolumeDefaults.MAX_VOLUME}],将被限制")
            return max(AudioVolumeDefaults.MIN_VOLUME, min(volume, AudioVolumeDefaults.MAX_VOLUME))
        return volume

    voice_volume = validate_volume(voice_volume, "配音")
    bgm_volume = validate_volume(bgm_volume, "背景音乐")
    original_audio_volume = validate_volume(original_audio_volume, "原声")

    # 'transparent' is not passed to TextClip; None is used for a transparent background.
    if subtitle_bg_color == 'transparent':
        subtitle_bg_color = None

    # os.makedirs('') raises, so only create the directory when output_path
    # actually has a directory part.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    logger.info("开始合并素材...")
    logger.info(f" ① 视频: {video_path}")
    logger.info(f" ② 音频: {audio_path}")
    if subtitle_path:
        logger.info(f" ③ 字幕: {subtitle_path}")
    if bgm_path:
        logger.info(f" ④ 背景音乐: {bgm_path}")
    logger.info(f" ⑤ 输出: {output_path}")

    try:
        video_clip = VideoFileClip(video_path)
        logger.info(f"视频尺寸: {video_clip.size[0]}x{video_clip.size[1]}, 时长: {video_clip.duration}秒")

        # Extract the video's own audio untouched. Its gain is applied exactly
        # once, further down, when the track is added to the mix. Applying it
        # here as well made the effective gain volume * volume.
        original_audio = None
        if keep_original_audio and original_audio_volume > 0:
            try:
                original_audio = video_clip.audio
                if original_audio:
                    logger.info("已提取视频原声")
                else:
                    original_audio = None
                    logger.warning("视频没有音轨,无法提取原声")
            except Exception as e:
                logger.error(f"提取视频原声失败: {str(e)}")
                original_audio = None

        # Drop the original track; the mixed audio is attached later.
        video_clip = video_clip.without_audio()

    except Exception as e:
        logger.error(f"加载视频失败: {str(e)}")
        raise

    audio_tracks = []

    # Optional smart volume balancing between narration and original audio.
    if AudioVolumeDefaults.ENABLE_SMART_VOLUME and audio_path and os.path.exists(audio_path) and original_audio is not None:
        try:
            normalizer = AudioNormalizer()

            # TemporaryDirectory removes the scratch file even when the
            # analysis fails.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_original_path = os.path.join(temp_dir, "temp_original.wav")

                # No `verbose` keyword: MoviePy 2.x does not accept it, and
                # passing it made this whole block fail on every call.
                original_audio.write_audiofile(temp_original_path, logger=None)

                tts_adjustment, original_adjustment = normalizer.calculate_volume_adjustment(
                    audio_path, temp_original_path
                )

            # Scale the user's settings so their relative ratio is respected,
            # then clamp to avoid extreme corrections.
            smart_voice_volume = max(0.1, min(1.5, voice_volume * tts_adjustment))
            smart_original_volume = max(0.1, min(2.0, original_audio_volume * original_adjustment))

            voice_volume = smart_voice_volume
            original_audio_volume = smart_original_volume

            logger.info(f"智能音量调整 - TTS: {voice_volume:.2f}, 原声: {original_audio_volume:.2f}")

        except Exception as e:
            logger.warning(f"智能音量分析失败,使用原始设置: {e}")

    # Narration first.
    if audio_path and os.path.exists(audio_path):
        try:
            voice_audio = AudioFileClip(audio_path).with_effects([afx.MultiplyVolume(voice_volume)])
            audio_tracks.append(voice_audio)
            logger.info(f"已添加配音音频,音量: {voice_volume}")
        except Exception as e:
            logger.error(f"加载配音音频失败: {str(e)}")

    # Original audio: the (possibly smart-adjusted) gain is applied here, once.
    if original_audio is not None:
        if abs(original_audio_volume - 1.0) > 0.001:  # tolerance for float comparison
            original_audio = original_audio.with_effects([afx.MultiplyVolume(original_audio_volume)])
        audio_tracks.append(original_audio)
        logger.info(f"已添加视频原声,最终音量: {original_audio_volume}")

    # Background music, looped to the video length with a 3 s fade-out.
    if bgm_path and os.path.exists(bgm_path):
        try:
            bgm_clip = AudioFileClip(bgm_path).with_effects([
                afx.MultiplyVolume(bgm_volume),
                afx.AudioFadeOut(3),
                afx.AudioLoop(duration=video_clip.duration),
            ])
            audio_tracks.append(bgm_clip)
            logger.info(f"已添加背景音乐,音量: {bgm_volume}")
        except Exception:
            logger.error(f"添加背景音乐失败: \n{traceback.format_exc()}")

    if audio_tracks:
        final_audio = CompositeAudioClip(audio_tracks)
        video_clip = video_clip.with_audio(final_audio)
        logger.info(f"已合成所有音频轨道,共{len(audio_tracks)}个")
    else:
        logger.warning("没有可用的音频轨道,输出视频将没有声音")

    # Resolve the font file only when subtitles and a font were both given.
    font_path = None
    if subtitle_path and subtitle_font:
        font_path = os.path.join(utils.font_dir(), subtitle_font)
        if os.name == "nt":
            font_path = font_path.replace("\\", "/")
        logger.info(f"使用字体: {font_path}")

    video_width, video_height = video_clip.size

    def create_text_clip(subtitle_item):
        """Build the positioned, timed TextClip for one ((start, end), text) subtitle entry."""
        phrase = subtitle_item[1]
        max_width = video_width * 0.9

        # Wrapping needs a real font to measure with.
        wrapped_txt = phrase
        if font_path:
            wrapped_txt, _ = wrap_text(
                phrase,
                max_width=max_width,
                font=font_path,
                fontsize=subtitle_font_size
            )

        try:
            _clip = TextClip(
                text=wrapped_txt,
                font=font_path,
                font_size=subtitle_font_size,
                color=subtitle_color,
                bg_color=subtitle_bg_color,  # None means transparent, see above
                stroke_color=stroke_color,
                stroke_width=stroke_width,
            )
        except Exception as e:
            logger.error(f"创建字幕片段失败: {str(e)}, 使用简化参数重试")
            # Retry with the minimal set of parameters.
            _clip = TextClip(
                text=wrapped_txt,
                font=font_path,
                font_size=subtitle_font_size,
                color=subtitle_color,
            )

        start, end = subtitle_item[0][0], subtitle_item[0][1]
        _clip = _clip.with_start(start)
        _clip = _clip.with_end(end)
        _clip = _clip.with_duration(end - start)

        if subtitle_position == "bottom":
            _clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
        elif subtitle_position == "top":
            _clip = _clip.with_position(("center", video_height * 0.05))
        elif subtitle_position == "custom":
            # Keep the clip inside the frame with a small margin.
            margin = 10
            max_y = video_height - _clip.h - margin
            min_y = margin
            custom_y = (video_height - _clip.h) * (custom_position / 100)
            custom_y = max(min_y, min(custom_y, max_y))
            _clip = _clip.with_position(("center", custom_y))
        else:  # center
            _clip = _clip.with_position(("center", "center"))
        return _clip

    def make_textclip(text):
        """TextClip factory handed to SubtitlesClip."""
        return TextClip(
            text=text,
            font=font_path,
            font_size=subtitle_font_size,
            color=subtitle_color,
        )

    # Subtitles are added only when enabled, a path was given and the file is valid.
    if subtitle_enabled and subtitle_path:
        if is_valid_subtitle_file(subtitle_path):
            logger.info("字幕已启用,开始处理字幕文件")
            try:
                sub = SubtitlesClip(
                    subtitles=subtitle_path,
                    encoding="utf-8",
                    make_textclip=make_textclip
                )

                text_clips = [create_text_clip(subtitle_item=item) for item in sub.subtitles]

                video_clip = CompositeVideoClip([video_clip, *text_clips])
                logger.info(f"已添加{len(text_clips)}个字幕片段")
            except Exception:
                logger.error(f"处理字幕失败: \n{traceback.format_exc()}")
                logger.warning("字幕处理失败,继续生成无字幕视频")
        else:
            logger.warning(f"字幕文件无效或为空: {subtitle_path},跳过字幕处理")
    elif not subtitle_enabled:
        logger.info("字幕已禁用,跳过字幕处理")
    elif not subtitle_path:
        logger.info("未提供字幕文件路径,跳过字幕处理")

    try:
        video_clip.write_videofile(
            output_path,
            audio_codec="aac",
            temp_audiofile_path=output_dir,
            threads=threads,
            fps=fps,
        )
        logger.success(f"素材合并完成: {output_path}")
    except Exception as e:
        logger.error(f"导出视频失败: {str(e)}")
        raise
    finally:
        # Release the clip's resources whether or not the export succeeded.
        video_clip.close()
        del video_clip

    return output_path
ImageFont.truetype(font, fontsize) except: # 如果无法加载指定字体,使用默认字体 font_obj = ImageFont.load_default() def get_text_size(inner_text): inner_text = inner_text.strip() left, top, right, bottom = font_obj.getbbox(inner_text) return right - left, bottom - top width, height = get_text_size(text) if width <= max_width: return text, height processed = True _wrapped_lines_ = [] words = text.split(" ") _txt_ = "" for word in words: _before = _txt_ _txt_ += f"{word} " _width, _height = get_text_size(_txt_) if _width <= max_width: continue else: if _txt_.strip() == word.strip(): processed = False break _wrapped_lines_.append(_before) _txt_ = f"{word} " _wrapped_lines_.append(_txt_) if processed: _wrapped_lines_ = [line.strip() for line in _wrapped_lines_] result = "\n".join(_wrapped_lines_).strip() height = len(_wrapped_lines_) * height return result, height _wrapped_lines_ = [] chars = list(text) _txt_ = "" for word in chars: _txt_ += word _width, _height = get_text_size(_txt_) if _width <= max_width: continue else: _wrapped_lines_.append(_txt_) _txt_ = "" _wrapped_lines_.append(_txt_) result = "\n".join(_wrapped_lines_).strip() height = len(_wrapped_lines_) * height return result, height if __name__ == '__main__': merger_mp4 = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger.mp4' merger_sub = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merged_subtitle_00_00_00-00_01_30.srt' merger_audio = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/merger_audio.mp3' bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3' output_video = '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/combined_test.mp4' # 调用示例 options = { 'voice_volume': 1.0, # 配音音量 'bgm_volume': 0.1, # 背景音乐音量 'original_audio_volume': 1.0, # 视频原声音量,0表示不保留 'keep_original_audio': True, # 是否保留原声 'subtitle_enabled': True, # 是否启用字幕 - 修复字幕开关bug 'subtitle_font': 'MicrosoftYaHeiNormal.ttc', # 这里使用相对字体路径,会自动在 font_dir() 目录下查找 'subtitle_font_size': 
class BaseLLMProvider(ABC):
    """Abstract base class shared by all LLM service providers."""

    def __init__(self, api_key: str, model_name: str, base_url: Optional[str] = None, **kwargs):
        """
        Store the connection settings, then validate and initialise the provider.

        Args:
            api_key: API key.
            model_name: Model name.
            base_url: Optional API base URL.
            **kwargs: Extra provider options, kept as ``self.config``.

        Raises:
            ConfigurationError: If ``api_key`` or ``model_name`` is empty.
        """
        self.api_key, self.model_name, self.base_url = api_key, model_name, base_url
        self.config = kwargs

        # Validation runs before the provider-specific hook.
        self._validate_config()
        self._initialize()

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Name of the provider."""
        ...

    @property
    @abstractmethod
    def supported_models(self) -> List[str]:
        """Models the provider declares as supported."""
        ...

    def _validate_config(self):
        """Reject an empty API key or model name, then check model support."""
        required = (
            (self.api_key, "API密钥不能为空", "api_key"),
            (self.model_name, "模型名称不能为空", "model_name"),
        )
        for value, message, config_key in required:
            if not value:
                raise ConfigurationError(message, config_key)

        self._validate_model_support()

    def _validate_model_support(self):
        """Lenient model check: an unlisted model only produces a warning."""
        known_models = self.supported_models
        if self.model_name in known_models:
            return

        logger.warning(
            f"模型 {self.model_name} 未在供应商 {self.provider_name} 的预定义支持列表中。"
            f"支持的模型列表: {known_models}"
        )

    def _initialize(self):
        """Provider-specific setup hook; subclasses may override it."""
        pass

    @abstractmethod
    async def _make_api_call(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Perform the API call; must be implemented by subclasses."""
        ...

    def _handle_api_error(self, status_code: int, response_text: str) -> LLMServiceError:
        """
        Map an HTTP status code to the matching service exception.

        The exception is returned, not raised. 401 maps to
        ``AuthenticationError``, 429 to ``RateLimitError``, and every other
        status to ``APICallError`` carrying the status and response text.
        """
        from .exceptions import APICallError, RateLimitError, AuthenticationError

        if status_code == 401:
            return AuthenticationError()
        if status_code == 429:
            return RateLimitError()

        if status_code in (502, 503, 504):
            message = f"服务器错误 HTTP {status_code}"
        elif status_code == 524:
            message = f"服务器处理超时 HTTP {status_code}"
        else:
            message = f"HTTP {status_code}"
        return APICallError(message, status_code, response_text)
@staticmethod
def validate_all_configs() -> Dict[str, Any]:
    """
    Validate the configuration of every registered vision and text provider.

    Returns:
        A dict with three keys. ``"vision_providers"`` and ``"text_providers"``
        map each provider name to its individual validation result.
        ``"summary"`` holds the total/valid counters plus the aggregated
        ``"errors"`` and ``"warnings"`` lists; each aggregated warning is
        prefixed with ``[<kind>_<provider>]`` so its origin stays visible.
    """
    summary: Dict[str, Any] = {
        "total_vision_providers": 0,
        "valid_vision_providers": 0,
        "total_text_providers": 0,
        "valid_text_providers": 0,
        "errors": [],
        "warnings": []
    }
    results: Dict[str, Any] = {
        "vision_providers": {},
        "text_providers": {},
        "summary": summary
    }

    # (kind used in the result keys, label used in the error message,
    #  provider lister, per-provider validator)
    checks = (
        ("vision", "视觉", LLMServiceManager.list_vision_providers,
         LLMConfigValidator.validate_vision_provider),
        ("text", "文本", LLMServiceManager.list_text_providers,
         LLMConfigValidator.validate_text_provider),
    )

    for kind, label, list_providers, validate in checks:
        providers = list_providers()
        summary[f"total_{kind}_providers"] = len(providers)
        per_provider = results[f"{kind}_providers"]

        for provider in providers:
            try:
                outcome = validate(provider)
                per_provider[provider] = outcome

                if outcome["is_valid"]:
                    summary[f"valid_{kind}_providers"] += 1
                else:
                    summary["errors"].extend(outcome["errors"])

                # Surface per-provider warnings in the summary; without this
                # the summary's warning list always stays empty.
                summary["warnings"].extend(
                    f"[{kind}_{provider}] {warning}" for warning in outcome["warnings"]
                )
            except Exception as e:
                error_msg = f"验证{label}模型提供商 {provider} 时发生错误: {str(e)}"
                per_provider[provider] = {
                    "is_valid": False,
                    "errors": [error_msg],
                    "warnings": []
                }
                summary["errors"].append(error_msg)

    return results
@staticmethod
def validate_text_provider(provider_name: str) -> Dict[str, Any]:
    """
    Validate the configuration of one text model provider.

    Args:
        provider_name: Provider name; settings are read from ``config.app``
            under the ``text_<provider_name>_`` key prefix.

    Returns:
        A dict with ``"is_valid"``, ``"errors"``, ``"warnings"`` and
        ``"config"`` (the API key is masked as ``"***"``).
    """
    report = {
        "is_valid": False,
        "errors": [],
        "warnings": [],
        "config": {}
    }
    errors = report["errors"]
    warnings = report["warnings"]

    try:
        prefix = f"text_{provider_name}"
        api_key = config.app.get(f'{prefix}_api_key')
        model_name = config.app.get(f'{prefix}_model_name')
        base_url = config.app.get(f'{prefix}_base_url')

        report["config"] = {
            "api_key": "***" if api_key else None,
            "model_name": model_name,
            "base_url": base_url
        }

        # Required settings.
        if not api_key:
            errors.append(f"缺少API密钥配置: {prefix}_api_key")
        if not model_name:
            errors.append(f"缺少模型名称配置: {prefix}_model_name")

        # Only try to build the provider when both required settings exist.
        if api_key and model_name:
            try:
                LLMServiceManager.get_text_provider(provider_name)
                report["is_valid"] = True
                logger.debug(f"文本模型提供商 {provider_name} 配置验证成功")
            except Exception as e:
                errors.append(f"创建提供商实例失败: {str(e)}")

        # A missing base URL is only worth a warning.
        if not base_url:
            warnings.append("未配置base_url,将使用默认值")

    except Exception as e:
        errors.append(f"配置验证过程中发生错误: {str(e)}")

    return report
vision_providers = LLMServiceManager.list_vision_providers() for provider in vision_providers: suggestions["vision_providers"][provider] = { "required_configs": [ f"vision_{provider}_api_key", f"vision_{provider}_model_name" ], "optional_configs": [ f"vision_{provider}_base_url" ], "example_models": LLMConfigValidator._get_example_models(provider, "vision") } # 为每个文本模型提供商提供建议 text_providers = LLMServiceManager.list_text_providers() for provider in text_providers: suggestions["text_providers"][provider] = { "required_configs": [ f"text_{provider}_api_key", f"text_{provider}_model_name" ], "optional_configs": [ f"text_{provider}_base_url" ], "example_models": LLMConfigValidator._get_example_models(provider, "text") } return suggestions @staticmethod def _get_example_models(provider_name: str, model_type: str) -> List[str]: """获取示例模型名称""" examples = { "gemini": { "vision": ["gemini-2.5-flash", "gemini-2.0-flash-lite", "gemini-2.0-flash"], "text": ["gemini-2.5-flash", "gemini-2.0-flash", "gemini-1.5-pro"] }, "openai": { "vision": [], "text": ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"] }, "qwen": { "vision": ["qwen2.5-vl-32b-instruct"], "text": ["qwen-plus-1127", "qwen-turbo"] }, "deepseek": { "vision": [], "text": ["deepseek-chat", "deepseek-reasoner"] }, "siliconflow": { "vision": ["Qwen/Qwen2.5-VL-32B-Instruct"], "text": ["deepseek-ai/DeepSeek-R1", "Qwen/Qwen2.5-72B-Instruct"] } } return examples.get(provider_name, {}).get(model_type, []) @staticmethod def print_validation_report(validation_results: Dict[str, Any]): """ 打印验证报告 Args: validation_results: 验证结果 """ summary = validation_results["summary"] print("\n" + "="*60) print("LLM服务配置验证报告") print("="*60) print(f"\n📊 总体统计:") print(f" 视觉模型提供商: {summary['valid_vision_providers']}/{summary['total_vision_providers']} 有效") print(f" 文本模型提供商: {summary['valid_text_providers']}/{summary['total_text_providers']} 有效") if summary["errors"]: print(f"\n❌ 错误 ({len(summary['errors'])}):") for error in summary["errors"]: print(f" - 
class LLMServiceError(Exception):
    """Base class for all LLM service errors."""

    def __init__(self, message: str, error_code: Optional[str] = None, details: Optional[Dict[str, Any]] = None):
        """
        Args:
            message: Human-readable error message.
            error_code: Optional machine-readable code, shown as a prefix by ``str()``.
            details: Optional structured context; stored as ``{}`` when omitted.
        """
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.details = details or {}

    def __str__(self):
        if self.error_code:
            return f"[{self.error_code}] {self.message}"
        return self.message


class ProviderNotFoundError(LLMServiceError):
    """Raised when the requested provider is not registered."""

    def __init__(self, provider_name: str):
        super().__init__(
            message=f"未找到大模型供应商: {provider_name}",
            error_code="PROVIDER_NOT_FOUND",
            details={"provider_name": provider_name}
        )


class ConfigurationError(LLMServiceError):
    """Raised when a configuration value is missing or invalid."""

    def __init__(self, message: str, config_key: Optional[str] = None):
        super().__init__(
            message=f"配置错误: {message}",
            error_code="CONFIGURATION_ERROR",
            details={"config_key": config_key} if config_key else {}
        )


class APICallError(LLMServiceError):
    """Raised when a call to the provider API fails."""

    def __init__(self, message: str, status_code: Optional[int] = None, response_text: Optional[str] = None):
        super().__init__(
            message=f"API调用失败: {message}",
            error_code="API_CALL_ERROR",
            details={
                "status_code": status_code,
                "response_text": response_text
            }
        )


class ValidationError(LLMServiceError):
    """Raised when model output fails validation."""

    def __init__(self, message: str, validation_type: Optional[str] = None, invalid_data: Optional[Any] = None):
        super().__init__(
            message=f"输出验证失败: {message}",
            error_code="VALIDATION_ERROR",
            details={
                "validation_type": validation_type,
                # Compare against None explicitly: falsy payloads such as 0, ""
                # or [] are real invalid data and must not be dropped.
                "invalid_data": str(invalid_data) if invalid_data is not None else None
            }
        )


class ModelNotSupportedError(LLMServiceError):
    """Raised when a provider does not support the requested model."""

    def __init__(self, model_name: str, provider_name: str):
        super().__init__(
            message=f"供应商 {provider_name} 不支持模型 {model_name}",
            error_code="MODEL_NOT_SUPPORTED",
            details={
                "model_name": model_name,
                "provider_name": provider_name
            }
        )


class RateLimitError(LLMServiceError):
    """Raised when the API rate limit is exceeded."""

    def __init__(self, message: str = "API调用频率超限", retry_after: Optional[int] = None):
        super().__init__(
            message=message,
            error_code="RATE_LIMIT_ERROR",
            details={"retry_after": retry_after}
        )


class AuthenticationError(LLMServiceError):
    """Raised when the API key is invalid or lacks permission."""

    def __init__(self, message: str = "API密钥无效或权限不足"):
        super().__init__(
            message=message,
            error_code="AUTHENTICATION_ERROR"
        )


class ContentFilterError(LLMServiceError):
    """Raised when content is blocked by a safety filter."""

    def __init__(self, message: str = "内容被安全过滤器阻止"):
        super().__init__(
            message=message,
            error_code="CONTENT_FILTER_ERROR"
        )
async def analyze_images(self,
                         images: List[Union[str, Path, PIL.Image.Image]],
                         prompt: str,
                         batch_size: int = 10,
                         **kwargs) -> List[str]:
    """
    Analyse images with LiteLLM, one request per batch.

    Args:
        images: Image paths or PIL image objects.
        prompt: Analysis prompt sent with every batch.
        batch_size: Number of images per request.
        **kwargs: Extra options forwarded to ``_analyze_batch``.

    Returns:
        One entry per batch, in order: the model answer, or a failure message
        when the batch raised.
    """
    logger.info(f"开始使用 LiteLLM ({self.model_name}) 分析 {len(images)} 张图片")

    prepared = self._prepare_images(images)

    outputs = []
    batch_starts = range(0, len(prepared), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        chunk = prepared[start:start + batch_size]
        logger.info(f"处理第 {batch_no} 批,共 {len(chunk)} 张图片")

        try:
            outputs.append(await self._analyze_batch(chunk, prompt, **kwargs))
        except Exception as e:
            # A failed batch does not abort the run; its slot holds the error text.
            logger.error(f"批次 {batch_no} 处理失败: {str(e)}")
            outputs.append(f"批次处理失败: {str(e)}")

    return outputs
def _image_to_base64(self, img: PIL.Image.Image) -> str:
    """
    Encode a PIL image as a base64 JPEG string.

    Args:
        img: Image to encode. It is not modified; when a mode conversion is
            needed, ``convert`` returns a new image.

    Returns:
        The JPEG bytes (quality 85) encoded as base64 text.
    """
    # Pillow's JPEG writer only accepts these modes; anything else (RGBA, LA,
    # P, I, F, ... - e.g. PNG frames with transparency) raises
    # "OSError: cannot write mode ... as JPEG". Convert those to RGB first and
    # leave already-supported modes untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if img.mode not in jpeg_modes:
        img = img.convert("RGB")

    img_buffer = io.BytesIO()
    img.save(img_buffer, format='JPEG', quality=85)
    return base64.b64encode(img_buffer.getvalue()).decode('utf-8')
def _initialize(self):
    """
    Apply the LiteLLM-specific settings for this text provider.

    Writes the API key into the environment variable associated with the
    provider name and, when a base URL is configured, stores it as
    ``self._api_base``.
    """
    import os

    provider_key = self.provider_name.lower()

    # Provider name -> environment variable that receives the API key.
    known_env_vars = {
        "gemini": "GEMINI_API_KEY",
        "google": "GEMINI_API_KEY",
        "openai": "OPENAI_API_KEY",
        "qwen": "QWEN_API_KEY",
        "dashscope": "DASHSCOPE_API_KEY",
        "siliconflow": "SILICONFLOW_API_KEY",
        "deepseek": "DEEPSEEK_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "claude": "ANTHROPIC_API_KEY",
        "moonshot": "MOONSHOT_API_KEY"
    }

    try:
        env_name = known_env_vars[provider_key]
    except KeyError:
        # Unknown providers get the conventional <PROVIDER>_API_KEY name.
        env_name = f"{provider_key.upper()}_API_KEY"

    if self.api_key and env_name:
        os.environ[env_name] = self.api_key
        logger.debug(f"设置环境变量: {env_name}")

    # Kept only when configured; later code checks for the attribute's presence.
    if self.base_url:
        self._api_base = self.base_url
        logger.debug(f"使用自定义 API base URL: {self.base_url}")
self.model_name: effective_model_name = f"openai/{self.model_name.split('/', 1)[1]}" else: effective_model_name = f"openai/{self.model_name}" # 确保设置了 OPENAI_API_KEY (如果尚未设置) import os if not os.environ.get("OPENAI_API_KEY") and os.environ.get("SILICONFLOW_API_KEY"): os.environ["OPENAI_API_KEY"] = os.environ.get("SILICONFLOW_API_KEY") # 确保设置了 base_url (如果尚未设置) if not hasattr(self, '_api_base'): self._api_base = "https://api.siliconflow.cn/v1" completion_kwargs = { "model": effective_model_name, "messages": messages, "temperature": temperature } if max_tokens: completion_kwargs["max_tokens"] = max_tokens # 处理 JSON 格式输出 # LiteLLM 会自动处理不同 provider 的 JSON mode 差异 if response_format == "json": try: completion_kwargs["response_format"] = {"type": "json_object"} except Exception as e: # 如果不支持,在提示词中添加约束 logger.warning(f"模型可能不支持 response_format,将在提示词中添加 JSON 约束: {str(e)}") messages[-1]["content"] += "\n\n请确保输出严格的JSON格式,不要包含任何其他文字或标记。" # 如果有自定义 base_url,添加 api_base 参数 if hasattr(self, '_api_base'): completion_kwargs["api_base"] = self._api_base # 支持动态传递 api_key 和 api_base (修复认证问题) if "api_key" in kwargs: completion_kwargs["api_key"] = kwargs["api_key"] if "api_base" in kwargs: completion_kwargs["api_base"] = kwargs["api_base"] try: # 调用 LiteLLM(自动重试) response = await acompletion(**completion_kwargs) if response.choices and len(response.choices) > 0: content = response.choices[0].message.content # 清理可能的 markdown 代码块(针对不支持 JSON mode 的模型) if response_format == "json" and "response_format" not in completion_kwargs: content = self._clean_json_output(content) logger.debug(f"LiteLLM 调用成功,消耗 tokens: {response.usage.total_tokens if response.usage else 'N/A'}") return content else: raise APICallError("LiteLLM 返回空响应") except LiteLLMAuthError as e: logger.error(f"LiteLLM 认证失败: {str(e)}") raise AuthenticationError() except LiteLLMRateLimitError as e: logger.error(f"LiteLLM 速率限制: {str(e)}") raise RateLimitError() except LiteLLMBadRequestError as e: error_msg = str(e) # 处理不支持 
def _clean_json_output(self, output: Optional[str]) -> str:
    """
    Strip markdown code-fence markers from a model response.

    Args:
        output: Raw model output. ``None`` or an empty string (a response
            without text content) is accepted.

    Returns:
        The output without fence lines and surrounding whitespace; ``""`` when
        there was no content.
    """
    import re

    # A response message may carry no text at all; re.sub would raise
    # TypeError on None.
    if not output:
        return ""

    # Opening fence with a language tag: drop the marker but keep anything that
    # follows it on the same line. Matched case-insensitively so "```JSON" is
    # treated like "```json".
    output = re.sub(r'^```json\s*', '', output, flags=re.MULTILINE | re.IGNORECASE)
    # Bare fence lines.
    output = re.sub(r'^```\s*$', '', output, flags=re.MULTILINE)
    # Any remaining line that starts with a fence (other language tags).
    output = re.sub(r'^```.*$', '', output, flags=re.MULTILINE)

    return output.strip()
@classmethod
def get_vision_provider(cls, provider_name: Optional[str] = None) -> VisionModelProvider:
    """
    Return the (cached) vision model provider instance.

    Args:
        provider_name: Provider name; when omitted, the ``vision_llm_provider``
            setting is used (default ``gemini``).

    Returns:
        The vision model provider instance.

    Raises:
        ProviderNotFoundError: The provider is not registered.
        ConfigurationError: No provider is registered at all, a required
            setting is missing, or the provider could not be instantiated.
    """
    if not cls.is_registered():
        raise ConfigurationError(
            "LLM 提供商未注册。请确保在应用启动时调用了 register_all_providers()。"
            f"\n当前已注册的提供商: {cls.get_registered_providers_info()}"
        )

    # Resolve the provider name (registry keys are lower-case).
    if not provider_name:
        provider_name = config.app.get('vision_llm_provider', 'gemini').lower()
    else:
        provider_name = provider_name.lower()

    # Instances are cached per provider name.
    cache_key = f"vision_{provider_name}"
    if cache_key in cls._vision_instance_cache:
        return cls._vision_instance_cache[cache_key]

    if provider_name not in cls._vision_providers:
        raise ProviderNotFoundError(provider_name)

    config_prefix = f"vision_{provider_name}"
    api_key = config.app.get(f'{config_prefix}_api_key')
    model_name = config.app.get(f'{config_prefix}_model_name')
    base_url = config.app.get(f'{config_prefix}_base_url')

    if not api_key:
        raise ConfigurationError(f"缺少API密钥配置: {config_prefix}_api_key")
    if not model_name:
        raise ConfigurationError(f"缺少模型名称配置: {config_prefix}_model_name")

    provider_class = cls._vision_providers[provider_name]
    try:
        instance = provider_class(
            api_key=api_key,
            model_name=model_name,
            base_url=base_url
        )

        cls._vision_instance_cache[cache_key] = instance

        logger.info(f"创建视觉模型提供商实例: {provider_name} - {model_name}")
        return instance

    except ConfigurationError as e:
        # Already the documented exception type: re-raise it untouched so its
        # message is not wrapped twice and its ``details`` are preserved.
        logger.error(f"创建视觉模型提供商实例失败: {provider_name} - {str(e)}")
        raise
    except Exception as e:
        logger.error(f"创建视觉模型提供商实例失败: {provider_name} - {str(e)}")
        # Keep the original failure attached as the explicit cause.
        raise ConfigurationError(f"创建提供商实例失败: {str(e)}") from e
def _run_async_safely(coro_func, *args, **kwargs):
    """
    Run an async function to completion from synchronous code.

    When no event loop is running in the calling thread, the coroutine runs in
    a fresh loop in that thread. When a loop is already running, it runs in a
    fresh loop on a worker thread, and the caller blocks until it finishes.

    Args:
        coro_func: The coroutine *function* (not a coroutine object).
        *args: Positional arguments for ``coro_func``.
        **kwargs: Keyword arguments for ``coro_func``.

    Returns:
        Whatever the coroutine returns.

    Raises:
        LLMServiceError: If running the coroutine fails; the original
            exception is attached as ``__cause__``.
    """
    def run_in_new_loop():
        """Run the coroutine in a new event loop owned by the current thread."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(coro_func(*args, **kwargs))
        finally:
            loop.close()
            asyncio.set_event_loop(None)

    try:
        # Guard ONLY the probe with ``except RuntimeError``. If the worker run
        # were inside the same ``try``, a RuntimeError raised by the coroutine
        # itself would be mistaken for "no running loop" and trigger a second
        # run attempt that hides the real error.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: run directly.
            return run_in_new_loop()

        # A loop is running here, so the coroutine needs its own thread.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            return executor.submit(run_in_new_loop).result()
    except Exception as e:
        logger.error(f"异步执行失败: {str(e)}")
        raise LLMServiceError(f"异步执行失败: {str(e)}") from e
class SubtitleAnalyzerAdapter:
    """Backward-compatible subtitle analyzer built on top of UnifiedLLMService."""

    def __init__(self, api_key: str, model: str, base_url: str, provider: Optional[str] = None):
        """
        Store the connection settings used for later calls.

        Args:
            api_key: API key forwarded to the service on each call.
            model: Model name; only echoed back in the result dictionaries.
            base_url: API base URL, forwarded as ``api_base``.
            provider: Provider name; falls back to "openai" when empty.
        """
        self.api_key = api_key
        self.model = model
        self.base_url = base_url
        self.provider = provider or "openai"

    def _run_async_safely(self, coro_func, *args, **kwargs):
        """Delegate to the module-level helper that runs a coroutine function synchronously."""
        return _run_async_safely(coro_func, *args, **kwargs)

    def _clean_json_output(self, output: str) -> str:
        """
        Strip markdown code-fence markers from a model reply.

        Args:
            output: Raw text returned by the model.

        Returns:
            The text without fence markers and without surrounding whitespace.
        """
        import re

        # Remove an opening "```json" marker together with the whitespace after it.
        output = re.sub(r'^```json\s*', '', output, flags=re.MULTILINE)
        # Remove lines made up of a fence and, optionally, a language tag.
        # Lines that carry payload after the fence are kept, so JSON written
        # on the same line as the fence is no longer thrown away.
        output = re.sub(r'^```[ \t]*[\w+-]*[ \t]*$', '', output, flags=re.MULTILINE)

        # Remove a fence glued to the very start or very end of the text.
        output = re.sub(r'^```', '', output)
        output = re.sub(r'```$', '', output)

        return output.strip()

    def analyze_subtitle(self, subtitle_content: str) -> Dict[str, Any]:
        """
        Analyze subtitle text (legacy interface).

        Args:
            subtitle_content: Subtitle text to analyze.

        Returns:
            On success a dict with ``status="success"``, ``analysis``, ``model``
            and ``temperature``; on failure a dict with ``status="error"``,
            ``message`` and ``temperature``. Errors are reported, not raised.
        """
        try:
            result = self._run_async_safely(
                UnifiedLLMService.analyze_subtitle,
                subtitle_content=subtitle_content,
                provider=self.provider,
                temperature=1.0,
                api_key=self.api_key,
                api_base=self.base_url
            )

            return {
                "status": "success",
                "analysis": result,
                "model": self.model,
                "temperature": 1.0
            }

        except Exception as e:
            logger.error(f"字幕分析失败: {str(e)}")
            return {
                "status": "error",
                "message": str(e),
                "temperature": 1.0
            }

    def generate_narration_script(self, short_name: str, plot_analysis: str,
                                  subtitle_content: str = "", temperature: float = 0.7) -> Dict[str, Any]:
        """
        Generate a narration script (legacy interface).

        Args:
            short_name: Name of the short drama.
            plot_analysis: Plot analysis text.
            subtitle_content: Original subtitles, passed to the prompt template.
            temperature: Sampling temperature.

        Returns:
            On success a dict with ``status="success"``, ``narration_script``
            (the cleaned JSON string), ``model`` and ``temperature``; on failure
            a dict with ``status="error"``, ``message`` and ``temperature``.
        """
        try:
            # Build the prompt through the prompt management system.
            prompt = PromptManager.get_prompt(
                category="short_drama_narration",
                name="script_generation",
                parameters={
                    "drama_name": short_name,
                    "plot_analysis": plot_analysis,
                    "subtitle_content": subtitle_content
                }
            )

            result = self._run_async_safely(
                UnifiedLLMService.generate_text,
                prompt=prompt,
                system_prompt="你是一位专业的短视频解说脚本撰写专家。",
                provider=self.provider,
                temperature=temperature,
                response_format="json",
                api_key=self.api_key,
                api_base=self.base_url
            )

            # The cleaned JSON string is returned as-is (not parsed); per the
            # original notes, callers expect a JSON string with an items array.
            cleaned_result = self._clean_json_output(result)

            return {
                "status": "success",
                "narration_script": cleaned_result,
                "model": self.model,
                "temperature": temperature
            }

        except Exception as e:
            logger.error(f"解说文案生成失败: {str(e)}")
            return {
                "status": "error",
                "message": str(e),
                "temperature": temperature
            }
def register_all_providers():
    """
    Register the LLM provider implementations with the service manager.

    Since v0.8.0 only the LiteLLM unified interface is registered; the older
    per-vendor providers (gemini, openai, qwen, deepseek, siliconflow) were
    removed.
    """
    # Imports are deferred to call time to avoid circular imports.
    from ..manager import LLMServiceManager as manager
    from loguru import logger as log
    from ..litellm_provider import LiteLLMVisionProvider, LiteLLMTextProvider

    log.info("🔧 开始注册 LLM 提供商...")

    # Both the vision and the text implementation are registered under one key.
    provider_key = 'litellm'
    manager.register_vision_provider(provider_key, LiteLLMVisionProvider)
    manager.register_text_provider(provider_key, LiteLLMTextProvider)

    log.info("✅ LiteLLM 提供商注册完成(支持 100+ providers)")
def test_litellm_import():
    """
    Check that the LiteLLM library can be imported and report its version.

    Returns:
        True when ``litellm`` imports successfully, False otherwise.
    """
    logger.info("\n" + "=" * 60)
    logger.info("测试 2: LiteLLM 库导入检查")
    logger.info("=" * 60)

    try:
        import litellm
    except ImportError as e:
        logger.error(f"❌ LiteLLM 未安装: {str(e)}")
        logger.info("请运行: pip install litellm>=1.70.0")
        return False

    # Do not assume the package exposes `__version__`: a missing attribute
    # would raise AttributeError, which the ImportError handler above does not
    # cover. Fall back to the installed distribution metadata.
    version = getattr(litellm, "__version__", None)
    if version is None:
        from importlib import metadata
        try:
            version = metadata.version("litellm")
        except metadata.PackageNotFoundError:
            version = "unknown"

    logger.success(f"✅ LiteLLM 已安装,版本: {version}")
    return True
['gemini', 'openai', 'qwen', 'deepseek', 'siliconflow'] vision_providers = LLMServiceManager.list_vision_providers() text_providers = LLMServiceManager.list_text_providers() logger.info("检查旧 provider 是否仍然可用:") for provider in old_providers: if provider in ['openai', 'deepseek']: # 这些只有 text provider if provider in text_providers: logger.info(f" ✅ {provider} (text)") else: logger.warning(f" ⚠️ {provider} (text) 未注册") else: # 这些有 vision 和 text provider vision_ok = provider in vision_providers or f"{provider}vl" in vision_providers text_ok = provider in text_providers if vision_ok: logger.info(f" ✅ {provider} (vision)") if text_ok: logger.info(f" ✅ {provider} (text)") logger.success("✅ 向后兼容性测试通过") def print_usage_guide(): """打印使用指南""" logger.info("\n" + "=" * 60) logger.info("LiteLLM 使用指南") logger.info("=" * 60) guide = """ 📚 如何使用 LiteLLM: 1. 在 config.toml 中配置: ```toml [app] # 方式 1:直接使用 LiteLLM(推荐) vision_llm_provider = "litellm" vision_litellm_model_name = "gemini/gemini-2.0-flash-lite" vision_litellm_api_key = "your-api-key" text_llm_provider = "litellm" text_litellm_model_name = "deepseek/deepseek-chat" text_litellm_api_key = "your-api-key" ``` 2. 支持的模型格式: - Gemini: gemini/gemini-2.0-flash - DeepSeek: deepseek/deepseek-chat - Qwen: qwen/qwen-plus - OpenAI: gpt-4o, gpt-4o-mini - SiliconFlow: siliconflow/deepseek-ai/DeepSeek-R1 - 更多: 参考 https://docs.litellm.ai/docs/providers 3. 代码调用示例: ```python from app.services.llm.unified_service import UnifiedLLMService # 文本生成 result = await UnifiedLLMService.generate_text( prompt="你好", provider="litellm" ) # 视觉分析 results = await UnifiedLLMService.analyze_images( images=["path/to/image.jpg"], prompt="描述这张图片", provider="litellm" ) ``` 4. 优势: ✅ 减少 80% 代码量 ✅ 统一的错误处理 ✅ 自动重试机制 ✅ 支持 100+ providers ✅ 自动成本追踪 5. 
async def test_json_generation():
    """
    Check that text generation with ``response_format="json"`` yields parseable JSON.

    Returns:
        True when the reply parses as JSON, False on a parse error or any
        other failure.
    """
    # Imported before the try block: the `except json.JSONDecodeError` clause
    # below needs `json` bound even when the service call fails first.
    import json

    print("\n📄 测试JSON格式生成功能...")

    try:
        prompt = """
        请生成一个简单的解说文案示例,包含以下字段:
        - title: 标题
        - content: 内容
        - duration: 时长(秒)

        输出JSON格式。
        """

        result = await UnifiedLLMService.generate_text(
            prompt=prompt,
            system_prompt="你是一个专业的文案撰写专家。",
            temperature=0.7,
            response_format="json"
        )

        # Parsing is the actual check; a failure is reported by the handler below.
        parsed_result = json.loads(result)

        print(f"✅ JSON生成成功:")
        print(f" 生成结果: {json.dumps(parsed_result, ensure_ascii=False, indent=2)}")

        return True

    except json.JSONDecodeError as e:
        print(f"❌ JSON解析失败: {str(e)}")
        print(f" 原始结果: {result}")
        return False
    except Exception as e:
        print(f"❌ JSON生成失败: {str(e)}")
        return False
def test_provider_info():
    """
    Print the names of the registered vision and text providers.

    Returns:
        True when the provider information was fetched and printed,
        False when any step raised.
    """
    print("\n📋 测试提供商信息获取...")

    try:
        info = UnifiedLLMService.get_provider_info()

        # Collect both name lists before printing anything.
        vision_names = list(info["vision_providers"].keys())
        text_names = list(info["text_providers"].keys())

        print(f"✅ 提供商信息获取成功:")
        for label, names in ((" 视觉模型提供商", vision_names), (" 文本模型提供商", text_names)):
            print(f"{label}: {', '.join(names)}")

        return True

    except Exception as e:
        print(f"❌ 提供商信息获取失败: {str(e)}")
        return False
class UnifiedLLMService:
    """Unified facade over the vision and text providers held by LLMServiceManager."""

    @staticmethod
    async def analyze_images(images: List[Union[str, Path, PIL.Image.Image]],
                             prompt: str,
                             provider: Optional[str] = None,
                             batch_size: int = 10,
                             **kwargs) -> List[str]:
        """
        Analyze images with a vision provider.

        Args:
            images: Image paths or PIL image objects.
            prompt: Analysis prompt.
            provider: Vision provider name; None lets the manager choose.
            batch_size: Batch size forwarded to the provider.
            **kwargs: Extra arguments forwarded to the provider.

        Returns:
            The list of results returned by the provider.

        Raises:
            LLMServiceError: Any failure; the original exception is chained.
        """
        try:
            vision_provider = LLMServiceManager.get_vision_provider(provider)

            results = await vision_provider.analyze_images(
                images=images,
                prompt=prompt,
                batch_size=batch_size,
                **kwargs
            )

            logger.info(f"图片分析完成,共处理 {len(images)} 张图片,生成 {len(results)} 个结果")
            return results

        except Exception as e:
            logger.error(f"图片分析失败: {str(e)}")
            raise LLMServiceError(f"图片分析失败: {str(e)}") from e

    @staticmethod
    async def generate_text(prompt: str,
                            system_prompt: Optional[str] = None,
                            provider: Optional[str] = None,
                            temperature: float = 1.0,
                            max_tokens: Optional[int] = None,
                            response_format: Optional[str] = None,
                            **kwargs) -> str:
        """
        Generate text with a text provider.

        Args:
            prompt: User prompt.
            system_prompt: Optional system prompt.
            provider: Text provider name; None lets the manager choose.
            temperature: Sampling temperature.
            max_tokens: Optional token limit.
            response_format: Response format ('json' or None).
            **kwargs: Extra arguments forwarded to the provider.

        Returns:
            The generated text.

        Raises:
            LLMServiceError: Any failure; the original exception is chained.
        """
        try:
            text_provider = LLMServiceManager.get_text_provider(provider)

            result = await text_provider.generate_text(
                prompt=prompt,
                system_prompt=system_prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                response_format=response_format,
                **kwargs
            )

            logger.info(f"文本生成完成,生成内容长度: {len(result)} 字符")
            return result

        except Exception as e:
            logger.error(f"文本生成失败: {str(e)}")
            raise LLMServiceError(f"文本生成失败: {str(e)}") from e

    @staticmethod
    async def generate_narration_script(prompt: str,
                                        provider: Optional[str] = None,
                                        temperature: float = 1.0,
                                        validate_output: bool = True,
                                        **kwargs) -> List[Dict[str, Any]]:
        """
        Generate a narration script as a list of items.

        Args:
            prompt: Prompt text.
            provider: Text provider name.
            temperature: Sampling temperature.
            validate_output: When True the reply is checked by
                OutputValidator.validate_narration_script; otherwise it is
                only parsed as JSON.
            **kwargs: Extra arguments forwarded to generate_text.

        Returns:
            The narration items.

        Raises:
            LLMServiceError: Any failure; the original exception is chained.
        """
        try:
            result = await UnifiedLLMService.generate_text(
                prompt=prompt,
                provider=provider,
                temperature=temperature,
                response_format="json",
                **kwargs
            )

            if validate_output:
                narration_items = OutputValidator.validate_narration_script(result)
                logger.info(f"解说文案生成并验证完成,共 {len(narration_items)} 个片段")
                return narration_items

            # Unvalidated path: plain JSON parsing.
            import json
            parsed_result = json.loads(result)
            # Only unwrap "items" from an object. A top-level array is returned
            # unchanged instead of being indexed with a string key.
            if isinstance(parsed_result, dict) and "items" in parsed_result:
                return parsed_result["items"]
            return parsed_result

        except Exception as e:
            logger.error(f"解说文案生成失败: {str(e)}")
            raise LLMServiceError(f"解说文案生成失败: {str(e)}") from e

    @staticmethod
    async def analyze_subtitle(subtitle_content: str,
                               provider: Optional[str] = None,
                               temperature: float = 1.0,
                               validate_output: bool = True,
                               **kwargs) -> str:
        """
        Analyze subtitle text.

        Args:
            subtitle_content: Subtitle text, sent as the user prompt.
            provider: Text provider name.
            temperature: Sampling temperature.
            validate_output: When True the reply is checked by
                OutputValidator.validate_subtitle_analysis.
            **kwargs: Extra arguments forwarded to generate_text.

        Returns:
            The analysis text.

        Raises:
            LLMServiceError: Any failure; the original exception is chained.
        """
        try:
            system_prompt = "你是一位专业的剧本分析师和剧情概括助手。请仔细分析字幕内容,提取关键剧情信息。"

            result = await UnifiedLLMService.generate_text(
                prompt=subtitle_content,
                system_prompt=system_prompt,
                provider=provider,
                temperature=temperature,
                **kwargs
            )

            if validate_output:
                validated_result = OutputValidator.validate_subtitle_analysis(result)
                logger.info("字幕分析完成并验证通过")
                return validated_result
            return result

        except Exception as e:
            logger.error(f"字幕分析失败: {str(e)}")
            raise LLMServiceError(f"字幕分析失败: {str(e)}") from e

    @staticmethod
    def get_provider_info() -> Dict[str, Any]:
        """
        Return information about all providers.

        Returns:
            The dictionary produced by LLMServiceManager.get_provider_info().
        """
        return LLMServiceManager.get_provider_info()

    @staticmethod
    def list_vision_providers() -> List[str]:
        """
        List the vision providers.

        Returns:
            Provider names.
        """
        return LLMServiceManager.list_vision_providers()

    @staticmethod
    def list_text_providers() -> List[str]:
        """
        List the text providers.

        Returns:
            Provider names.
        """
        return LLMServiceManager.list_text_providers()

    @staticmethod
    def clear_cache():
        """Clear the provider instance cache held by the manager."""
        LLMServiceManager.clear_cache()
        logger.info("已清空大模型服务缓存")
class OutputValidator:
    """Validators that check model output against the formats the application expects."""

    @staticmethod
    def validate_json_output(output: str, schema: Optional[Dict[str, Any]] = None) -> Union[Dict[str, Any], List[Any]]:
        """
        Parse model output as JSON and optionally check it against a schema.

        Args:
            output: Raw output string.
            schema: Optional schema; only the checks implemented in
                _validate_json_schema are applied.

        Returns:
            The parsed JSON value.

        Raises:
            ValidationError: The text is not valid JSON or fails the schema check.
        """
        try:
            # Strip markdown fences before parsing.
            cleaned_output = OutputValidator._clean_json_output(output)

            parsed_json = json.loads(cleaned_output)

            if schema:
                OutputValidator._validate_json_schema(parsed_json, schema)

            return parsed_json

        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败: {str(e)}")
            logger.error(f"原始输出: {output}")
            raise ValidationError(f"JSON格式无效: {str(e)}", "json_parse", output)
        except ValidationError:
            # Schema failures already carry their own validation type; re-raise
            # them untouched, as validate_narration_script does.
            raise
        except Exception as e:
            logger.error(f"JSON验证失败: {str(e)}")
            raise ValidationError(f"JSON验证失败: {str(e)}", "json_validation", output)

    @staticmethod
    def _clean_json_output(output: str) -> str:
        """
        Strip markdown code-fence markers from a model reply.

        Args:
            output: Raw text returned by the model.

        Returns:
            The text without fence markers and without surrounding whitespace.
        """
        # Remove an opening "```json" marker together with the whitespace after it.
        output = re.sub(r'^```json\s*', '', output, flags=re.MULTILINE)
        # Remove lines made up of a fence and, optionally, a language tag.
        # Lines that carry payload after the fence are kept, so JSON written
        # on the same line as the fence is no longer thrown away.
        output = re.sub(r'^```[ \t]*[\w+-]*[ \t]*$', '', output, flags=re.MULTILINE)

        # Remove a fence glued to the very start or very end of the text.
        output = re.sub(r'^```', '', output)
        output = re.sub(r'```$', '', output)

        return output.strip()

    @staticmethod
    def _validate_json_schema(data: Dict[str, Any], schema: Dict[str, Any]):
        """
        Apply a simplified schema check.

        Only the top-level "type" (object/array) and the top-level "required"
        fields are checked; nested "properties"/"items" are not inspected.

        Raises:
            ValidationError: The type does not match or a required field is missing.
        """
        if "type" in schema:
            expected_type = schema["type"]
            if expected_type == "object" and not isinstance(data, dict):
                raise ValidationError(f"期望对象类型,实际为 {type(data)}", "schema_type")
            elif expected_type == "array" and not isinstance(data, list):
                raise ValidationError(f"期望数组类型,实际为 {type(data)}", "schema_type")

        if "required" in schema and isinstance(data, dict):
            for required_field in schema["required"]:
                if required_field not in data:
                    raise ValidationError(f"缺少必需字段: {required_field}", "schema_required")

    @staticmethod
    def validate_narration_script(output: str) -> List[Dict[str, Any]]:
        """
        Validate narration-script output.

        Args:
            output: Raw narration-script output (JSON text with an "items" array).

        Returns:
            The list stored under "items".

        Raises:
            ValidationError: The output is not valid JSON, lacks "items", or an
                item fails _validate_narration_item.
        """
        try:
            # Expected shape of the narration script.
            narration_schema = {
                "type": "object",
                "required": ["items"],
                "properties": {
                    "items": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "required": ["_id", "timestamp", "picture", "narration"],
                            "properties": {
                                "_id": {"type": "number"},
                                "timestamp": {"type": "string"},
                                "picture": {"type": "string"},
                                "narration": {"type": "string"},
                                "OST": {"type": "number"}
                            }
                        }
                    }
                }
            }

            parsed_data = OutputValidator.validate_json_output(output, narration_schema)

            items = parsed_data.get("items", [])

            # Per-item checks; the index is used for 1-based error messages.
            for i, item in enumerate(items):
                OutputValidator._validate_narration_item(item, i)

            logger.info(f"解说文案验证成功,共 {len(items)} 个片段")
            return items

        except ValidationError:
            raise
        except Exception as e:
            logger.error(f"解说文案验证失败: {str(e)}")
            raise ValidationError(f"解说文案验证失败: {str(e)}", "narration_validation", output)

    @staticmethod
    def _validate_narration_item(item: Dict[str, Any], index: int):
        """
        Validate a single narration item.

        Args:
            item: Item with "_id", "timestamp", "picture" and "narration".
            index: Zero-based position of the item.

        Raises:
            ValidationError: Bad timestamp format, empty picture/narration, or
                an "_id" that is not a positive number.
        """
        # Timestamp must start with "HH:MM:SS,mmm-HH:MM:SS,mmm".
        timestamp = item.get("timestamp", "")
        if not re.match(r'\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}', timestamp):
            raise ValidationError(f"第{index+1}项时间戳格式无效: {timestamp}", "timestamp_format")

        if not item.get("picture", "").strip():
            raise ValidationError(f"第{index+1}项画面描述不能为空", "empty_picture")

        if not item.get("narration", "").strip():
            raise ValidationError(f"第{index+1}项解说文案不能为空", "empty_narration")

        item_id = item.get("_id")
        if not isinstance(item_id, (int, float)) or item_id <= 0:
            raise ValidationError(f"第{index+1}项ID必须为正整数: {item_id}", "invalid_id")

    @staticmethod
    def validate_subtitle_analysis(output: str) -> str:
        """
        Validate subtitle-analysis output.

        Args:
            output: Raw analysis text.

        Returns:
            The analysis text with surrounding whitespace removed.

        Raises:
            ValidationError: The text is empty or shorter than 50 characters.
        """
        try:
            if not output or not output.strip():
                raise ValidationError("字幕分析结果不能为空", "empty_analysis")

            if len(output.strip()) < 50:
                raise ValidationError("字幕分析结果过短,可能不完整", "analysis_too_short")

            # Missing keywords only produce a warning, not a failure.
            analysis_keywords = ["剧情", "情节", "角色", "故事", "内容"]
            if not any(keyword in output for keyword in analysis_keywords):
                logger.warning("字幕分析结果可能缺少关键分析要素")

            logger.info("字幕分析验证成功")
            return output.strip()

        except ValidationError:
            raise
        except Exception as e:
            logger.error(f"字幕分析验证失败: {str(e)}")
            raise ValidationError(f"字幕分析验证失败: {str(e)}", "analysis_validation", output)
推荐新手(新号)做:(盘点型) 盘点全球最恐怖的10部电影 盘���全球最科幻的10部电影 盘点全球最悲惨的10部电影 盘全球最值得看的10部灾难电影 盘点全球最值得看的10部励志电影 下面的示例就是最简单的解说文案开头: 1.这是XXX国20年来最大尺度的一部剧,极度烧脑,却让99%的人看得心潮澎湃、无法自拔,故事开始…… 2.这是有史以来电影院唯一一部全程开灯放完的电影,期间无数人尖叫昏厥,他被成为勇敢者的专属,因为99%的人都不敢看到结局,许多人看完它从此不愿再碰手机,他就是大名鼎鼎的暗黑神作《XXX》…… 3.这到底是一部什么样的电影,能被55个国家公开抵制,它甚至为了上映,不惜删减掉整整47分钟的剧情…… 4.是什么样的一个人被豆瓣网友称之为史上最牛P的老太太,都70岁了还要去贩毒…… 5.他是M国历史上最NB/惨/猖狂/冤枉……的囚犯/抢劫犯/…… 6.这到底是一部什么样的影片,他一个人就拿了4个顶级奖项,第一季8.7分,第二季直接干到9.5分,11万人给出5星好评,一共也就6集,却斩获26项国际大奖,看过的人都说,他是近年来最好的xxx剧,几乎成为了近年来xxx剧的标杆。故事发生在…… 7.他是国产电影的巅峰佳作,更是许多80-90后的青春启蒙,曾入选《��代》周刊,获得年度佳片第一,可在国内却被尘封多年,至今为止都无法在各大视频网站看到完整资源,他就是《xxxxxx》 8.这是一部让所有人看得荷尔蒙飙升的爽片…… 9.他被成为世界上最虐心绝望的电影,至今无人敢看第二遍,很难想象,他是根据真实事件改编而来…… 10.这大概是有史以来最令人不寒而栗的电影,当年一经放映,就点燃了无数人的怒火,不少观众不等影片放完,就愤然离场,它比《xxx》更让人绝望,比比《xxx》更让人xxx,能坚持看完全片的人,更是万中无一,包括我。甚至观影结束后,有无数人抵制投诉这部电影,认为影片的导演玩弄了他们的情感!他是顶级神作《xxxx》…… 11.这是X国有史以来最高赞的一部悬疑电影,然而却因为某些原因,国内90%的人,没能看过这部片子,他就是《xxx》…… 12.有这样一部电影,这辈子,你绝对不想再看第二遍,并不是它剧情烂俗,而是它的结局你根本承受不起/想象不到……甚至有80%的观众在观影途中情绪崩溃中途离场,更让许多同行都不想解说这部电影,他就是大名鼎鼎的暗黑神作《xxx》… 13.它被誉为史上最牛悬疑片无数人在看完它时候,一个月不敢照镜��,这样一部仅适合部分年龄段观看的影片,究竟有什么样的魅力,竟然获得某瓣8.2的高分,很多人说这部电影到处都是看点,他就是《xxx》…. 14.这是一部在某瓣上被70万人打出9.3分的高分的电影……到底是一部什么样的电影,能够在某瓣上被70万人打出9.3分的高分…… 15.这是一部细思极恐的科幻大片,整部电影颠覆你的三观,它的名字叫…… 16.史上最震撼的灾难片,每一点都不舍得快进的电影,他叫…… 17.今天给大家带来一部基于真实事件改编的(主题介绍一句……)的故事片,这是一部连环悬疑剧,如果不看到最后绝对想不到结局竟然是这样的反转…… ### 方式:情景式、假设性开头 1.他叫……你以为他是……的吗?不。他是来……然后开始叙述 2.你知道……吗?原来……然后开始叙述 3.如果给你….,你会怎么样? 4.如果你是….,你会怎么样? ### 方式三:以国家为开头!简单明了。话语不需要多,但是需要讲解透彻! 1.这是一部韩国最新灾难片,你一定没有看过…… 2.这是一部印度高分悬疑片, 3.这部电影原在日本因为……而被下架, 4.这是韩国最恐怖的犯罪片, 5.这是最近国产片评分最高的悬疑�� 以上均按照影片国家来区分,然后简单介绍下主题。就可以开始直接叙述作品。也是一个很不错的方法! ### 方式四:如何自由发挥 正常情况下,每一部电影都有非常关键的一个大纲,这部电影的主题其实是可以用一句话、两句话概括的。只要看懂电影,就能找到这个主题大纲。 我们提前把这个主题大纲给放到影视最前面,作为我们的前三句的文案,将会非常吸引人! 例如: 1.这不是电影,这是真实故事。两个女人和一个男人被关在可桑拿室。喊破喉咙也没有一丝回音。窒息感和热度让人抓狂,故事就是从这里开始! 2.如果你男朋友出轨了,他不爱你了,还你家暴,怎么办?接下来这部电影就会教你如何让老公服服帖帖的呆在你身边!女主是一个……开始叙述了。 3.他力大无穷,双眼放光,这不是拯救地球的超人吗?然而不是。今天给大家推荐的这部电影叫…… 以上是需要看完影片,看懂影片,然后从里面提炼出精彩的几句话,当然是比较难的,当你不会自己去总结前三句的经典的话。可以用前面方式一二三! 
def handle_exception(err):
    """
    Translate an LLM client error into a generic Exception with a readable message.

    The error is matched against the known exception types in order; the first
    match selects the message. Unmatched errors get a message containing the
    currently formatted traceback. This function always raises.

    Args:
        err: The exception instance to translate.

    Raises:
        Exception: Always, carrying the selected message.
    """
    if isinstance(err, PermissionDenied):
        message = "403 用户没有权限访问该资源"
    elif isinstance(err, ResourceExhausted):
        message = "429 您的配额已用尽。请稍后重试。请考虑设置自动重试来处理这些错误"
    elif isinstance(err, InvalidArgument):
        message = "400 参数无效。例如,文件过大,超出了载荷大小限制。另一个事件提供了无效的 API 密钥。"
    elif isinstance(err, AlreadyExists):
        message = "409 已存在具有相同 ID 的已调参模型。对新模型进行调参时,请指定唯一的模型 ID。"
    elif isinstance(err, RetryError):
        message = "使用不支持 gRPC 的代理时可能会引起此错误。请尝试将 REST 传输与 genai.configure(..., transport=rest) 搭配使用。"
    elif isinstance(err, BlockedPromptException):
        message = "400 出于安全原因,该提示已被屏蔽。"
    elif isinstance(err, BrokenResponseError):
        message = "500 流式传输响应已损坏。在访问需要完整响应的内容(例如聊天记录)时引发。查看堆栈轨迹中提供的错误详情。"
    elif isinstance(err, IncompleteIterationError):
        message = "500 访问需要完整 API 响应但流式响应尚未完全迭代的内容时引发。对响应对象调用 resolve() 以使用迭代器。"
    elif isinstance(err, ConnectionError):
        message = "网络连接错误, 请检查您的网络连接(建议使用 NarratoAI 官方提供的 url)"
    else:
        # Fallback: include the traceback of the exception being handled.
        message = f"大模型请求失败, 下面是具体报错信息: \n\n{traceback.format_exc()}"

    raise Exception(message)
llm_provider == "g4f": model_name = config.app.get("g4f_model_name", "") if not model_name: model_name = "gpt-3.5-turbo-16k-0613" import g4f content = g4f.ChatCompletion.create( model=model_name, messages=[{"role": "user", "content": prompt}], ) else: api_version = "" # for azure if llm_provider == "moonshot": api_key = config.app.get("moonshot_api_key") model_name = config.app.get("moonshot_model_name") base_url = "https://api.moonshot.cn/v1" elif llm_provider == "ollama": # api_key = config.app.get("openai_api_key") api_key = "ollama" # any string works but you are required to have one model_name = config.app.get("ollama_model_name") base_url = config.app.get("ollama_base_url", "") if not base_url: base_url = "http://localhost:11434/v1" elif llm_provider == "openai": api_key = config.app.get("openai_api_key") model_name = config.app.get("openai_model_name") base_url = config.app.get("openai_base_url", "") if not base_url: base_url = "https://api.openai.com/v1" elif llm_provider == "oneapi": api_key = config.app.get("oneapi_api_key") model_name = config.app.get("oneapi_model_name") base_url = config.app.get("oneapi_base_url", "") elif llm_provider == "azure": api_key = config.app.get("azure_api_key") model_name = config.app.get("azure_model_name") base_url = config.app.get("azure_base_url", "") api_version = config.app.get("azure_api_version", "2024-02-15-preview") elif llm_provider == "gemini": api_key = config.app.get("gemini_api_key") model_name = config.app.get("gemini_model_name") base_url = "***" elif llm_provider == "qwen": api_key = config.app.get("qwen_api_key") model_name = config.app.get("qwen_model_name") base_url = "***" elif llm_provider == "cloudflare": api_key = config.app.get("cloudflare_api_key") model_name = config.app.get("cloudflare_model_name") account_id = config.app.get("cloudflare_account_id") base_url = "***" elif llm_provider == "deepseek": api_key = config.app.get("deepseek_api_key") model_name = config.app.get("deepseek_model_name") 
base_url = config.app.get("deepseek_base_url") if not base_url: base_url = "https://api.deepseek.com" elif llm_provider == "ernie": api_key = config.app.get("ernie_api_key") secret_key = config.app.get("ernie_secret_key") base_url = config.app.get("ernie_base_url") model_name = "***" if not secret_key: raise ValueError( f"{llm_provider}: secret_key is not set, please set it in the config.toml file." ) else: raise ValueError( "llm_provider is not set, please set it in the config.toml file." ) if not api_key: raise ValueError( f"{llm_provider}: api_key is not set, please set it in the config.toml file." ) if not model_name: raise ValueError( f"{llm_provider}: model_name is not set, please set it in the config.toml file." ) if not base_url: raise ValueError( f"{llm_provider}: base_url is not set, please set it in the config.toml file." ) if llm_provider == "qwen": import dashscope from dashscope.api_entities.dashscope_response import GenerationResponse dashscope.api_key = api_key response = dashscope.Generation.call( model=model_name, messages=[{"role": "user", "content": prompt}] ) if response: if isinstance(response, GenerationResponse): status_code = response.status_code if status_code != 200: raise Exception( f'[{llm_provider}] returned an error response: "{response}"' ) content = response["output"]["text"] return content.replace("\n", "") else: raise Exception( f'[{llm_provider}] returned an invalid response: "{response}"' ) else: raise Exception(f"[{llm_provider}] returned an empty response") if llm_provider == "gemini": import google.generativeai as genai genai.configure(api_key=api_key, transport="rest") safety_settings = { HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, } model = genai.GenerativeModel( 
model_name=model_name, safety_settings=safety_settings, ) try: response = model.generate_content(prompt) return response.text except Exception as err: return handle_exception(err) if llm_provider == "cloudflare": import requests response = requests.post( f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}", headers={"Authorization": f"Bearer {api_key}"}, json={ "messages": [ {"role": "system", "content": "You are a friendly assistant"}, {"role": "user", "content": prompt}, ] }, ) result = response.json() logger.info(result) return result["result"]["response"] if llm_provider == "ernie": import requests params = { "grant_type": "client_credentials", "client_id": api_key, "client_secret": secret_key, } access_token = ( requests.post("https://aip.baidubce.com/oauth/2.0/token", params=params) .json() .get("access_token") ) url = f"{base_url}?access_token={access_token}" payload = json.dumps( { "messages": [{"role": "user", "content": prompt}], "temperature": 0.5, "top_p": 0.8, "penalty_score": 1, "disable_search": False, "enable_citation": False, "response_format": "text", } ) headers = {"Content-Type": "application/json"} response = requests.request( "POST", url, headers=headers, data=payload ).json() return response.get("result") if llm_provider == "azure": client = AzureOpenAI( api_key=api_key, api_version=api_version, azure_endpoint=base_url, ) else: client = OpenAI( api_key=api_key, base_url=base_url, ) response = client.chat.completions.create( model=model_name, messages=[{"role": "user", "content": prompt}] ) if response: if isinstance(response, ChatCompletion): content = response.choices[0].message.content else: raise Exception( f'[{llm_provider}] returned an invalid response: "{response}", please check your network ' f"connection and try again." ) else: raise Exception( f"[{llm_provider}] returned an empty response, please check your network connection and try again." 
def compress_video(input_path: str, output_path: str):
    """Re-encode a video at a low bitrate, reusing an existing output if present.

    Args:
        input_path: Path of the source video.
        output_path: Path where the compressed video is written.

    Raises:
        Exception: Any error raised while opening or encoding the video is
            logged and re-raised unchanged.
    """
    # An existing output is treated as a finished compression and reused.
    if os.path.exists(output_path):
        logger.info(f"压缩视频文件已存在: {output_path}")
        return

    clip = None
    try:
        clip = VideoFileClip(input_path)
        clip.write_videofile(output_path, codec='libx264', audio_codec='aac', bitrate="500k", audio_bitrate="128k")
    except Exception as e:
        # Previously only subprocess.CalledProcessError was handled, so any
        # other failure went unlogged.
        logger.error(f"视频压缩失败: {e}")
        # Drop a partially written file; otherwise the "already exists"
        # shortcut above would hand back a broken video on the next call.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError as cleanup_err:
                logger.warning(f"failed to remove partial output {output_path}: {cleanup_err}")
        raise
    finally:
        # Release the reader resources held by the clip.
        if clip is not None:
            clip.close()
转录视频 transcription = gemini_video_transcription( video_name=video_name, video_path=compressed_video_path, language=language, llm_provider_video=config.app["video_llm_provider"], progress_callback=progress_callback ) if progress_callback: progress_callback(60, "生成解说文案...") # 例如,在转录视频后 # 3. 编写解说文案 script = writing_short_play(video_plot, video_name, config.app["llm_provider"], count=300) # 在关键步骤更新进度 if progress_callback: progress_callback(70, "匹配画面...") # 例如,在生成脚本后 # 4. 文案匹配画面 if transcription != "": matched_script = screen_matching(huamian=transcription, wenan=script, llm_provider=config.app["video_llm_provider"]) # 在关键步骤更新进度 if progress_callback: progress_callback(80, "匹配成功") return matched_script else: return "" except Exception as e: handle_exception(e) raise def gemini_video_transcription(video_name: str, video_path: str, language: str, llm_provider_video: str, progress_callback=None): ''' 使用 gemini-1.5-xxx 进行视频画面转录 ''' api_key = config.app.get("gemini_api_key") gemini.configure(api_key=api_key) prompt = """ 请转录音频,包括时间戳,并提供视觉描述,然后以 JSON 格式输出,当前视频中使用的语言为 %s。 在转录视频时,请通过确保以下条件来完成转录: 1. 画面描述使用语言: %s 进行输出。 2. 同一个画面合并为一个转录记录。 3. 使用以下 JSON schema: Graphics = {"timestamp": "MM:SS-MM:SS"(时间戳格式), "picture": "str"(画面描述), "speech": "str"(台词,如果没有人说话,则使用空字符串。)} Return: list[Graphics] 4. 
def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
    """Ask the configured LLM for English stock-video search terms.

    The model is asked to answer with a JSON array of strings. Each attempt
    first parses the whole response as JSON; if that fails, the first
    ``[...]`` span found in the response is parsed instead. Attempts are
    repeated up to ``_max_retries`` times.

    Args:
        video_subject: Subject of the video.
        video_script: Script of the video, given to the model as context.
        amount: Number of search terms requested in the prompt.

    Returns:
        A non-empty list of strings on success, otherwise an empty list.
    """
    prompt = f"""
# Role: Video Search Terms Generator

## Goals:
Generate {amount} search terms for stock videos, depending on the subject of a video.

## Constrains:
1. the search terms are to be returned as a json-array of strings.
2. each search term should consist of 1-3 words, always add the main subject of the video.
3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
4. the search terms must be related to the subject of the video.
5. reply with english search terms only.

## Output Example:
["search term 1", "search term 2", "search term 3","search term 4","search term 5"]

## Context:
### Video Subject
{video_subject}

### Video Script
{video_script}

Please note that you must use English for generating video search terms; Chinese is not accepted.
""".strip()

    logger.info(f"subject: {video_subject}")

    def _is_term_list(value) -> bool:
        """Return True when ``value`` is a list containing only strings."""
        return isinstance(value, list) and all(isinstance(term, str) for term in value)

    search_terms = []
    for i in range(_max_retries):
        # Reset per attempt so a failed call never re-parses a stale response.
        response = ""
        parsed = None
        try:
            response = _generate_response(prompt)
            parsed = json.loads(response)
        except Exception as e:
            logger.warning(f"failed to generate video terms: {str(e)}")
            if response:
                # Fallback: pull the first [...] span out of a chatty answer.
                # DOTALL lets the array span several lines.
                match = re.search(r"\[.*]", response, re.DOTALL)
                if match:
                    try:
                        parsed = json.loads(match.group())
                    except Exception as fallback_err:
                        logger.warning(f"failed to generate video terms: {str(fallback_err)}")

        if parsed is not None and not _is_term_list(parsed):
            # Never keep a malformed value: it must not leak to the caller
            # when the retries run out.
            logger.error("response is not a list of strings.")
        elif parsed:
            search_terms = parsed
            break

        # Only announce a retry when another attempt will actually happen.
        if i < _max_retries - 1:
            logger.warning(f"failed to generate video terms, trying again... {i + 1}")

    logger.success(f"completed: \n{search_terms}")
    return search_terms
结果输出为JSON格式,包含字段: - "picture":画面描述 - "timestamp":画面出现的时间范围 - "narration":解说内容 - "OST": 是否开启原声(true / false) **输入示例:** ```text 在一个���暗的小巷中,主角缓慢走进,四周静谧无声,只有远处隐隐传来猫的叫声。突然,背后出现一个神秘的身影。 ``` **输出格式:** ```json [ { "picture": "黑暗的小巷,主角缓慢走入,四周安静,远处传来猫叫声。", "timestamp": "00:00-00:17", "narration": "静谧的小巷里,主角步步前行,气氛渐渐变得压抑。" "OST": False }, { "picture": "神秘身影突然出现,紧张气氛加剧。", "timestamp": "00:17-00:39", "narration": "原声播放" "OST": True } ] ``` **提示:** - 文案要简短有力,契合短视频平台用户的观赏习惯。 - 保持强烈的悬念和情感代入,吸引观众继续观看。 - 解说一段后播放一段原声,原声内容尽量和解说匹配。 - 文案语言为:%s - 剧情内容:%s (为空则忽略) """ % (language, video_plot) logger.debug(f"视频名称: {video_origin_name}") # try: gemini_video_file = gemini.upload_file(video_origin_path) logger.debug(f"上传视频至 Google cloud 成功: {gemini_video_file.name}") while gemini_video_file.state.name == "PROCESSING": import time time.sleep(1) gemini_video_file = gemini.get_file(gemini_video_file.name) logger.debug(f"视频当前状态(ACTIVE才可用): {gemini_video_file.state.name}") if gemini_video_file.state.name == "FAILED": raise ValueError(gemini_video_file.state.name) # except Exception as err: # logger.error(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确 \n{traceback.format_exc()}") # raise TimeoutError(f"上传视频至 Google cloud 失败, 请检查 VPN 配置和 APIKey 是否正确; {err}") streams = model.generate_content([prompt, gemini_video_file], stream=True) response = [] for chunk in streams: response.append(chunk.text) response = "".join(response) logger.success(f"llm response: \n{response}") return response def writing_movie(video_plot, video_name, llm_provider): """ 影视解说(电影解说) """ prompt = f""" **角色设定:** 你是一名有10年经验的影视解说文案的创作者, 下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部影视作品的名称,然后让你写一篇文案 请根据方法撰写 《{video_name}》的影视解说文案,《{video_name}》的大致剧情如下: {video_plot} 文案要符合以下要求: **任务目标:** 1. 文案字数在 1500字左右,严格要求字数,最低不得少于 1000字。 2. 避免使用 markdown 格式输出文案。 3. 仅输出解说文案,不输出任何其他内容。 4. 
不要包含小标题,每个段落以 \n 进行分隔。 """ try: response = _generate_response(prompt, llm_provider) logger.success("解说文案生成成功") return response except Exception as err: return handle_exception(err) def writing_short_play(video_plot: str, video_name: str, llm_provider: str, count: int = 500): """ 影视解说(短剧解说) """ if not video_plot: raise ValueError("短剧的简介不能为空") if not video_name: raise ValueError("短剧名称不能为空") prompt = f""" **角色设定:** 你是一名有10年经验的短剧解说文案的创作者, 下面是关于如何写解说文案的方法 {Method},请认真阅读它,之后我会给你一部短剧作品的简介,然后让你写一篇解说文案 请根据方法撰写 《{video_name}》的解说文案,《{video_name}》的大致剧情如下: {video_plot} 文案要符合以下要求: **任务目标:** 1. 请严格要求文案字数, 字数控制在 {count} 字左右。 2. 避免使用 markdown 格式输出文案。 3. 仅输出解说文案,不输出任何其他内容。 4. 不要包含小标题,每个段落以 \\n 进行分隔。 """ try: response = _generate_response(prompt, llm_provider) logger.success("解说文案生成成功") logger.debug(response) return response except Exception as err: return handle_exception(err) def screen_matching(huamian: str, wenan: str, llm_provider: str): """ 画面匹配(一次性匹配) """ if not huamian: raise ValueError("画面不能为空") if not wenan: raise ValueError("文案不能为空") prompt = """ 你是一名有10年经验的影视解说创作者, 你的任务是根据视频转录脚本和解说文案,匹配出每段解说文案对应的画面时间戳, 结果以 json 格式输出。 注意: 转录脚本中 - timestamp: 表示视频时间戳 - picture: 表示当前画面描述 - speech": 表示当前视频中人物的台词 转录脚本和文案(由 XML 标记分隔)如下所示: %s %s 在匹配的过程中,请通过确保以下条件来完成匹配: - 使用以下 JSON schema: script = {'picture': str, 'timestamp': str(时间戳), "narration": str, "OST": bool(是否开启原声)} Return: list[script] - picture: 字段表示当前画面描述,与转录脚本保持一致 - timestamp: 字段表示某一段文案对应的画面的时间戳,不必和转录脚本的时间戳一致,应该充分考虑文案内容,匹配出与其描述最匹配的时间戳 - 请注意,请严格的执行已经出现的画面不能重复出现,即生成的脚本中 timestamp 不能有重叠的部分。 - narration: 字段表示需要解说文案,每段解说文案尽量不要超过30字 - OST: 字段表示是否开启原声,即当 OST 字段为 true 时,narration 字段为空字符串,当 OST 为 false 时,narration 字段为对应的解说文案 - 注意,在画面匹配的过程中,需要适当的加入原声播放,使得解说和画面更加匹配,请按照 1:1 的比例,生成原声和解说的脚本内容。 - 注意,在时间戳匹配上,一定不能原样照搬“转录脚本”,应当适当的合并或者删减一些片段。 - 注意,第一个画面一定是原声播放并且时长不少于 20 s,为了吸引观众,第一段一定是整个转录脚本中最精彩的片段。 - 请以严格的 JSON 格式返回数据,不要包含任何注释、标记或其他字符。数据应符合 JSON 语法,可以被 json.loads() 函数直接解析, 不要添加 ```json 或其他标记。 """ % (huamian, wenan) try: response = 
_generate_response(prompt, llm_provider) logger.success("匹配成功") logger.debug(response) return response except Exception as err: return handle_exception(err) if __name__ == "__main__": # 1. 视频转录 video_subject = "第二十条之无罪释放" video_path = "/Users/apple/Desktop/home/pipedream_project/downloads/jianzao.mp4" language = "zh-CN" gemini_video_transcription( video_name=video_subject, video_path=video_path, language=language, progress_callback=print, llm_provider_video="gemini" ) # # 2. 解说文案 # video_path = "/Users/apple/Desktop/home/NarratoAI/resource/videos/1.mp4" # # video_path = "E:\\projects\\NarratoAI\\resource\\videos\\1.mp4" # video_plot = """ # 李自忠拿着儿子李牧名下的存折,去银行取钱给儿子救命,却被要求证明"你儿子是你儿子"。 # 走投无路时碰到银行被抢劫,劫匪给了他两沓钱救命,李自忠却因此被银行以抢劫罪起诉,并顶格判处20年有期徒刑。 # 苏醒后的李牧坚决为父亲做无罪辩护,面对银行的顶级律师团队,他一个法学院大一学生,能否力挽狂澜,创作奇迹?挥法律之利剑 ,持正义之天平! # """ # res = generate_script(video_path, video_plot, video_name="第二十条之无罪释放") # # res = generate_script(video_path, video_plot, video_name="海岸") # print("脚本生成成功:\n", res) # res = clean_model_output(res) # aaa = json.loads(res) # print(json.dumps(aaa, indent=2, ensure_ascii=False)) ================================================ FILE: app/services/material.py ================================================ import os import subprocess import random import traceback from urllib.parse import urlencode from datetime import datetime import json import requests from typing import List, Optional from loguru import logger from moviepy.video.io.VideoFileClip import VideoFileClip from app.config import config from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo from app.utils import utils from app.utils import ffmpeg_utils requested_count = 0 def get_api_key(cfg_key: str): api_keys = config.app.get(cfg_key) if not api_keys: raise ValueError( f"\n\n##### {cfg_key} is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n" f"{utils.to_json(config.app)}" ) # if only one key is provided, return it if isinstance(api_keys, str): 
def search_videos_pexels(
    search_term: str,
    minimum_duration: int,
    video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
    """Search Pexels for videos whose files match the requested resolution.

    For every hit that is at least ``minimum_duration`` long, the first video
    file whose width and height equal the resolution of ``video_aspect`` is
    collected.

    Args:
        search_term: Query text sent to Pexels.
        minimum_duration: Hits shorter than this are skipped.
        video_aspect: Aspect whose name is sent as the orientation filter and
            whose resolution is used to pick a file.

    Returns:
        The collected materials; an empty list when the response has no
        ``videos`` key or any error occurs.
    """
    aspect = VideoAspect(video_aspect)
    target_width, target_height = aspect.to_resolution()
    request_headers = {"Authorization": get_api_key("pexels_api_keys")}

    # Assemble the search URL.
    query_string = urlencode(
        {"query": search_term, "per_page": 20, "orientation": aspect.name}
    )
    query_url = f"https://api.pexels.com/videos/search?{query_string}"
    logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")

    try:
        payload = requests.get(
            query_url,
            headers=request_headers,
            proxies=config.proxy,
            verify=False,
            timeout=(30, 60),
        ).json()

        if "videos" not in payload:
            logger.error(f"search videos failed: {payload}")
            return []

        materials = []
        for entry in payload["videos"]:
            clip_duration = entry["duration"]
            # Skip hits that are too short.
            if clip_duration < minimum_duration:
                continue

            # Keep only the first file with exactly the wanted resolution.
            for candidate in entry["video_files"]:
                size = (int(candidate["width"]), int(candidate["height"]))
                if size != (target_width, target_height):
                    continue
                material = MaterialInfo()
                material.provider = "pexels"
                material.url = candidate["link"]
                material.duration = clip_duration
                materials.append(material)
                break
        return materials
    except Exception as e:
        logger.error(f"search videos failed: {str(e)}")
        return []
def save_video(video_url: str, save_dir: str = "") -> str:
    """Download a video into a local cache directory and validate it.

    The file name is derived from the MD5 of the URL without its query
    string, so repeated requests for the same video reuse the cached file.

    Args:
        video_url: URL of the video to download.
        save_dir: Target directory; when empty, the ``cache_videos`` storage
            directory is used.

    Returns:
        The local path of a video with positive duration and fps, or an empty
        string when the download is missing, empty, or not a usable video.
    """
    if not save_dir:
        save_dir = utils.storage_dir("cache_videos")
    os.makedirs(save_dir, exist_ok=True)

    url_without_query = video_url.split("?")[0]
    url_hash = utils.md5(url_without_query)
    video_id = f"vid-{url_hash}"
    video_path = f"{save_dir}/{video_id}.mp4"

    # Reuse a previous non-empty download.
    if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
        logger.info(f"video already exists: {video_path}")
        return video_path

    # NOTE(review): TLS verification is disabled here, as before — confirm
    # this is still required for the supported proxy setups.
    response = requests.get(
        video_url, proxies=config.proxy, verify=False, timeout=(60, 240)
    )
    with open(video_path, "wb") as f:
        f.write(response.content)

    if not (os.path.exists(video_path) and os.path.getsize(video_path) > 0):
        return ""

    # Validate the download by opening it; always release the clip.
    clip = None
    try:
        clip = VideoFileClip(video_path)
        duration = clip.duration
        fps = clip.fps
        if duration > 0 and fps > 0:
            return video_path
        reason = f"duration={duration}, fps={fps}"
    except Exception as e:
        reason = str(e)
    finally:
        if clip is not None:
            clip.close()

    # The file is unusable. Report it and delete it, so the cache shortcut
    # above cannot return it as a valid video on the next call.
    logger.warning(f"无效的视频文件: {video_path} => {reason}")
    try:
        os.remove(video_path)
    except OSError as remove_err:
        logger.warning(f"failed to remove invalid video file: {video_path} => {remove_err}")
    return ""
def time_to_seconds(time_str: str) -> float:
    """Convert an ``HH:MM:SS`` or ``HH:MM:SS,mmm`` timestamp into seconds.

    The digits after the comma are read as a decimal fraction of a second, so
    ``"00:00:20,100"`` and ``"00:00:20,1"`` both mean 20.1 seconds. This
    matches how the same string is interpreted once the comma is replaced by
    a dot and handed to ffmpeg.

    Args:
        time_str: Timestamp such as ``"00:00:20,100"``.

    Returns:
        The number of seconds, including the fractional part.

    Raises:
        ValueError: If ``time_str`` is not in the expected format.
    """
    try:
        # Fractional part.
        if ',' in time_str:
            time_part, fraction_part = time_str.split(',')
            fraction_part = fraction_part.strip()
            if not fraction_part.isdigit():
                raise ValueError("时间格式必须为 HH:MM:SS,mmm")
            fraction = int(fraction_part) / 10 ** len(fraction_part)
        else:
            time_part = time_str
            fraction = 0

        # Hours, minutes, seconds.
        parts = time_part.split(':')
        if len(parts) != 3:
            raise ValueError("时间格式必须为 HH:MM:SS,mmm")
        h, m, s = map(int, parts)

        return h * 3600 + m * 60 + s + fraction

    except ValueError as e:
        logger.error(f"时间格式错误: {time_str}")
        raise ValueError(f"时间格式错误: 必须为 HH:MM:SS,mmm 格式") from e


def format_timestamp(seconds: float) -> str:
    """Format a number of seconds as ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating instead loses a millisecond to float error (1.001 s
    would print as ``00:00:01,000``).

    Args:
        seconds: Non-negative number of seconds, may include a fraction.

    Returns:
        The formatted timestamp, e.g. ``"00:00:20,100"``.
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)

    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"
f"vid_{safe_start_time}@{safe_end_time}.mp4" video_path = os.path.join(save_dir, output_filename) # 如果视频已存在,直接返回 if os.path.exists(video_path) and os.path.getsize(video_path) > 0: logger.info(f"视频已存在: {video_path}") return video_path try: # 检查视频是否存在 if not os.path.exists(origin_video): logger.error(f"源视频文件不存在: {origin_video}") return '' # 获取视频总时长 try: probe_cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", origin_video] total_duration = float(subprocess.check_output(probe_cmd).decode('utf-8').strip()) except subprocess.CalledProcessError as e: logger.error(f"获取视频时长失败: {str(e)}") return '' # 计算时间点 start = time_to_seconds(start_str) end = time_to_seconds(end_str) # 验证时间段 if start >= total_duration: logger.warning(f"起始时间 {format_timestamp(start)} ({start:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒)") return '' if end > total_duration: logger.warning(f"结束时间 {format_timestamp(end)} ({end:.3f}秒) 超出视频总时长 {format_timestamp(total_duration)} ({total_duration:.3f}秒),将自动调整为视频结尾") end = total_duration if end <= start: logger.warning(f"结束时间 {format_timestamp(end)} 必须大于起始时间 {format_timestamp(start)}") return '' # 计算剪辑时长 duration = end - start # logger.info(f"开始剪辑视频: {format_timestamp(start)} - {format_timestamp(end)},时长 {format_timestamp(duration)}") # 获取硬件加速选项 hwaccel = _detect_hardware_acceleration() hwaccel_args = [] if hwaccel: hwaccel_args = ffmpeg_utils.get_ffmpeg_hwaccel_args() # 转换为FFmpeg兼容的时间格式(逗号替换为点) ffmpeg_start_time = start_str.replace(',', '.') ffmpeg_end_time = end_str.replace(',', '.') # 构建FFmpeg命令 - 使用新的智能编码器选择 encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder() ffmpeg_cmd = [ "ffmpeg", "-y", *hwaccel_args, "-i", origin_video, "-ss", ffmpeg_start_time, "-to", ffmpeg_end_time, "-c:v", encoder, "-c:a", "aac", "-strict", "experimental", video_path ] # 根据编码器类型添加特定参数 if "nvenc" in encoder: ffmpeg_cmd.insert(-1, "-preset") ffmpeg_cmd.insert(-1, "medium") elif "videotoolbox" in 
encoder: ffmpeg_cmd.insert(-1, "-profile:v") ffmpeg_cmd.insert(-1, "high") elif "qsv" in encoder: ffmpeg_cmd.insert(-1, "-preset") ffmpeg_cmd.insert(-1, "medium") elif encoder == "libx264": ffmpeg_cmd.insert(-1, "-preset") ffmpeg_cmd.insert(-1, "medium") ffmpeg_cmd.insert(-1, "-crf") ffmpeg_cmd.insert(-1, "23") # 执行FFmpeg命令 # logger.info(f"裁剪视频片段: {timestamp} -> {ffmpeg_start_time}到{ffmpeg_end_time}") # logger.debug(f"执行命令: {' '.join(ffmpeg_cmd)}") # 在Windows系统上使用UTF-8编码处理输出,避免GBK编码错误 is_windows = os.name == 'nt' if is_windows: process = subprocess.run( ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', # 明确指定编码为UTF-8 text=True, check=False # 不抛出异常,我们会检查返回码 ) else: process = subprocess.run( ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False # 不抛出异常,我们会检查返回码 ) # 检查是否成功 if process.returncode != 0: logger.error(f"视频剪辑失败: {process.stderr}") if os.path.exists(video_path): os.remove(video_path) return '' # 验证生成的视频文件 if os.path.exists(video_path) and os.path.getsize(video_path) > 0: # 检查视频是否可播放 probe_cmd = ["ffprobe", "-v", "error", video_path] # 在Windows系统上使用UTF-8编码 if is_windows: validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') else: validate_result = subprocess.run(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if validate_result.returncode == 0: logger.info(f"视频剪辑成功: {video_path}") return video_path logger.error("视频文件验证失败") if os.path.exists(video_path): os.remove(video_path) return '' except Exception as e: logger.error(f"视频剪辑过程中发生错误: \n{str(traceback.format_exc())}") if os.path.exists(video_path): os.remove(video_path) return '' def clip_videos(task_id: str, timestamp_terms: List[str], origin_video: str, progress_callback=None) -> dict: """ 剪辑视频 Args: task_id: 任务id timestamp_terms: 需要剪辑的时间戳列表,如:['00:00:00,000-00:00:20,100', '00:00:43,039-00:00:46,959'] origin_video: 原视频路径 progress_callback: 进度回调函数 Returns: 剪辑后的视频路径 """ video_paths = {} 
def merge_videos(video_paths, ost_list):
    """Concatenate videos with ffmpeg, optionally muting individual inputs.

    Inputs whose flag in ``ost_list`` is falsy are first copied to a
    temporary file without an audio stream; the resulting list is then joined
    with ffmpeg's concat demuxer into ``combined.mp4`` in the current working
    directory.

    Args:
        video_paths: Paths of the videos to merge, in order.
        ost_list: One boolean per video; True keeps its original sound.

    Returns:
        The path of the merged video, or None when the merge command fails.

    Raises:
        ValueError: If the two lists differ in length or are empty.
        subprocess.CalledProcessError: If creating a muted copy fails.
    """
    if len(video_paths) != len(ost_list):
        raise ValueError("视频路径列表和保留原声列表长度必须相同")
    if not video_paths:
        raise ValueError("视频路径列表不能为空")

    temp_file = "temp_file_list.txt"
    output_file = "combined.mp4"
    silent_videos = []

    def _concat_entry(path):
        """Return one concat-list line with single quotes in ``path`` escaped."""
        # Inside a quoted token a quote is written as: close, \', reopen.
        escaped = path.replace("'", "'\\''")
        return f"file '{escaped}'\n"

    try:
        # UTF-8, so non-ASCII paths do not depend on the locale encoding.
        with open(temp_file, "w", encoding="utf-8") as f:
            for index, (video_path, keep_ost) in enumerate(zip(video_paths, ost_list)):
                if keep_ost:
                    f.write(_concat_entry(video_path))
                    continue

                # The index keeps two inputs with the same file name from
                # overwriting each other's muted copy.
                silent_video = f"silent_{index}_{os.path.basename(video_path)}"
                silent_videos.append(silent_video)
                # -y: never block on an overwrite prompt for a stale temp file.
                subprocess.run(
                    ["ffmpeg", "-y", "-i", video_path, "-c:v", "copy", "-an", silent_video],
                    check=True,
                )
                f.write(_concat_entry(silent_video))

        ffmpeg_cmd = [
            "ffmpeg",
            "-y",
            "-f", "concat",
            "-safe", "0",
            "-i", temp_file,
            "-c:v", "copy",
            "-c:a", "aac",
            "-strict", "experimental",
            output_file,
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True)
            logger.success(f"视频合并成功:{output_file}")
        except subprocess.CalledProcessError as e:
            logger.error(f"视频合并失败:{e}")
            return None
        return output_file
    finally:
        # Clean up on every exit path, including a failed muted-copy step.
        for leftover in [temp_file, *silent_videos]:
            if os.path.exists(leftover):
                os.remove(leftover)
class VideoAspect(Enum):
    """Supported output aspect ratios; each member's value is the ratio as "W:H"."""
    landscape = "16:9"  # landscape 16:9
    landscape_2 = "4:3"  # landscape 4:3
    portrait = "9:16"  # portrait 9:16
    portrait_2 = "3:4"  # portrait 3:4
    square = "1:1"  # square 1:1

    def to_resolution(self) -> Tuple[int, int]:
        """Return the standard ``(width, height)`` in pixels for this aspect ratio.

        Every returned resolution has exactly the ratio named by the member.
        The 4:3 and 3:4 members previously returned 1280x720 / 720x1280, which
        are 16:9 / 9:16 frames; they now return 1440x1080 / 1080x1440.
        """
        resolutions = {
            VideoAspect.portrait: (1080, 1920),
            VideoAspect.portrait_2: (1080, 1440),
            VideoAspect.landscape: (1920, 1080),
            VideoAspect.landscape_2: (1440, 1080),
            VideoAspect.square: (1080, 1080),
        }
        # Fall back to 9:16 portrait, matching the original default branch.
        return resolutions.get(self, (1080, 1920))
def create_ffmpeg_concat_file(video_paths: List[str], concat_file_path: str) -> str:
    """
    Write the list file consumed by ffmpeg's concat demuxer.

    Each input becomes one ``file '<absolute path>'`` line. Inside single
    quotes ffmpeg takes every character literally, backslashes and colons
    included, so the only thing that needs escaping is the quote itself:
    it is written as ``'\\''`` (close quote, escaped quote, reopen quote).

    Args:
        video_paths: paths of the videos to concatenate, in order
        concat_file_path: where to write the list file

    Returns:
        str: ``concat_file_path``
    """
    with open(concat_file_path, 'w', encoding='utf-8') as f:
        for video_path in video_paths:
            abs_path = os.path.abspath(video_path)

            # On Windows, use forward slashes as path separators.
            if os.name == 'nt':
                abs_path = abs_path.replace('\\', '/')

            # Do NOT backslash-escape ':' or '\\' here: within quotes the
            # backslash would become part of the file name.
            abs_path = abs_path.replace("'", "'\\''")

            f.write(f"file '{abs_path}'\n")
    return concat_file_path
not keep_audio: command.extend(['-an']) # 移除音频 else: # 检查输入视频是否有音频流 has_audio = check_video_has_audio(input_path) if has_audio: command.extend(['-c:a', 'aac', '-b:a', '128k']) # 音频编码为AAC else: logger.warning(f"视频 {input_path} 没有音频流,将会忽略音频设置") command.extend(['-an']) # 没有音频流时移除音频设置 # 视频处理参数:缩放并添加填充以保持比例 scale_filter = f"scale={target_width}:{target_height}:force_original_aspect_ratio=decrease" pad_filter = f"pad={target_width}:{target_height}:(ow-iw)/2:(oh-ih)/2" command.extend([ '-vf', f"{scale_filter},{pad_filter}", '-r', '30', # 设置帧率为30fps ]) # 关键修复:选择编码器时优先使用纯NVENC(无硬件解码) if hwaccel: try: # 检查是否为NVIDIA硬件加速 hwaccel_info = ffmpeg_utils.detect_hardware_acceleration() if hwaccel_info.get("type") in ["cuda", "nvenc"] and hwaccel_info.get("encoder") == "h264_nvenc": # 使用纯NVENC编码器(最佳兼容性) logger.info("使用纯NVENC编码器(避免滤镜链问题)") command.extend(['-c:v', 'h264_nvenc']) command.extend(['-preset', 'medium', '-cq', '23', '-profile:v', 'main']) else: # 其他硬件编码器 encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder() # logger.info(f"使用硬件编码器: {encoder}") command.extend(['-c:v', encoder]) # 根据编码器类型添加特定参数 if "amf" in encoder: command.extend(['-quality', 'balanced']) elif "qsv" in encoder: command.extend(['-preset', 'medium']) elif "videotoolbox" in encoder: command.extend(['-profile:v', 'high']) else: command.extend(['-preset', 'medium', '-profile:v', 'high']) except Exception as e: logger.warning(f"硬件编码器检测失败: {str(e)},将使用软件编码") hwaccel = None if not hwaccel: logger.info("使用软件编码器(libx264)") command.extend(['-c:v', 'libx264', '-preset', 'medium', '-profile:v', 'high']) # 设置视频比特率和其他参数 command.extend([ '-b:v', '5M', '-maxrate', '8M', '-bufsize', '10M', '-pix_fmt', 'yuv420p', # 兼容性更好的颜色格式 ]) # 输出文件 command.append(output_path) # 执行命令 try: # logger.info(f"执行FFmpeg命令: {' '.join(command)}") process = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # logger.info(f"视频处理成功: {output_path}") return output_path except subprocess.CalledProcessError as e: error_msg = 
def combine_clip_videos(
        output_video_path: str,
        video_paths: List[str],
        video_ost_list: List[int],
        video_aspect: VideoAspect = VideoAspect.portrait,
        threads: int = 4,
        force_software_encoding: bool = False,  # newer parameter: force software encoding
) -> str:
    """
    Normalise a list of clips to one resolution and merge them into one video.

    The work is done in two phases: every clip is first re-encoded to the
    target resolution via ``process_single_video``; the processed clips are
    then concatenated without audio, the audio of the clips that keep their
    original sound is extracted, delayed to its position on the timeline,
    mixed over a silent base track and finally muxed with the video.
    If any ffmpeg step of the second phase fails, a video-only concat is
    attempted as a fallback.

    Args:
        output_video_path: destination path of the merged video
        video_paths: paths of the clips, in playback order
        video_ost_list: original-sound setting per clip
            (0: drop original sound, 1: original sound only,
            2: original sound plus narration); any value > 0 keeps the
            clip's audio here
        video_aspect: target aspect ratio
        threads: thread count passed to the video concat step
        force_software_encoding: skip hardware-acceleration detection and
            always use software encoding

    Returns:
        str: ``output_video_path``

    Raises:
        RuntimeError: if ffmpeg is not installed, or if both the normal merge
            and the video-only fallback fail
        ValueError: if no clip could be processed
    """
    # Make sure ffmpeg is available
    if not check_ffmpeg_installation():
        raise RuntimeError("未找到ffmpeg,请先安装")

    # Prepare the output directory
    # NOTE(review): os.path.dirname() returns '' for a bare file name and
    # os.makedirs('') raises FileNotFoundError — presumably callers always pass
    # a path with a directory component; confirm.
    output_dir = os.path.dirname(output_video_path)
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the target resolution
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()

    # Detect the available hardware acceleration option
    hwaccel = None if force_software_encoding else get_hardware_acceleration_option()
    if hwaccel:
        logger.info(f"将使用 {hwaccel} 硬件加速")
    elif force_software_encoding:
        logger.info("已强制使用软件编码,跳过硬件加速检测")
    else:
        logger.info("未检测到兼容的硬件加速,将使用软件编码")

    # On Windows only warn; hwaccel is deliberately left enabled
    if os.name == 'nt' and hwaccel:
        logger.warning("在Windows系统上检测到硬件加速,但为了提高兼容性,建议使用软件编码")
        # hwaccel is not disabled here; process_single_video performs extra safety checks

    # Regroup paths and original-sound settings into a list of dicts
    video_segments = []

    # Check that both lists have the same length
    if len(video_paths) != len(video_ost_list):
        logger.warning(f"视频路径列表({len(video_paths)})和原声设置列表({len(video_ost_list)})长度不匹配")
        # Truncate both lists to the shorter length
        min_length = min(len(video_paths), len(video_ost_list))
        video_paths = video_paths[:min_length]
        video_ost_list = video_ost_list[:min_length]

    # Build one processing config per clip
    for i, (video_path, video_ost) in enumerate(zip(video_paths, video_ost_list)):
        if not os.path.exists(video_path):
            logger.warning(f"视频不存在,跳过: {video_path}")
            continue

        # Probe for an audio stream
        has_audio = check_video_has_audio(video_path)

        # Segment configuration
        segment = {
            "index": i,
            "path": video_path,
            "ost": video_ost,
            "has_audio": has_audio,
            "keep_audio": video_ost > 0 and has_audio  # keep audio only if ost > 0 and a stream really exists
        }

        # Log the mismatch between the setting and the actual file
        if video_ost > 0 and not has_audio:
            logger.warning(f"视频 {video_path} 设置为保留原声(ost={video_ost}),但该视频没有音频流")

        video_segments.append(segment)

    # Process every segment into a temp directory next to the output file
    processed_videos = []
    temp_dir = os.path.join(output_dir, "temp_videos")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Phase 1: re-encode every segment into an intermediate file
        for segment in video_segments:
            # Process one clip, keeping or dropping its audio
            temp_output = os.path.join(temp_dir, f"processed_{segment['index']}.mp4")
            try:
                process_single_video(
                    input_path=segment['path'],
                    output_path=temp_output,
                    target_width=video_width,
                    target_height=video_height,
                    keep_audio=segment['keep_audio'],
                    hwaccel=hwaccel
                )
                processed_videos.append({
                    "index": segment["index"],
                    "path": temp_output,
                    "keep_audio": segment["keep_audio"]
                })
                logger.info(f"视频 {segment['index'] + 1}/{len(video_segments)} 处理完成")
            except Exception as e:
                logger.error(f"处理视频 {segment['path']} 时出错: {str(e)}")
                # If hardware acceleration was in use, retry this clip in software
                if hwaccel and not force_software_encoding:
                    logger.info(f"尝试使用软件编码处理视频 {segment['path']}")
                    try:
                        process_single_video(
                            input_path=segment['path'],
                            output_path=temp_output,
                            target_width=video_width,
                            target_height=video_height,
                            keep_audio=segment['keep_audio'],
                            hwaccel=None  # software encoding
                        )
                        processed_videos.append({
                            "index": segment["index"],
                            "path": temp_output,
                            "keep_audio": segment["keep_audio"]
                        })
                        logger.info(f"使用软件编码成功处理视频 {segment['index'] + 1}/{len(video_segments)}")
                    except Exception as fallback_error:
                        logger.error(f"使用软件编码处理视频 {segment['path']} 也失败: {str(fallback_error)}")
                        # A clip that fails twice is dropped from the result
                        continue
                else:
                    continue

        if not processed_videos:
            raise ValueError("没有有效的视频片段可以合并")

        # Restore the original clip order
        processed_videos.sort(key=lambda x: x["index"])

        # Phase 2: merge step by step, avoiding one large filter_complex graph
        try:
            # 1. Concatenate the video streams of all processed clips into one temp file
            video_paths_only = [video["path"] for video in processed_videos]
            video_concat_path = os.path.join(temp_dir, "video_concat.mp4")

            # Concat list file for the video streams
            concat_file = os.path.join(temp_dir, "concat_list.txt")
            create_ffmpeg_concat_file(video_paths_only, concat_file)

            # Merge all video streams, without audio
            concat_cmd = [
                'ffmpeg', '-y',
                '-f', 'concat',
                '-safe', '0',
                '-i', concat_file,
                '-c:v', 'libx264',
                '-preset', 'medium',
                '-profile:v', 'high',
                '-an',  # no audio
                '-threads', str(threads),
                video_concat_path
            ]

            subprocess.run(concat_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频流合并完成")

            # 2. Extract and merge the clips that keep their audio
            audio_segments = [video for video in processed_videos if video["keep_audio"]]

            if not audio_segments:
                # No audio at all: the silent concat is the final result
                shutil.copy(video_concat_path, output_video_path)
                logger.info("无音频视频合并完成")
                return output_video_path

            # Intermediate audio files
            audio_files = []
            for i, segment in enumerate(audio_segments):
                # Extract the audio track
                audio_file = os.path.join(temp_dir, f"audio_{i}.aac")
                extract_audio_cmd = [
                    'ffmpeg', '-y',
                    '-i', segment["path"],
                    '-vn',  # no video
                    '-c:a', 'aac',
                    '-b:a', '128k',
                    audio_file
                ]
                subprocess.run(extract_audio_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                audio_files.append({
                    "index": segment["index"],
                    "path": audio_file
                })
                logger.info(f"提取音频 {i+1}/{len(audio_segments)} 完成")

            # 3. Compute where each audio clip starts on the merged timeline
            audio_timings = []
            current_time = 0.0

            # Accumulate the duration of every processed clip
            for i, video in enumerate(processed_videos):
                duration_cmd = [
                    'ffprobe', '-v', 'error',
                    '-show_entries', 'format=duration',
                    '-of', 'csv=p=0',
                    video["path"]
                ]
                result = subprocess.run(duration_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                # NOTE(review): a non-numeric ffprobe output raises ValueError here,
                # which is not caught by the CalledProcessError fallback below.
                duration = float(result.stdout.strip())

                # If this clip keeps its audio, record its start offset
                if video["keep_audio"]:
                    for audio in audio_files:
                        if audio["index"] == video["index"]:
                            audio_timings.append({
                                "file": audio["path"],
                                "start": current_time,
                                "index": video["index"]
                            })
                            break

                current_time += duration

            # 4. Create a silent base track spanning the whole timeline
            silence_audio = os.path.join(temp_dir, "silence.aac")
            create_silence_cmd = [
                'ffmpeg', '-y',
                '-f', 'lavfi',
                '-i', f'anullsrc=r=44100:cl=stereo',
                '-t', str(current_time),  # total duration
                '-c:a', 'aac',
                '-b:a', '128k',
                silence_audio
            ]
            subprocess.run(create_silence_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

            # 5. Write the filter graph that mixes the audio clips
            filter_script = os.path.join(temp_dir, "filter_script.txt")
            with open(filter_script, 'w') as f:
                f.write(f"[0:a]volume=0.0[silence];\n")  # mute the base track first

                # Add every audio file and compensate for amix's volume dilution.
                # The original author's reasoning: amix averages the volume over its
                # n inputs, so each input is boosted n times to keep its level.
                # NOTE(review): presumably relies on amix's default normalisation — confirm.
                num_inputs = len(audio_timings) + 1  # +1 for silence track
                volume_compensation = num_inputs  # compensation factor

                for i, timing in enumerate(audio_timings):
                    # Boost, then delay both channels (adelay takes milliseconds per channel)
                    f.write(f"[{i+1}:a]volume={volume_compensation},adelay={int(timing['start']*1000)}|{int(timing['start']*1000)}[a{i}];\n")

                # Mix all tracks
                mix_str = "[silence]"
                for i in range(len(audio_timings)):
                    mix_str += f"[a{i}]"
                mix_str += f"amix=inputs={len(audio_timings)+1}:duration=longest[aout]"
                f.write(mix_str)

            # 6. Build the audio mixing command; input 0 is the silent track
            audio_inputs = ['-i', silence_audio]
            for timing in audio_timings:
                audio_inputs.extend(['-i', timing["file"]])

            mixed_audio = os.path.join(temp_dir, "mixed_audio.aac")
            audio_mix_cmd = [
                'ffmpeg', '-y'
            ] + audio_inputs + [
                '-filter_complex_script', filter_script,
                '-map', '[aout]',
                '-c:a', 'aac',
                '-b:a', '128k',
                mixed_audio
            ]

            subprocess.run(audio_mix_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("音频混合完成")

            # 7. Mux the concatenated video with the mixed audio
            final_cmd = [
                'ffmpeg', '-y',
                '-i', video_concat_path,
                '-i', mixed_audio,
                '-c:v', 'copy',
                '-c:a', 'aac',
                '-map', '0:v:0',
                '-map', '1:a:0',
                '-shortest',
                output_video_path
            ]

            subprocess.run(final_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            logger.info("视频最终合并完成")

            return output_video_path

        except subprocess.CalledProcessError as e:
            logger.error(f"合并视频过程中出错: {e.stderr.decode() if e.stderr else str(e)}")

            # Fallback: the simplest possible merge, video only
            logger.info("尝试备用合并方法 - 无音频合并")
            try:
                concat_file = os.path.join(temp_dir, "concat_list.txt")
                video_paths_only = [video["path"] for video in processed_videos]
                create_ffmpeg_concat_file(video_paths_only, concat_file)

                backup_cmd = [
                    'ffmpeg', '-y',
                    '-f', 'concat',
                    '-safe', '0',
                    '-i', concat_file,
                    '-c:v', 'copy',
                    '-an',  # no audio
                    output_video_path
                ]

                subprocess.run(backup_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                logger.warning("使用备用方法(无音频)成功合并视频")
                return output_video_path
            except Exception as backup_error:
                logger.error(f"备用合并方法也失败: {str(backup_error)}")
                raise RuntimeError(f"无法合并视频: {str(backup_error)}")

    except Exception as e:
        logger.error(f"合并视频时出错: {str(e)}")
        raise
    finally:
        # Remove the temp directory; a cleanup failure is logged, never raised
        try:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
                logger.info("已清理临时文件")
        except Exception as e:
            logger.warning(f"清理临时文件时出错: {str(e)}")
class BasePrompt(ABC):
    """Abstract base class of all prompts.

    Holds the prompt's metadata and implements parameter validation,
    rendering and serialisation. Subclasses supply the template text through
    :meth:`get_template` and may set ``_system_prompt`` / ``_examples``.
    """

    def __init__(self, metadata: PromptMetadata):
        self.metadata = metadata
        self._template = None
        self._system_prompt = None
        self._examples = []

    @property
    def name(self) -> str:
        """Prompt name, taken from the metadata."""
        return self.metadata.name

    @property
    def category(self) -> str:
        """Prompt category, taken from the metadata."""
        return self.metadata.category

    @property
    def version(self) -> str:
        """Prompt version, taken from the metadata."""
        return self.metadata.version

    @property
    def model_type(self) -> ModelType:
        """Model type this prompt targets."""
        return self.metadata.model_type

    @property
    def output_format(self) -> OutputFormat:
        """Output format the prompt asks the model for."""
        return self.metadata.output_format

    @abstractmethod
    def get_template(self) -> str:
        """Return the raw prompt template."""
        pass

    def get_system_prompt(self) -> Optional[str]:
        """Return the system prompt, or ``None`` if the subclass set none."""
        return self._system_prompt

    def get_examples(self) -> List[str]:
        """Return a copy of the example list, so callers cannot mutate it."""
        return self._examples.copy()

    def validate_parameters(self, parameters: Dict[str, Any]) -> bool:
        """Check that every parameter declared in the metadata was provided.

        Returns:
            ``True`` when nothing is missing.

        Raises:
            TemplateRenderError: listing the missing parameters in the order
                they are declared in the metadata.
        """
        # Declaration order (de-duplicated) keeps the error message stable
        # between runs; a set difference would not.
        missing_params = [
            param for param in dict.fromkeys(self.metadata.parameters)
            if param not in parameters
        ]

        if missing_params:
            from .exceptions import TemplateRenderError
            raise TemplateRenderError(
                template_name=self.name,
                error_message="缺少必需参数",
                missing_params=missing_params
            )

        return True

    def render(self, parameters: Optional[Dict[str, Any]] = None) -> str:
        """Render the template with ``parameters``.

        Args:
            parameters: template parameters; ``None`` is treated as ``{}``.

        Returns:
            The rendered prompt text.

        Raises:
            TemplateRenderError: if a declared parameter is missing or the
                renderer fails; in the latter case the original exception is
                attached as ``__cause__``.
        """
        parameters = parameters or {}

        # Validate only when the prompt declares parameters
        if self.metadata.parameters:
            self.validate_parameters(parameters)

        # Render with the package's own template renderer
        template = self.get_template()
        try:
            from .template import get_renderer
            renderer = get_renderer()
            return renderer.render(template, parameters)
        except Exception as e:
            from .exceptions import TemplateRenderError
            raise TemplateRenderError(
                template_name=self.name,
                error_message=f"模板渲染错误: {str(e)}",
                missing_params=[]
            ) from e

    def to_dict(self) -> Dict[str, Any]:
        """Serialise metadata, template, system prompt and examples to a dict.

        Enum fields are exported as their values and timestamps as ISO
        strings. List fields are copied so that editing the exported dict
        does not change the prompt.
        """
        return {
            "metadata": {
                "name": self.metadata.name,
                "category": self.metadata.category,
                "version": self.metadata.version,
                "description": self.metadata.description,
                "model_type": self.metadata.model_type.value,
                "output_format": self.metadata.output_format.value,
                "author": self.metadata.author,
                "created_at": self.metadata.created_at.isoformat(),
                "updated_at": self.metadata.updated_at.isoformat(),
                "tags": list(self.metadata.tags),
                "parameters": list(self.metadata.parameters)
            },
            "template": self.get_template(),
            "system_prompt": self.get_system_prompt(),
            "examples": self.get_examples()
        }
class ParameterizedPrompt(BasePrompt):
    """Prompt whose required parameters can be extended at construction time."""

    def __init__(self, metadata: PromptMetadata, required_parameters: Optional[List[str]] = None):
        """
        Args:
            metadata: prompt metadata
            required_parameters: extra parameter names to require in addition
                to ``metadata.parameters``; duplicates are dropped
        """
        super().__init__(metadata)
        if required_parameters:
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # parameter list is deterministic (list(set(...)) is not). A new
            # list is assigned instead of extending in place, so a list object
            # the caller still holds is left untouched.
            self.metadata.parameters = list(
                dict.fromkeys([*self.metadata.parameters, *required_parameters])
            )
version="v1.0", description="分析纪录片视频关键帧,提取画面内容和场景描述", model_type=ModelType.VISION, output_format=OutputFormat.JSON, tags=["纪录片", "视频分析", "关键帧", "画面描述"], parameters=["video_theme", "custom_instructions"] ) super().__init__(metadata) self._system_prompt = "你是一名专业的视频内容分析师,擅长分析纪录片视频帧内容,提取关键信息和场景描述。" def get_template(self) -> str: return """请仔细分析这些视频关键帧图片,我需要你提供详细的画面分析。 视频主题:${video_theme} 分析要求: 1. 按时间顺序分析每一帧画面 2. 详细描述画面中的主要内容、人物、物体、环境 3. 注意画面的构图、色彩、光线等视觉元素 4. 识别画面中的关键动作或变化 5. 提供准确的时间戳信息 ${custom_instructions} 请按照以下JSON格式输出分析结果: { "analysis": [ { "timestamp": "00:00:05,390", "picture": "详细的画面描述,包括场景、人物、物体、动作等", "scene_type": "场景类型(如:建造、准备、完成等)", "key_elements": ["关键元素1", "关键元素2"], "visual_quality": "画面质量描述(构图、光线、色彩等)" } ], "summary": "整体视频内容概述", "total_frames": "分析的帧数" } 重要要求: 1. 只输出JSON格式,不要添加任何其他文字或代码块标记 2. 画面描述要详细准确,为后续解说文案生成提供充分信息 3. 时间戳必须准确对应视频帧 4. 严禁虚构不存在的内容""" ================================================ FILE: app/services/prompts/documentary/narration_generation.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ @Project: 逐帧解说-文案生成 @File : narration_generation.py @Author : viccy同学 @Date : 2025/1/7 @Description: 通用短视频解说文案生成提示词(优化版v2.0) """ from ..base import TextPrompt, PromptMetadata, ModelType, OutputFormat class NarrationGenerationPrompt(TextPrompt): """通用短视频解说文案生成提示词""" def __init__(self): metadata = PromptMetadata( name="narration_generation", category="documentary", version="v2.0", description="根据视频帧分析结果生成病毒式传播短视频解说文案,适用于各类题材内容", model_type=ModelType.TEXT, output_format=OutputFormat.JSON, tags=["短视频", "解说文案", "病毒传播", "文案生成", "通用模板"], parameters=["video_frame_description"] ) super().__init__(metadata) self._system_prompt = "你是一名资深的短视频解说导演和编剧,深谙病毒式传播规律和用户心理,擅长创作让人停不下来的高粘性解说内容。" def get_template(self) -> str: return """作为一名短视频解说导演,你需要深入理解病毒式传播的核心规律。以下是爆款短视频解说的核心技巧: ## 黄金三秒法则 开头 3 秒决定用户是否继续观看,必须立即抓住注意力。 ## 十大爆款开头钩子类型: 1. **悬念式**:"你绝对想不到接下来会发生什么..." 2. **反转式**:"所有人都以为...但真相却是..." 3. 
**数字冲击**:"仅用 3 步/5 分钟/1 个技巧..." 4. **痛点切入**:"还在为...发愁吗?" 5. **惊叹式**:"太震撼了!这才是..." 6. **疑问引导**:"为什么...?答案让人意外" 7. **对比冲突**:"新手 VS 高手,差距竟然这么大" 8. **秘密揭露**:"内行人才知道的..." 9. **情感共鸣**:"有多少人和我一样..." 10. **颠覆认知**:"原来我们一直都错了..." ## 解说文案核心要素: - **节奏感**:短句为主,控制在 15-20 字/句,朗朗上口 - **画面感**:用具体动作和细节描述,避免抽象概念 - **情绪起伏**:制造期待、惊喜、满足的情绪曲线 - **信息密度**:每 5-10 秒一个信息点,保持新鲜感 - **口语化**:像朋友聊天,避免书面语和专业术语 - **留白艺术**:关键时刻停顿,让画面说话 ## 结构范式: 【开头】钩子引入(0-3秒)→ 【发展】情节推进(3-30秒)→ 【高潮】惊艳时刻(30-45秒)→ 【收尾】强化记忆/引导互动(45-60秒) ${video_frame_description} 现在,请基于 中的视频内容,创作一段符合病毒式传播规律的解说文案。 **创作步骤:** 1. 分析视频主题和核心亮点 2. 选择最适合的开头钩子类型 3. 提炼每个画面的最吸引人的细节 4. 设计情绪曲线和节奏变化 5. 确保解说与画面高度同步 **必须遵循的创作原则:** - 开头 3 秒必须使用钩子技巧,立即抓住注意力 - 每句话控制在 15-20 字,确保节奏明快 - 用动词和具体细节描述,增强画面感 - 制造悬念和期待,让用户想看到最后 - 在关键视觉高潮处,适当留白让画面说话 - 结尾呼应开头,强化记忆点或引导互动 请使用以下 JSON 格式输出: { "items": [ { "_id": 1, "timestamp": "00:00:05,390-00:00:10,430", "picture": "画面描述", "narration": "解说文案" } ] } 1. 只输出 JSON 内容,不要输出其他任何说明性文字 2. 解说文案的语言使用简体中文 3. 严禁虚构画面,所有画面描述只能从 中提取 4. 严禁虚构时间戳,所有时间戳只能从 中提取 5. 开头必须使用钩子技巧,遵循黄金三秒法则 6. 每个片段的解说文案要与画面内容精准匹配 7. 保持解说的连贯性、故事性和节奏感 8. 控制单句长度在 15-20 字,确保口语化表达 9. 在视觉高潮处适当精简文案,让画面自己说话 10. 
class PromptError(Exception):
    """Root of the prompt-management exception hierarchy."""


class PromptNotFoundError(PromptError):
    """Raised when no prompt matches the requested category/name/version."""

    def __init__(self, category: str, name: str, version: str = None):
        self.category, self.name, self.version = category, name, version

        # The version is mentioned only when one was asked for.
        version_suffix = f" (版本: {version})" if version else ""
        super().__init__(f"提示词未找到: {category}.{name}{version_suffix}")


class PromptValidationError(PromptError):
    """Raised when prompt output fails validation; carries the error list."""

    def __init__(self, message: str, validation_errors: list = None):
        self.validation_errors = validation_errors if validation_errors else []
        super().__init__(message)


class TemplateRenderError(PromptError):
    """Raised when a template cannot be rendered."""

    def __init__(self, template_name: str, error_message: str, missing_params: list = None):
        self.template_name = template_name
        self.error_message = error_message
        self.missing_params = missing_params if missing_params else []

        parts = [f"模板渲染失败 '{template_name}': {error_message}"]
        if missing_params:
            # Name the missing parameters when the caller supplied them.
            parts.append(f" (缺少参数: {', '.join(missing_params)})")
        super().__init__("".join(parts))


class PromptRegistrationError(PromptError):
    """Raised when a prompt cannot be registered."""

    def __init__(self, category: str, name: str, reason: str):
        self.category, self.name, self.reason = category, name, reason
        super().__init__(f"提示词注册失败 {category}.{name}: {reason}")


class PromptVersionError(PromptError):
    """Raised for a problem with a specific prompt version."""

    def __init__(self, category: str, name: str, version: str, reason: str):
        self.category, self.name = category, name
        self.version, self.reason = version, reason
        super().__init__(f"提示词版本错误 {category}.{name} v{version}: {reason}")
#!/usr/bin/env python # -*- coding: UTF-8 -*- """ @Project: NarratoAI @File : manager.py @Author : viccy同学 @Date : 2025/1/7 @Description: 提示词管理器 """ from typing import Dict, Any, List, Optional, Union from loguru import logger from .base import BasePrompt, ModelType, OutputFormat from .registry import get_registry from .template import get_renderer from .validators import PromptOutputValidator from .exceptions import ( PromptNotFoundError, PromptValidationError, TemplateRenderError ) class PromptManager: """提示词管理器 - 统一的提示词管理接口""" def __init__(self): self._registry = get_registry() self._renderer = get_renderer() @classmethod def get_prompt(cls, category: str, name: str, version: Optional[str] = None, parameters: Optional[Dict[str, Any]] = None) -> str: """ 获取渲染后的提示词 Args: category: 分类 name: 名称 version: 版本(可选,默认使用最新版本) parameters: 模板参数(可选) Returns: 渲染后的提示词字符串 """ instance = cls() prompt_obj = instance._registry.get(category, name, version) try: rendered = prompt_obj.render(parameters) logger.debug(f"提示词渲染成功: {category}.{name}-{prompt_obj.version}") return rendered except Exception as e: logger.error(f"提示词渲染失败: {category}.{name} - {str(e)}") raise @classmethod def get_prompt_object(cls, category: str, name: str, version: Optional[str] = None) -> BasePrompt: """ 获取提示词对象 Args: category: 分类 name: 名称 version: 版本(可选) Returns: 提示词对象 """ instance = cls() return instance._registry.get(category, name, version) @classmethod def register_prompt(cls, prompt: BasePrompt, is_default: bool = True) -> None: """ 注册提示词 Args: prompt: 提示词对象 is_default: 是否设为默认版本 """ instance = cls() instance._registry.register(prompt, is_default) @classmethod def list_categories(cls) -> List[str]: """列出所有分类""" instance = cls() return instance._registry.list_categories() @classmethod def list_prompts(cls, category: str) -> List[str]: """列出指定分类下的所有提示词""" instance = cls() return instance._registry.list_prompts(category) @classmethod def list_versions(cls, category: str, name: str) -> List[str]: 
"""列出指定提示词的所有版本""" instance = cls() return instance._registry.list_versions(category, name) @classmethod def exists(cls, category: str, name: str, version: Optional[str] = None) -> bool: """检查提示词是否存在""" instance = cls() return instance._registry.exists(category, name, version) @classmethod def search_prompts(cls, keyword: str = None, category: str = None, model_type: ModelType = None, output_format: OutputFormat = None) -> List[Dict[str, str]]: """ 搜索提示词 Args: keyword: 关键词 category: 分类过滤 model_type: 模型类型过滤 output_format: 输出格式过滤 Returns: 匹配的提示词列表 """ instance = cls() results = instance._registry.search(keyword, category, model_type, output_format) return [ { "category": cat, "name": name, "version": ver, "full_name": f"{cat}.{name}", "identifier": f"{cat}.{name}@{ver}" } for cat, name, ver in results ] @classmethod def get_stats(cls) -> Dict[str, Any]: """获取统计信息""" instance = cls() registry_stats = instance._registry.get_stats() return { "registry": registry_stats, "categories": cls.list_categories(), "total_categories": registry_stats["categories"], "total_prompts": registry_stats["prompts"], "total_versions": registry_stats["versions"] } @classmethod def validate_output(cls, output: Union[str, Dict], category: str, name: str, version: Optional[str] = None) -> Any: """ 验证提示词输出 Args: output: 输出内容 category: 提示词分类 name: 提示词名称 version: 提示词版本 Returns: 验证后的数据 """ instance = cls() prompt_obj = instance._registry.get(category, name, version) # 根据输出格式进行验证 output_format = prompt_obj.metadata.output_format try: if output_format == OutputFormat.JSON: # 特殊处理解说文案和剧情分析 if "narration" in name.lower() or "script" in name.lower(): return PromptOutputValidator.validate_narration_script(output) elif "plot" in name.lower() or "analysis" in name.lower(): return PromptOutputValidator.validate_plot_analysis(output) else: return PromptOutputValidator.validate_json(output) else: return PromptOutputValidator.validate_by_format(output, output_format) except Exception as e: 
logger.error(f"输出验证失败 {category}.{name}: {str(e)}") raise PromptValidationError(f"输出验证失败: {str(e)}") @classmethod def get_prompt_info(cls, category: str, name: str, version: Optional[str] = None) -> Dict[str, Any]: """ 获取提示词详细信息 Args: category: 分类 name: 名称 version: 版本 Returns: 提示词详细信息 """ instance = cls() prompt_obj = instance._registry.get(category, name, version) return { "metadata": { "name": prompt_obj.metadata.name, "category": prompt_obj.metadata.category, "version": prompt_obj.metadata.version, "description": prompt_obj.metadata.description, "model_type": prompt_obj.metadata.model_type.value, "output_format": prompt_obj.metadata.output_format.value, "author": prompt_obj.metadata.author, "created_at": prompt_obj.metadata.created_at.isoformat(), "updated_at": prompt_obj.metadata.updated_at.isoformat(), "tags": prompt_obj.metadata.tags, "parameters": prompt_obj.metadata.parameters }, "template_preview": prompt_obj.get_template()[:500] + "..." if len(prompt_obj.get_template()) > 500 else prompt_obj.get_template(), "system_prompt": prompt_obj.get_system_prompt(), "examples_count": len(prompt_obj.get_examples()), "has_parameters": bool(prompt_obj.metadata.parameters) } @classmethod def export_prompts(cls, category: Optional[str] = None) -> Dict[str, Any]: """ 导出提示词配置 Args: category: 分类过滤(可选) Returns: 提示词配置数据 """ instance = cls() categories = [category] if category else instance._registry.list_categories() export_data = { "version": "1.0.0", "exported_at": instance._get_current_time(), "categories": {} } for cat in categories: export_data["categories"][cat] = {} prompts = instance._registry.list_prompts(cat) for prompt_name in prompts: versions = instance._registry.list_versions(cat, prompt_name) export_data["categories"][cat][prompt_name] = {} for ver in versions: prompt_obj = instance._registry.get(cat, prompt_name, ver) export_data["categories"][cat][prompt_name][ver] = prompt_obj.to_dict() return export_data def _get_current_time(self) -> str: """获取当前时间字符串""" 
class PromptRegistry:
    """In-memory registry of prompts, keyed by category, name and version.

    Every prompt name also has one default version, which is used when a
    lookup does not specify a version.
    """

    def __init__(self):
        # Storage layout: {category: {name: {version: prompt}}}
        self._prompts: Dict[str, Dict[str, Dict[str, BasePrompt]]] = defaultdict(
            lambda: defaultdict(dict)
        )
        # Default version mapping: {category: {name: default_version}}
        self._default_versions: Dict[str, Dict[str, str]] = defaultdict(dict)

    @staticmethod
    def _version_sort_key(version: str) -> Tuple[Tuple[int, ...], str]:
        """Return a key that orders version strings numerically ("v10.0" > "v9.0")."""
        parts = version.lstrip("vV").split(".")
        numbers = tuple(int(part) for part in parts if part.isdecimal())
        # The raw string breaks ties between versions with equal numeric parts.
        return numbers, version

    def register(self, prompt: BasePrompt, is_default: bool = True) -> None:
        """
        Register a prompt.

        Args:
            prompt: Prompt instance; its ``category``, ``name`` and
                ``version`` attributes form the registry key.
            is_default: Whether this version becomes the default. The first
                version registered for a name becomes the default regardless.

        Raises:
            PromptRegistrationError: If the same version is already registered.
        """
        category = prompt.category
        name = prompt.name
        version = prompt.version

        if version in self._prompts[category][name]:
            raise PromptRegistrationError(
                category=category,
                name=name,
                reason=f"版本 {version} 已存在"
            )

        self._prompts[category][name][version] = prompt

        if is_default or name not in self._default_versions[category]:
            self._default_versions[category][name] = version

        # Logged at debug level to keep start-up output quiet.
        logger.debug(f"已注册提示词: {category}.{name} v{version}")

    def get(self, category: str, name: str, version: Optional[str] = None) -> BasePrompt:
        """
        Look up a prompt.

        Args:
            category: Prompt category.
            name: Prompt name.
            version: Version to fetch; ``None`` selects the default version.

        Returns:
            The registered prompt instance.

        Raises:
            PromptNotFoundError: If the category, name, default version or
                requested version is not registered.
        """
        if category not in self._prompts:
            raise PromptNotFoundError(category, name, version)

        if name not in self._prompts[category]:
            raise PromptNotFoundError(category, name, version)

        if version is None:
            if name not in self._default_versions[category]:
                raise PromptNotFoundError(category, name, version)
            version = self._default_versions[category][name]

        if version not in self._prompts[category][name]:
            raise PromptNotFoundError(category, name, version)

        return self._prompts[category][name][version]

    def list_categories(self) -> List[str]:
        """Return all categories currently present in the registry."""
        return list(self._prompts.keys())

    def list_prompts(self, category: str) -> List[str]:
        """Return the prompt names in ``category`` (empty list if unknown)."""
        if category not in self._prompts:
            return []
        return list(self._prompts[category].keys())

    def list_versions(self, category: str, name: str) -> List[str]:
        """Return the versions registered for a prompt (empty list if unknown)."""
        if category not in self._prompts or name not in self._prompts[category]:
            return []
        return list(self._prompts[category][name].keys())

    def get_default_version(self, category: str, name: str) -> Optional[str]:
        """Return the default version of a prompt, or ``None`` if there is none."""
        return self._default_versions.get(category, {}).get(name)

    def set_default_version(self, category: str, name: str, version: str) -> None:
        """
        Make an already registered version the default.

        Raises:
            PromptVersionError: If that version is not registered.
        """
        if (category not in self._prompts or
                name not in self._prompts[category] or
                version not in self._prompts[category][name]):
            raise PromptVersionError(category, name, version, "版本不存在")

        self._default_versions[category][name] = version
        logger.info(f"已设置默认版本: {category}.{name} -> v{version}")

    def exists(self, category: str, name: str, version: Optional[str] = None) -> bool:
        """Return ``True`` when ``get`` would succeed for the same arguments."""
        try:
            self.get(category, name, version)
            return True
        except PromptNotFoundError:
            return False

    def remove(self, category: str, name: str, version: Optional[str] = None) -> None:
        """
        Remove one version of a prompt, or all versions when ``version`` is ``None``.

        Removing something that is not registered is a silent no-op. When the
        removed version was the default, the highest remaining version
        (compared numerically) becomes the new default. When no version
        remains, the name entry and its default mapping are dropped so the
        prompt no longer shows up in ``list_prompts`` or ``get_stats``.
        """
        # .get() avoids creating empty defaultdict entries for unknown keys.
        versions = self._prompts.get(category, {}).get(name)
        if versions is None:
            return
        defaults = self._default_versions.get(category, {})

        if version is None:
            del self._prompts[category][name]
            defaults.pop(name, None)
            logger.info(f"已移除提示词所有版本: {category}.{name}")
            return

        if version not in versions:
            return

        del versions[version]

        if not versions:
            # Last version gone: drop the empty entry and the stale default.
            del self._prompts[category][name]
            defaults.pop(name, None)
        elif defaults.get(name) == version:
            # A plain max() would compare strings and rank "v9.0" above "v10.0".
            new_default = max(versions, key=self._version_sort_key)
            self._default_versions[category][name] = new_default
            logger.info(f"默认版本已更新: {category}.{name} -> v{new_default}")

        logger.info(f"已移除提示词版本: {category}.{name} v{version}")

    def search(self,
               keyword: str = None,
               category: str = None,
               model_type: ModelType = None,
               output_format: OutputFormat = None) -> List[Tuple[str, str, str]]:
        """
        Search registered prompts.

        Args:
            keyword: Case-insensitive substring matched against the prompt
                name and the metadata description.
            category: Restrict the search to this category.
            model_type: Keep only prompts with this model type.
            output_format: Keep only prompts with this output format.

        Returns:
            Matching prompts as ``(category, name, version)`` tuples.
        """
        results = []
        needle = keyword.lower() if keyword else None
        categories = [category] if category else list(self._prompts)

        for cat in categories:
            # .get() keeps an unknown category from being created as a side
            # effect of indexing the defaultdict.
            for name, versions in self._prompts.get(cat, {}).items():
                for version, prompt in versions.items():
                    metadata = prompt.metadata

                    if needle:
                        description = metadata.description or ""
                        if (needle not in name.lower() and
                                needle not in description.lower()):
                            continue

                    if model_type and metadata.model_type != model_type:
                        continue

                    if output_format and metadata.output_format != output_format:
                        continue

                    results.append((cat, name, version))

        return results

    def get_stats(self) -> Dict[str, int]:
        """Return the number of categories, prompt names and versions."""
        total_prompts = 0
        total_versions = 0

        for names in self._prompts.values():
            total_prompts += len(names)
            total_versions += sum(len(versions) for versions in names.values())

        return {
            "categories": len(self._prompts),
            "prompts": total_prompts,
            "versions": total_versions
        }
def register_prompts():
    """Register the short-drama editing prompts as default versions.

    The subtitle-analysis prompt is registered first, then the
    plot-extraction prompt.
    """
    for prompt_class in (SubtitleAnalysisPrompt, PlotExtractionPrompt):
        PromptManager.register_prompt(prompt_class(), is_default=True)
连贯性原则(最重要) **目标:观众看完剪辑后能理解完整故事** - **包含必要上下文**:每个片段要包含理解该情节所需的前置信息 - **自然的开始点**:片段开头应该是一个自然的场景切入点 - **完整的结束点**:片段结尾应该是一个自然的收尾或转折点 - **衔接考虑**:考虑与前后片段的衔接是否流畅 ### 2. 时间段技术规范(绝对不能违反) **时间戳规则**: - 格式必须为:`xx:xx:xx,xxx-xx:xx:xx,xxx` - 必须与字幕中的时间戳精确匹配 - **严禁时间段重叠**:任意两个片段的时间不能有交集 - **严格按时间顺序**:后一个片段的开始时间必须大于前一个片段的结束时间 **时长控制**: - 单个片段建议时长:10-60秒 - 过短(<5秒):信息不完整,观众无法理解 - 过长(>90秒):节奏拖沓,失去混剪意义 ### 3. 内容完整性原则 每个片段必须包含: - **情节的起因**:为什么会发生这件事 - **情节的经过**:具体发生了什么 - **情节的结果/转折**:导致了什么后果 ### 4. 过渡片段策略 如果两个关键情节之间跳跃太大,需要: - 适当延长前一个片段的结尾 - 或适当提前后一个片段的开始 - 确保观众能理解剧情是如何发展到下一个阶段的 ## 输出格式 请严格按照以下JSON格式输出: { "plot_points": [ { "sequence": 1, "timestamp": "00:01:23,456-00:02:15,789", "title": "情节标题", "narrative_function": "开端/发展/高潮/结局", "picture": "详细的画面和剧情描述,包括:场景环境、人物状态、关键对话、动作行为、情感表现", "context_before": "这个片段之前发生了什么(简述)", "context_after": "这个片段之后会发生什么(简述)", "transition_note": "与下一个片段的衔接说明" } ], "editing_notes": { "total_duration": "所有片段的总时长估算", "pacing_suggestion": "节奏建议(如:开头可稍慢,高潮处加快)", "potential_gaps": ["可能存在的剧情跳跃点及建议处理方式"] } } ## 质量检查清单 输出前请逐项检查: **时间戳检查**: - [ ] 所有时间戳都存在于原始字幕中 - [ ] 时间段之间没有重叠 - [ ] 时间段按顺序排列(后一个开始 > 前一个结束) **连贯性检查**: - [ ] 第一个片段是否交代了必要的背景? - [ ] 相邻片段之间的剧情跳跃是否可以理解? - [ ] 最后一个片段是否给出了某种结局或悬念? **完整性检查**: - [ ] 每个片段是否包含完整的小情节(起因-经过-结果)? - [ ] 观众只看这些片段能否理解整个故事的主线? ## 重要限制 1. **严禁虚构时间戳**:所有时间必须来自字幕 2. **严禁时间重叠**:这会导致剪辑出现重复画面 3. **严禁打乱顺序**:必须按剧情发展的时间线排列 4. 
**只输出JSON**:不要添加任何说明文字""" ================================================ FILE: app/services/prompts/short_drama_editing/subtitle_analysis.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ @Project: 短剧混剪-剧情分析 @File : subtitle_analysis.py @Author : viccy同学 @Date : 2025/1/7 @Description: 短剧字幕分析提示词 - 优化版本v2.0 """ from ..base import TextPrompt, PromptMetadata, ModelType, OutputFormat class SubtitleAnalysisPrompt(TextPrompt): """短剧字幕分析提示词 - 优化版本""" def __init__(self): metadata = PromptMetadata( name="subtitle_analysis", category="short_drama_editing", version="v2.0", description="分析短剧字幕内容,提取完整叙事结构和关键情节点,确保剧情连贯性", model_type=ModelType.TEXT, output_format=OutputFormat.JSON, tags=["短剧", "字幕分析", "剧情梗概", "情节提取", "叙事结构", "连贯性"], parameters=["subtitle_content", "custom_clips"] ) super().__init__(metadata) self._system_prompt = "你是一名资深短剧编剧和剪辑师,精通叙事结构和剧情节奏把控,擅长从字幕中提取能够形成完整故事线的关键情节。" def get_template(self) -> str: return """# 短剧混剪剧情分析任务 ## 任务目标 分析短剧字幕,提取能够组成**连贯完整故事**的关键情节点。最终混剪视频必须让观众能够理解剧情发展脉络。 ## 字幕内容 ${subtitle_content} ## 分析要求 ### 1. 叙事结构分析(必须完成) 按照经典叙事结构识别剧情阶段: - **开端(Setup)**:人物出场、背景交代、初始状态 - **发展(Rising Action)**:冲突引入、矛盾升级 - **高潮(Climax)**:核心冲突爆发、情感顶点 - **结局(Resolution)**:冲突解决、结果呈现 ### 2. 关键情节点选择原则 需要选择 ${custom_clips} 个情节点,必须遵循: **连贯性原则(最重要)**: - 情节点必须能串联成完整故事线 - 相邻情节点之间要有逻辑关联 - 观众看完后能理解"发生了什么" **覆盖性原则**: - 必须包含开端(至少1个):交代人物和背景 - 必须包含发展(至少1-2个):展示冲突升级 - 必须包含高潮(至少1个):最具张力的时刻 - 建议包含结局(如有):给观众交代 **戏剧性原则**: - 优先选择情感强烈的时刻 - 优先选择冲突明显的场景 - 优先选择有视觉冲击力的画面 ### 3. 
def register_prompts():
    """Register the short-drama narration prompts as default versions.

    The plot-analysis prompt is registered first, then the
    script-generation prompt.
    """
    for prompt_class in (PlotAnalysisPrompt, ScriptGenerationPrompt):
        PromptManager.register_prompt(prompt_class(), is_default=True)
import TextPrompt, PromptMetadata, ModelType, OutputFormat class PlotAnalysisPrompt(TextPrompt): """短剧剧情分析提示词""" def __init__(self): metadata = PromptMetadata( name="plot_analysis", category="short_drama_narration", version="v1.0", description="分析短剧字幕内容,提供详细的剧情分析和分段解析", model_type=ModelType.TEXT, output_format=OutputFormat.TEXT, tags=["短剧", "剧情分析", "字幕解析", "分段分析"], parameters=["subtitle_content"] ) super().__init__(metadata) self._system_prompt = "你是一位专业的剧本分析师和剧情概括助手。" def get_template(self) -> str: return """# 角色 你是一位专业的剧本分析师和剧情概括助手。 # 任务 我将为你提供一部短剧的完整字幕文本。请你基于这些字幕,完成以下任务: 1. **整体剧情分析**:简要概括整个短剧的核心剧情脉络、主要冲突和结局(如果有的话)。 2. **分段剧情解析与时间戳定位**: * 将整个短剧划分为若干个关键的剧情段落(例如:开端、发展、转折、高潮、结局,或根据具体情节自然划分)。 * 段落数应该与字幕长度成正比。 * 对于每一个剧情段落: * **概括该段落的主要内容**:用简洁的语言描述这段剧情发生了什么。 * **标注对应的时间戳范围**:明确指出该剧情段落对应的开始字幕时间戳和结束字幕时间戳。请直接从字幕中提取时间信息。 # 输入格式 字幕内容通常包含时间戳和对话,例如: ``` 00:00:05,000 --> 00:00:10,000 [角色A]: 你好吗? 00:00:10,500 --> 00:00:15,000 [角色B]: 我很好,谢谢。发生了一些有趣的事情。 ... (更多字幕内容) ... ``` 我将把实际字幕粘贴在下方。 # 输出格式要求 请按照以下格式清晰地呈现分析结果: **一、整体剧情概括:** [此处填写对整个短剧剧情的概括] **二、分段剧情解析:** **剧情段落 1:[段落主题/概括,例如:主角登场与背景介绍]** * **时间戳:** [开始时间戳] --> [结束时间戳] * **内容概要:** [对这段剧情的详细描述] **剧情段落 2:[段落主题/概括,例如:第一个冲突出现]** * **时间戳:** [开始时间戳] --> [结束时间戳] * **内容概要:** [对这段剧情的详细描述] ... (根据实际剧情段落数量继续) ... **剧情段落 N:[段落主题/概括,例如:结局与反思]** * **时间戳:** [开始时间戳] --> [结束时间戳] * **内容概要:** [对这段剧情的详细描述] # 注意事项 * 请确保时间戳的准确性,直接引用字幕中的时间。 * 剧情段落的划分应合乎逻辑,能够反映剧情的起承转合。 * 语言表达应简洁、准确、客观。 # 限制 1. 严禁输出与分析结果无关的内容 2. 
时间戳必须严格按照字幕中的实际时间 # 请处理以下字幕: ${subtitle_content}""" ================================================ FILE: app/services/prompts/short_drama_narration/script_generation.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ @Project: 短剧解说-文案画面匹配 @File : script_generation.py @Author : viccy同学 @Date : 2025/1/7 @Description: 短剧解说脚本生成提示词 - 优化版本 """ from ..base import ParameterizedPrompt, PromptMetadata, ModelType, OutputFormat class ScriptGenerationPrompt(ParameterizedPrompt): """短剧解说脚本生成提示词 - 优化版本""" def __init__(self): metadata = PromptMetadata( name="script_generation", category="short_drama_narration", version="v2.0", description="基于短剧解说创作核心要素,生成高质量解说脚本,包含黄金开场、爽点放大、个性吐槽等专业技巧", model_type=ModelType.TEXT, output_format=OutputFormat.JSON, tags=["短剧", "解说脚本", "文案生成", "原声片段", "黄金开场", "爽点放大", "个性吐槽", "悬念预埋"], parameters=["drama_name", "plot_analysis", "subtitle_content"] ) super().__init__(metadata, required_parameters=["drama_name", "plot_analysis"]) self._system_prompt = "你是一位顶级的短剧解说up主,精通短视频创作的所有核心技巧。你必须严格按照JSON格式输出,绝不能包含任何其他文字、说明或代码块标记。" def get_template(self) -> str: return """# 短剧解说脚本创作任务 ## 任务目标 我是一位专业的短剧解说up主,需要为短剧《${drama_name}》创作一份高质量的解说脚本。目标是让观众在短时间内了解剧情精华,并产生强烈的继续观看欲望。 ## 素材信息 ### 剧情概述 ${plot_analysis} ### 原始字幕(含精确时间戳) ${subtitle_content} ## 短剧解说创作核心要素 ### 1. 黄金开场(3秒法则) **开头3秒内必须制造强烈钩子,激发"想知道后续发展"的强烈好奇心** - **悬念设置**:直接抛出最核心的冲突或疑问 * 示例:"身为一个名声恶臭的政客,他知道自己早晚会被暗杀" * 技巧:直接定性角色身份和处境,制造紧张感 - **冲突展示**:展现最激烈的对立关系 * 示例:"而这一天,就在他刚露头的时候..." * 技巧:用时间节点强调关键时刻的到来 - **情感共鸣**:触及观众内心的普遍情感 - **反转预告**:暗示即将发生的惊人转折 * 技巧:使用"没想到"、"原来"、"竟然"等词汇预告反转 ### 2. 主线提炼(去繁就简) **快节奏解说,速度超越原剧,专注核心主线** - 舍弃次要情节和配角,只保留推动主线的关键人物 - 突出核心矛盾冲突,每个片段都要推进主要故事线 - 快速跳过铺垫,直击剧情要害 - 确保每个解说片段都有明确的剧情推进作用 - **转折技巧**:大量使用"而这时"、"就在这时"、"没多久"等时间转折词 ### 3. 
爽点放大(情绪引爆) **精准识别剧中"爽点"并用富有感染力的语言放大** - **主角逆袭**:突出弱者变强、反败为胜的瞬间 - **反派被打脸**:强调恶人得到报应的痛快感 - **智商在线**:赞美角色的机智和策略 * 示例:"豺狼已经提前数日跟踪这名清洁工,并在他身上放了窃听器" * 技巧:展现角色的深谋远虑和专业能力 - **情感爆发**:放大感人、愤怒、震撼等强烈情绪 - 使用激昂语气和富有感染力的词汇调动观众情绪 ### 4. 个性吐槽(增加趣味) **以观众视角进行犀利点评,体现解说员独特人设** - 避免单纯复述剧情,要有自己的观点和态度 - **"上帝视角"分析技巧**: * 揭示角色内心:"他莫名地笑了一下" * 分析动机:"豺狼的这几步都是事先算好的" * 预判后果:"这又会有何代价呢" - 适当吐槽剧情的套路或角色的愚蠢行为 - 用幽默、犀利的语言增加观看趣味 - 站在观众立场,说出观众想说的话 - **心理活动描述**:深入角色内心,增强代入感 ### 5. 悬念预埋(引导互动) **在关键节点和结尾处"卖关子",激发互动欲望** - 在剧情高潮前停止,留下"接下来会发生什么"的疑问 - **悬念设置技巧**: * 问题抛出:"那么,UDC究竟是谁呢?" * 反转预告:"而从这句话开始,所有的专业、体面和虚伪的平静都将分崩瓦解" * 时间悬念:"几分钟后..."、"不久之后..." - 提出引导性问题:"你们觉得他会怎么做?" - 预告后续精彩:"更劲爆的还在后面" - 为后续内容预热,激发评论、点赞、关注 ### 6. 卡点配合(视听协调) **考虑文案与画面、音乐的完美结合** - 在情感高潮处预设BGM卡点 - 解说节奏要配合画面节奏 - 重要台词处保留原声,解说适时停顿 - 追求文案+画面+音乐的协同效应 ## 专业解说语言技巧 ### 1. 氛围营造技巧 **通过环境和细节描述增强画面感和代入感** - **环境描述**:"在这个距离,枪声都无法传到那边" - **细节刻画**:"他的床头有酒,身边的纸碟堆满烟头" - **氛围渲染**:"黑暗树林里有一间仓房" - **情绪描述**:"孤独又无助的豺狼,竟在这时露出了反常的一面" ### 2. 情感词汇运用 **使用富有感染力的词汇调动观众情绪** - **紧张感**:"名声恶臭"、"早晚会被暗杀"、"动用军警资源" - **神秘感**:"尘封的传奇"、"高度机密"、"暗藏玄机" - **震撼感**:"空前绝后的一枪"、"天衣无缝"、"神不知鬼不觉" - **悲伤感**:"目光非常悲伤"、"注定永远无法哀悼" ### 3. 节奏控制技巧 **通过语言节奏控制观众注意力** - **快节奏推进**:使用短句,密集信息 - **慢节奏渲染**:使用长句,详细描述 - **停顿技巧**:在关键信息前适当停顿 - **重复强调**:重要信息适当重复 ## 严格技术要求 ### 时间戳管理(绝对不能违反) - **时间戳绝对不能重叠**,确保剪辑后无重复画面 - **时间段必须连续且不交叉**,严格按时间顺序排列 - **每个时间戳都必须在原始字幕中找到对应范围** - 可以拆分原时间片段,但必须保持时间连续性 - 时间戳的格式必须与原始字幕中的格式完全一致 ### 时长控制(1/3原则) - **解说视频总长度 = 原视频长度的 1/3** - 精确控制节奏和密度,既不能过短也不能过长 - 合理分配解说和原声的时间比例 ### 剧情连贯性 - **保持故事逻辑完整**,确保情节发展自然流畅 - **严格按照时间顺序**,禁止跳跃式叙述 - **符合因果逻辑**:先发生A,再发生B,A导致B ## 原声片段使用规范 ### 原声片段格式要求 原声片段必须严格按照以下JSON格式: ```json { "_id": 序号, "timestamp": "开始时间-结束时间", "picture": "画面内容描述", "narration": "播放原片+序号", "OST": 1 } ``` ### 原声片段插入策略 #### 1. 关键情绪爆发点 **在角色强烈情绪表达时必须保留原声,增强观众代入感** - **愤怒爆发**:角色愤怒咆哮、情绪失控的瞬间 * 参考:"Come on, you bastard. 
Reaching."(愤怒对峙) - **感动落泪**:角色感动哭泣、情感宣泄的时刻 - **震惊反应**:角色震惊、不敢置信的表情和台词 * 参考:"Are you sure about that?"(质疑震惊) - **绝望崩溃**:角色绝望、崩溃的情感表达 * 参考:"Charles you're scaring me, what's wrong"(恐惧绝望) - **狂欢庆祝**:角色兴奋、狂欢的情绪高潮 #### 2. 重要对白时刻 **保留推动剧情发展的关键台词和对话** - **身份揭露**:揭示角色真实身份的重要台词 - **真相大白**:揭晓谜底、真相的关键对话 - **情感告白**:爱情告白、情感表达的重要台词 * 参考:"i'm really not good"(情感表达) - **威胁警告**:反派威胁、警告的重要对白 * 参考:"You do not want to make enemies of these people"(威胁警告) - **决定宣布**:角色做出重要决定的宣告 #### 3. 爽点瞬间 **在"爽点"时刻保留原声增强痛快感** - **主角逆袭**:弱者反击、逆转局面的台词 - **反派被打脸**:恶人得到报应、被揭穿的瞬间 - **智商碾压**:主角展现智慧、碾压对手的台词 * 参考:"That is a fucking work of art guys"(技能展示) - **正义伸张**:正义得到伸张、恶有恶报的时刻 - **实力展现**:主角展现真实实力、震撼全场 #### 4. 悬念节点 **在制造悬念或揭晓答案的关键时刻保留原声** - **悬念制造**:制造悬念、留下疑问的台词 - **答案揭晓**:揭晓答案、解开谜团的对话 - **转折预告**:暗示即将发生转折的重要台词 - **危机降临**:危机来临、紧张时刻的对白 #### 5. 经典台词时刻 **保留具有强烈感染力和记忆点的经典台词** - **哲理感悟**:角色的人生感悟和哲理思考 - **幽默调侃**:轻松幽默的对话增加趣味性 - **专业术语**:体现角色专业性的术语和对话 * 参考:"The scanner will pick up the metal components"(专业解释) - **情感共鸣**:能引起观众共鸣的经典表达 ### 原声片段技术规范 #### 格式规范 - **OST字段**:设置为1表示保留原声(解说片段设置为0) - **narration格式**:严格使用"播放原片+序号"(如"播放原片26") - **picture字段**:详细描述画面内容,便于后期剪辑参考 - **时间戳精度**:必须与字幕中的重要对白时间精确匹配 #### 比例控制 - **原声与解说比例**:7:3(原声70%,解说30%) - **分布均匀**:原声片段要在整个视频中均匀分布 - **长度适中**:单个原声片段时长控制在3-8秒 - **衔接自然**:原声片段与解说片段之间衔接自然流畅 #### 选择原则 - **情感优先**:优先选择情感强烈的台词和对话 - **剧情关键**:必须是推动剧情发展的重要内容 - **观众共鸣**:选择能引起观众共鸣的经典台词 - **视听效果**:考虑台词的声音效果和表演张力 - **代入感强**:选择能让观众产生强烈代入感的对话 ## 输出格式要求 请严格按照以下JSON格式输出,绝不添加任何其他文字、说明或代码块标记: { "items": [ { "_id": 1, "timestamp": "00:00:01,000-00:00:05,500", "picture": "女主角林小雨慌张地道歉,男主角沈墨轩冷漠地看着她", "narration": "一个普通女孩的命运即将因为一杯咖啡彻底改变!她撞到的这个男人,竟然是...", "OST": 0 }, { "_id": 2, "timestamp": "00:00:05,500-00:00:08,000", "picture": "沈墨轩质问林小雨,语气冷厉威严", "narration": "播放原片2", "OST": 1 }, { "_id": 3, "timestamp": "00:00:08,000-00:00:12,000", "picture": "林小雨惊慌失措,沈墨轩眼中闪过一丝兴趣", "narration": "霸道总裁的经典开场!一杯咖啡引发的爱情故事就这样开始了...", "OST": 0 } ] } ## 质量标准 ### 解说文案要求: - **字数控制**:每段解说文案80-150字 - **语言风格**:生动有趣,富有感染力,符合短视频观众喜好 * 
class TemplateRenderer:
    """Render prompt templates containing ``${name}``, ``$name`` and
    ``${name|filter}`` placeholders."""

    def __init__(self):
        # Filter name -> callable applied to a parameter value.
        self._custom_filters = {}

    def register_filter(self, name: str, func: callable) -> None:
        """Register ``func`` as the filter usable as ``${variable|name}``."""
        self._custom_filters[name] = func
        logger.debug(f"已注册模板过滤器: {name}")

    def render(self, template: str, parameters: Dict[str, Any] = None) -> str:
        """
        Render a template.

        Plain placeholders are substituted in a single pass, so text inserted
        for one parameter is never re-scanned for other parameters.
        ``${name|filter}`` placeholders are then resolved on the result.
        Placeholders without a matching parameter are left untouched.

        Args:
            template: Template string.
            parameters: Mapping of parameter names to values; values are
                converted with ``str()``.

        Returns:
            The rendered string.

        Raises:
            TemplateRenderError: If rendering fails for any reason.
        """
        parameters = parameters or {}

        try:
            rendered = self._substitute_parameters(template, parameters)
            rendered = self._apply_filters(rendered, parameters)
            return rendered

        except Exception as e:
            raise TemplateRenderError(
                template_name="unknown",
                error_message=f"模板渲染失败: {str(e)}"
            ) from e

    @staticmethod
    def _substitute_parameters(template: str, parameters: Dict[str, Any]) -> str:
        """Replace ``${key}`` and ``$key`` placeholders in one regex pass."""
        # Empty names are ignored: they would turn every "$" into a placeholder.
        values = {str(key): value for key, value in parameters.items() if str(key)}
        if not values:
            return template

        # Longest names first so "$plot_summary" is never read as "$plot"
        # followed by "_summary".
        alternation = "|".join(
            re.escape(key) for key in sorted(values, key=len, reverse=True)
        )
        # The bare "$key" form must not be followed by another ASCII
        # identifier character; non-ASCII text (e.g. Chinese) may follow.
        pattern = re.compile(
            r"\$\{(" + alternation + r")\}"
            r"|\$(" + alternation + r")(?![A-Za-z0-9_])"
        )

        def replace(match):
            key = match.group(1) if match.group(1) is not None else match.group(2)
            # Returned from a function, so backslashes in the value stay literal.
            return str(values[key])

        return pattern.sub(replace, template)

    def _apply_filters(self, text: str, parameters: Dict[str, Any]) -> str:
        """Resolve ``${variable|filter_name}`` placeholders.

        Unknown filters, unknown variables and failing filters are logged and
        the placeholder is left as it was.
        """
        filter_pattern = r'\$\{([^}]+)\|([^}]+)\}'

        def replace_filter(match):
            var_name = match.group(1).strip()
            filter_name = match.group(2).strip()

            if filter_name not in self._custom_filters:
                logger.warning(f"未知的过滤器: {filter_name}")
                return match.group(0)  # keep the original text

            if var_name not in parameters:
                logger.warning(f"参数不存在: {var_name}")
                return match.group(0)  # keep the original text

            try:
                filter_func = self._custom_filters[filter_name]
                filtered_value = filter_func(parameters[var_name])
                return str(filtered_value)
            except Exception as e:
                logger.error(f"过滤器执行失败 {filter_name}: {str(e)}")
                return match.group(0)  # keep the original text

        return re.sub(filter_pattern, replace_filter, text)

    def extract_variables(self, template: str) -> List[str]:
        """Return the distinct variable names used in ``${variable}`` and
        ``${variable|filter}`` placeholders (order not guaranteed)."""
        pattern = r'\$\{([^}|]+)(?:\|[^}]+)?\}'
        matches = re.findall(pattern, template)
        return list(set(match.strip() for match in matches))

    def validate_template(self, template: str, required_params: List[str] = None) -> bool:
        """
        Check that a template mentions every required parameter and renders.

        Returns:
            ``True`` when the template is valid. Any failure is logged and
            reported as ``False``, never raised.
        """
        try:
            template_vars = self.extract_variables(template)

            if required_params:
                missing_params = set(required_params) - set(template_vars)
                if missing_params:
                    raise TemplateRenderError(
                        template_name="validation",
                        error_message="模板缺少必需参数",
                        missing_params=list(missing_params)
                    )

            # Trial render with placeholder values.
            test_params = {var: f"test_{var}" for var in template_vars}
            self.render(template, test_params)

            return True

        except Exception as e:
            logger.error(f"模板验证失败: {str(e)}")
            return False
class PromptOutputValidator:
    """Validators for the raw output produced by prompts."""

    @staticmethod
    def validate_json(output: Union[str, Dict[str, Any], List[Any]],
                      schema: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Parse and validate JSON output.

        Args:
            output: Raw output string. Already parsed data (dict or list) is
                accepted as well and only checked against ``schema``.
            schema: Optional mapping of required field name to expected type.

        Returns:
            The parsed JSON value (a list when the output is a JSON array).

        Raises:
            PromptValidationError: If the output is not valid JSON or does
                not satisfy ``schema``.
        """
        try:
            if isinstance(output, (dict, list)):
                # Nothing to clean or parse.
                parsed = output
            else:
                # Strip code fences and surrounding prose before parsing.
                cleaned_output = PromptOutputValidator._clean_json_output(output)
                parsed = json.loads(cleaned_output)

            if schema:
                PromptOutputValidator._validate_json_schema(parsed, schema)

            return parsed

        except json.JSONDecodeError as e:
            raise PromptValidationError(f"JSON格式错误: {str(e)}") from e
        except Exception as e:
            raise PromptValidationError(f"JSON验证失败: {str(e)}") from e

    @staticmethod
    def validate_narration_script(output: Union[str, Dict]) -> Dict[str, Any]:
        """
        Validate a narration script.

        Args:
            output: Raw output string or already parsed data.

        Returns:
            The validated data: a non-empty ``items`` list in which every
            entry passes ``_validate_narration_item``.

        Raises:
            PromptValidationError: If the structure is invalid.
        """
        if isinstance(output, str):
            data = PromptOutputValidator.validate_json(output)
        else:
            data = output

        if "items" not in data:
            raise PromptValidationError("解说文案缺少 'items' 字段")

        items = data["items"]
        if not isinstance(items, list):
            raise PromptValidationError("'items' 字段必须是数组")

        if not items:
            raise PromptValidationError("解说文案不能为空")

        for i, item in enumerate(items):
            PromptOutputValidator._validate_narration_item(item, i)

        logger.debug(f"解说文案验证通过,包含 {len(items)} 个片段")
        return data

    @staticmethod
    def validate_plot_analysis(output: Union[str, Dict]) -> Dict[str, Any]:
        """
        Validate a plot analysis.

        Args:
            output: Raw output string or already parsed data.

        Returns:
            The validated data, containing ``summary`` and a ``plot_points``
            list in which every entry passes ``_validate_plot_point``.

        Raises:
            PromptValidationError: If the structure is invalid.
        """
        # NOTE(review): both keys are required here, but the v2.0
        # plot_extraction template asks for "plot_points" without "summary"
        # and the v2.0 subtitle_analysis template asks for "summary" without
        # "plot_points". Confirm which prompts are meant to be routed here.
        if isinstance(output, str):
            data = PromptOutputValidator.validate_json(output)
        else:
            data = output

        required_fields = ["summary", "plot_points"]
        for field in required_fields:
            if field not in data:
                raise PromptValidationError(f"剧情分析缺少 '{field}' 字段")

        plot_points = data["plot_points"]
        if not isinstance(plot_points, list):
            raise PromptValidationError("'plot_points' 字段必须是数组")

        for i, point in enumerate(plot_points):
            PromptOutputValidator._validate_plot_point(point, i)

        logger.debug(f"剧情分析验证通过,包含 {len(plot_points)} 个情节点")
        return data

    @staticmethod
    def _is_valid_json(text: str) -> bool:
        """Return True when ``text`` parses as JSON."""
        try:
            json.loads(text)
        except ValueError:
            return False
        return True

    @staticmethod
    def _clean_json_output(output: str) -> str:
        """
        Strip code-fence markers and surrounding prose from JSON output.

        Text that already parses as JSON (including a top-level array) is
        returned as is. Otherwise the widest ``{...}`` span, then the widest
        ``[...]`` span, is returned if it parses. If neither parses, the
        widest ``{...}`` span is returned so the caller reports the decode
        error for it.
        """
        # Remove ```json / ``` fence lines.
        output = re.sub(r'^```json\s*', '', output, flags=re.MULTILINE)
        output = re.sub(r'^```\s*$', '', output, flags=re.MULTILINE)
        output = output.strip()

        if PromptOutputValidator._is_valid_json(output):
            return output

        # Always cutting out "{...}" would truncate an array of objects into
        # "{...}, {...}", so each candidate is checked before being used.
        for opener, closer in (("{", "}"), ("[", "]")):
            start = output.find(opener)
            end = output.rfind(closer)
            if start != -1 and end > start:
                candidate = output[start:end + 1]
                if PromptOutputValidator._is_valid_json(candidate):
                    return candidate

        json_match = re.search(r'\{.*\}', output, re.DOTALL)
        if json_match:
            output = json_match.group(0)

        return output

    @staticmethod
    def _validate_json_schema(data: Dict[str, Any], schema: Dict[str, Any]) -> None:
        """Check that every schema field exists in ``data`` with the expected type.

        This is a minimal check of field name -> Python type, not a JSON
        Schema implementation.
        """
        for field, field_type in schema.items():
            if field not in data:
                raise PromptValidationError(f"缺少必需字段: {field}")

            if not isinstance(data[field], field_type):
                raise PromptValidationError(
                    f"字段 '{field}' 类型错误,期望: {field_type.__name__},实际: {type(data[field]).__name__}"
                )

    @staticmethod
    def _validate_narration_item(item: Dict[str, Any], index: int) -> None:
        """Validate one narration item; ``index`` is zero-based and reported
        one-based in error messages."""
        if not isinstance(item, dict):
            raise PromptValidationError(f"第 {index + 1} 个片段必须是对象")

        required_fields = ["_id", "timestamp", "picture", "narration"]
        for field in required_fields:
            if field not in item:
                raise PromptValidationError(f"第 {index + 1} 个片段缺少 '{field}' 字段")

        # bool is a subclass of int, so JSON true/false is rejected explicitly.
        item_id = item["_id"]
        if isinstance(item_id, bool) or not isinstance(item_id, int) or item_id <= 0:
            raise PromptValidationError(f"第 {index + 1} 个片段的 '_id' 必须是正整数")

        timestamp = item["timestamp"]
        if not isinstance(timestamp, str):
            raise PromptValidationError(f"第 {index + 1} 个片段的 'timestamp' 必须是字符串")

        # Expected timestamp format: HH:MM:SS,mmm-HH:MM:SS,mmm
        timestamp_pattern = r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$'
        if not re.match(timestamp_pattern, timestamp):
            raise PromptValidationError(
                f"第 {index + 1} 个片段的时间戳格式错误,应为 'HH:MM:SS,mmm-HH:MM:SS,mmm'"
            )

        for field in ["picture", "narration"]:
            if not isinstance(item[field], str) or not item[field].strip():
                raise PromptValidationError(f"第 {index + 1} 个片段的 '{field}' 不能为空")

        # OST is optional; when present it must be 0, 1 or 2.
        if "OST" in item:
            ost = item["OST"]
            if isinstance(ost, bool) or not isinstance(ost, int) or ost not in [0, 1, 2]:
                raise PromptValidationError(
                    f"第 {index + 1} 个片段的 'OST' 必须是 0、1 或 2"
                )

    @staticmethod
    def _validate_plot_point(point: Dict[str, Any], index: int) -> None:
        """Validate one plot point; ``index`` is zero-based and reported
        one-based in error messages."""
        if not isinstance(point, dict):
            raise PromptValidationError(f"第 {index + 1} 个剧情点必须是对象")

        required_fields = ["timestamp", "title", "picture"]
        for field in required_fields:
            if field not in point:
                raise PromptValidationError(f"第 {index + 1} 个剧情点缺少 '{field}' 字段")

        for field in required_fields:
            if not isinstance(point[field], str) or not point[field].strip():
                raise PromptValidationError(f"第 {index + 1} 个剧情点的 '{field}' 不能为空")

        timestamp = point["timestamp"]
        # Two timestamp formats are accepted.
        patterns = [
            r'^\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}$',  # HH:MM:SS,mmm-HH:MM:SS,mmm
            r'^\d{2}:\d{2}:\d{2}-\d{2}:\d{2}:\d{2}$',  # HH:MM:SS-HH:MM:SS
        ]

        if not any(re.match(pattern, timestamp) for pattern in patterns):
            raise PromptValidationError(
                f"第 {index + 1} 个剧情点的时间戳格式错误"
            )

    @staticmethod
    def validate_by_format(output: str,
                           format_type: OutputFormat,
                           schema: Dict[str, Any] = None) -> Any:
        """
        Validate output according to its declared format.

        Args:
            output: Raw output.
            format_type: Declared output format.
            schema: Optional schema used for JSON and structured output.

        Returns:
            Parsed JSON for ``JSON`` and ``STRUCTURED`` output, the stripped
            text for ``TEXT`` and ``MARKDOWN`` output.

        Raises:
            PromptValidationError: If the format is unsupported or the
                output is invalid.
        """
        if format_type == OutputFormat.JSON:
            return PromptOutputValidator.validate_json(output, schema)
        elif format_type == OutputFormat.TEXT:
            return output.strip()
        elif format_type == OutputFormat.MARKDOWN:
            return output.strip()
        elif format_type == OutputFormat.STRUCTURED:
            # Structured output is currently validated as JSON.
            return PromptOutputValidator.validate_json(output, schema)
        else:
            raise PromptValidationError(f"不支持的输出格式: {format_type}")
视频文件路径 video_theme: 视频主题 custom_prompt: 自定义提示词 skip_seconds: 跳过开始的秒数 threshold: 差异���值 vision_batch_size: 视觉处理批次大小 vision_llm_provider: 视觉模型提供商 progress_callback: 进度回调函数 Returns: List[Dict]: 生成的视频脚本 """ if progress_callback is None: progress_callback = lambda p, m: None try: # 提取关键帧 progress_callback(10, "正在提取关键帧...") keyframe_files = await self._extract_keyframes( video_path, skip_seconds, threshold ) # 使用统一的 LLM 接口(支持所有 provider) script = await self._process_with_llm( keyframe_files, video_theme, custom_prompt, vision_batch_size, vision_llm_provider, progress_callback ) return json.loads(script) if isinstance(script, str) else script except Exception as e: logger.exception("Generate script failed") raise async def _extract_keyframes( self, video_path: str, skip_seconds: int, threshold: int ) -> List[str]: """提取视频关键帧""" video_hash = utils.md5(video_path + str(os.path.getmtime(video_path))) video_keyframes_dir = os.path.join(self.keyframes_dir, video_hash) # 检查缓存 keyframe_files = [] if os.path.exists(video_keyframes_dir): for filename in sorted(os.listdir(video_keyframes_dir)): if filename.endswith('.jpg'): keyframe_files.append(os.path.join(video_keyframes_dir, filename)) if keyframe_files: logger.info(f"Using cached keyframes: {video_keyframes_dir}") return keyframe_files # 提取新的关键帧 os.makedirs(video_keyframes_dir, exist_ok=True) try: processor = video_processor.VideoProcessor(video_path) processor.process_video_pipeline( output_dir=video_keyframes_dir, skip_seconds=skip_seconds, threshold=threshold ) for filename in sorted(os.listdir(video_keyframes_dir)): if filename.endswith('.jpg'): keyframe_files.append(os.path.join(video_keyframes_dir, filename)) return keyframe_files except Exception as e: if os.path.exists(video_keyframes_dir): import shutil shutil.rmtree(video_keyframes_dir) raise async def _process_with_llm( self, keyframe_files: List[str], video_theme: str, custom_prompt: str, vision_batch_size: int, vision_llm_provider: str, progress_callback: 
Callable[[float, str], None] ) -> str: """使用统一 LLM 接口处理视频帧""" progress_callback(30, "正在初始化视觉分析器...") # 使用新的 LLM 迁移适配器(支持所有 provider) from app.services.llm.migration_adapter import create_vision_analyzer # 获取配置 text_provider = config.app.get('text_llm_provider', 'litellm').lower() vision_api_key = config.app.get(f'vision_{vision_llm_provider}_api_key') vision_model = config.app.get(f'vision_{vision_llm_provider}_model_name') vision_base_url = config.app.get(f'vision_{vision_llm_provider}_base_url') if not vision_api_key or not vision_model: raise ValueError(f"未配置 {vision_llm_provider} API Key 或者模型") # 创建统一的视觉分析器 analyzer = create_vision_analyzer( provider=vision_llm_provider, api_key=vision_api_key, model=vision_model, base_url=vision_base_url ) progress_callback(40, "正在分析关键帧...") # 执行异步分析 results = await analyzer.analyze_images( images=keyframe_files, prompt=config.app.get('vision_analysis_prompt'), batch_size=vision_batch_size ) progress_callback(60, "正在整理分析结果...") # 合并所有批次的分析结果 frame_analysis = "" prev_batch_files = None for result in results: if 'error' in result: logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}") continue batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size) first_timestamp, last_timestamp, _ = self._get_batch_timestamps(batch_files, prev_batch_files) # 添加带时间戳的分��结果 frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n" frame_analysis += result['response'] frame_analysis += "\n" prev_batch_files = batch_files if not frame_analysis.strip(): raise Exception("未能生成有效的帧分析结果") progress_callback(70, "正在生成脚本...") # 构建帧内容列表 frame_content_list = [] prev_batch_files = None for result in results: if 'error' in result: continue batch_files = self._get_batch_files(keyframe_files, result, vision_batch_size) _, _, timestamp_range = self._get_batch_timestamps(batch_files, prev_batch_files) frame_content = { "timestamp": timestamp_range, "picture": result['response'], "narration": "", "OST": 2 } 
frame_content_list.append(frame_content) prev_batch_files = batch_files if not frame_content_list: raise Exception("没有有效的帧内容可以处理") progress_callback(90, "正在生成文案...") # 获取文本生��配置 text_provider = config.app.get('text_llm_provider', 'gemini').lower() text_api_key = config.app.get(f'text_{text_provider}_api_key') text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = config.app.get(f'text_{text_provider}_base_url') # 根据提供商类型选择合适的处理器 if text_provider == 'gemini(openai)': # 使用OpenAI兼容的Gemini代理 from app.utils.script_generator import GeminiOpenAIGenerator generator = GeminiOpenAIGenerator( model_name=text_model, api_key=text_api_key, prompt=custom_prompt, base_url=text_base_url ) processor = ScriptProcessor( model_name=text_model, api_key=text_api_key, base_url=text_base_url, prompt=custom_prompt, video_theme=video_theme ) processor.generator = generator else: # 使用标准处理器(包括原生Gemini) processor = ScriptProcessor( model_name=text_model, api_key=text_api_key, base_url=text_base_url, prompt=custom_prompt, video_theme=video_theme ) return processor.process_frames(frame_content_list) def _get_batch_files( self, keyframe_files: List[str], result: Dict[str, Any], batch_size: int ) -> List[str]: """获取当前批次的图片文件""" batch_start = result['batch_index'] * batch_size batch_end = min(batch_start + batch_size, len(keyframe_files)) return keyframe_files[batch_start:batch_end] def _get_batch_timestamps( self, batch_files: List[str], prev_batch_files: List[str] = None ) -> tuple[str, str, str]: """获取一批文件的时间戳范围,支持毫秒级精度""" if not batch_files: logger.warning("Empty batch files") return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000" if len(batch_files) == 1 and prev_batch_files and len(prev_batch_files) > 0: first_frame = os.path.basename(prev_batch_files[-1]) last_frame = os.path.basename(batch_files[0]) else: first_frame = os.path.basename(batch_files[0]) last_frame = os.path.basename(batch_files[-1]) first_time = first_frame.split('_')[2].replace('.jpg', '') 
# Redis state management
class RedisState(BaseState):
    """Task state store that keeps each task as a Redis hash keyed by task id.

    Every field is written as ``str(value)``; ``get_task`` converts the stored
    byte strings back with ``_convert_to_original_type``.
    """

    def __init__(self, host="localhost", port=6379, db=0, password=None):
        # Imported here so the redis package is only required when this
        # backend is actually instantiated.
        import redis

        self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password)

    def update_task(
        self,
        task_id: str,
        state: int = const.TASK_STATE_PROCESSING,
        progress: int = 0,
        **kwargs,
    ):
        """Write ``state``, ``progress`` (capped at 100) and any extra fields."""
        progress = min(int(progress), 100)

        fields = {
            "state": state,
            "progress": progress,
            **kwargs,
        }

        for field, value in fields.items():
            self._redis.hset(task_id, field, str(value))

    def get_task(self, task_id: str):
        """Return the task as a dict, or ``None`` when the hash is empty/missing."""
        task_data = self._redis.hgetall(task_id)
        if not task_data:
            return None

        return {
            key.decode("utf-8"): self._convert_to_original_type(value)
            for key, value in task_data.items()
        }

    def delete_task(self, task_id: str):
        """Delete the task's hash."""
        self._redis.delete(task_id)

    @staticmethod
    def _convert_to_original_type(value):
        """
        Convert a stored byte string back to a Python value.

        The text is first tried as a Python literal (numbers, lists, dicts,
        ...), then as a plain digit string, and is otherwise returned as-is.
        Extend this method to handle other data types as needed.
        """
        value_str = value.decode("utf-8")

        try:
            return ast.literal_eval(value_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # literal_eval raises more than ValueError/SyntaxError on odd
            # input (e.g. "{[1]: 2}" raises TypeError); none of these should
            # make get_task fail, so fall through to the plain conversions.
            pass

        if value_str.isdigit():
            # Reached for digit strings literal_eval rejects, e.g. "007".
            # isdigit() also accepts characters such as "²" that int()
            # refuses, hence the guard.
            try:
                return int(value_str)
            except ValueError:
                pass

        # Add more conversions here if needed
        return value_str
def create(audio_file, subtitle_file: str = ""):
    """
    Create an SRT subtitle file for the given audio file.

    On first use the whisper model is loaded from
    ``<root>/app/models/faster-whisper-large-v3`` (CUDA if available, otherwise
    CPU) and cached in the module-level ``model``. The audio is then
    transcribed with word timestamps and split into subtitle entries at
    punctuation.

    Args:
        audio_file: Path of the audio file.
        subtitle_file: Output path (optional). Defaults to ``<audio_file>.srt``.

    Returns:
        None. The subtitle file is written to ``subtitle_file``. ``None`` is
        also returned early when the model files or the faster-whisper package
        are missing.
    """
    global model, device, compute_type
    if not model:
        # NOTE(review): the path below says large-v3 while the download hint
        # in the message points at large-v2 — confirm which one is intended.
        model_path = f"{utils.root_dir()}/app/models/faster-whisper-large-v3"
        model_bin_file = f"{model_path}/model.bin"
        if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
            logger.error(
                "请先下载 whisper 模型\n\n"
                "********************************************\n"
                "下载地址:https://huggingface.co/guillaumekln/faster-whisper-large-v2\n"
                "存放路径:app/models \n"
                "********************************************\n"
            )
            return None

        try:
            # The module-level import is commented out, so without this local
            # import every WhisperModel(...) call below raises NameError.
            from faster_whisper import WhisperModel
        except ImportError as e:
            logger.error(f"faster-whisper is not installed, cannot create subtitles: {e}")
            return None

        # Start from CPU mode; CUDA is only probed inside the guarded helper.
        use_cuda = False
        try:
            # torch is imported lazily inside the helper, not at module level.
            def check_cuda_available():
                try:
                    import torch
                    return torch.cuda.is_available()
                except (ImportError, RuntimeError) as e:
                    logger.warning(f"检查CUDA可用性时出错: {e}")
                    return False

            use_cuda = check_cuda_available()

            if use_cuda:
                logger.info(f"尝试使用 CUDA 加载模型: {model_path}")
                try:
                    model = WhisperModel(
                        model_size_or_path=model_path,
                        device="cuda",
                        compute_type="float16",
                        local_files_only=True
                    )
                    device = "cuda"
                    compute_type = "float16"
                    logger.info("成功使用 CUDA 加载模型")
                except Exception as e:
                    logger.warning(f"CUDA 加载失败,错误信息: {str(e)}")
                    logger.warning("回退到 CPU 模式")
                    use_cuda = False
            else:
                logger.info("使用 CPU 模式")
        except Exception as e:
            logger.warning(f"CUDA检查过程出错: {e}")
            logger.warning("默认使用CPU模式")
            use_cuda = False

        # CUDA unavailable or failed to load: use the CPU.
        if not use_cuda:
            device = "cpu"
            compute_type = "int8"
            logger.info(f"使用 CPU 加载模型: {model_path}")
            model = WhisperModel(
                model_size_or_path=model_path,
                device=device,
                compute_type=compute_type,
                local_files_only=True
            )

        logger.info(f"模型加载完成,使用设备: {device}, 计算类型: {compute_type}")

    logger.info(f"start, output file: {subtitle_file}")
    if not subtitle_file:
        subtitle_file = f"{audio_file}.srt"

    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
        initial_prompt="以下是普通话的句子"
    )

    logger.info(
        f"检测到的语言: '{info.language}', probability: {info.language_probability:.2f}"
    )

    start = timer()
    subtitles = []

    def recognized(seg_text, seg_start, seg_end):
        """Record one non-empty subtitle entry."""
        seg_text = seg_text.strip()
        if not seg_text:
            return

        msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
        logger.debug(msg)

        subtitles.append(
            {"msg": seg_text, "start_time": seg_start, "end_time": seg_end}
        )

    for segment in segments:
        words_idx = 0
        # Guarded: the original called len() before checking that words exist.
        words_len = len(segment.words) if segment.words else 0

        seg_start = 0
        seg_end = 0
        seg_text = ""

        if segment.words:
            is_segmented = False
            for word in segment.words:
                if not is_segmented:
                    seg_start = word.start
                    is_segmented = True

                seg_end = word.end
                # Break the sentence when the word contains punctuation.
                seg_text += word.word

                if utils.str_contains_punctuation(word.word):
                    # remove last char
                    seg_text = seg_text[:-1]
                    if not seg_text:
                        continue

                    recognized(seg_text, seg_start, seg_end)

                    is_segmented = False
                    seg_text = ""

                if words_idx == 0 and segment.start < word.start:
                    seg_start = word.start
                if words_idx == (words_len - 1) and segment.end > word.end:
                    seg_end = word.end
                words_idx += 1

        if not seg_text:
            continue

        # Flush whatever text is left after the last punctuation mark.
        recognized(seg_text, seg_start, seg_end)

    end = timer()

    diff = end - start
    logger.info(f"complete, elapsed: {diff:.2f} s")

    idx = 1
    lines = []
    for subtitle in subtitles:
        text = subtitle.get("msg")
        if text:
            lines.append(
                utils.text_to_srt(
                    idx, text, subtitle.get("start_time"), subtitle.get("end_time")
                )
            )
            idx += 1

    sub = "\n".join(lines) + "\n"
    with open(subtitle_file, "w", encoding="utf-8") as f:
        f.write(sub)
    logger.info(f"subtitle file created: {subtitle_file}")
def levenshtein_distance(s1, s2):
    """Return the edit distance (insert/delete/substitute) between two strings."""
    # Keep the shorter string as the row so each row stays small.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


def similarity(a, b):
    """Return a case-insensitive similarity in ``[0, 1]`` (1 means identical).

    Two empty strings are considered identical; the original divided by zero
    in that case.
    """
    a_lower = a.lower()
    b_lower = b.lower()
    # Measure the lowered strings: lower() can change the length of some
    # Unicode text, and the distance is computed on the lowered form.
    max_length = max(len(a_lower), len(b_lower))
    if max_length == 0:
        return 1.0
    distance = levenshtein_distance(a_lower, b_lower)
    return 1 - (distance / max_length)


def correct(subtitle_file, video_script):
    """
    Align the subtitle file's text with the script text, rewriting it if needed.

    The script is split into lines by punctuation. Each script line is matched
    against the next subtitle entry; when they differ, following subtitle
    entries are merged in for as long as that raises the similarity to the
    script line, and the script line then replaces the merged text while
    keeping the merged time span. Script lines left over after the subtitles
    run out get a zero time span. The file is rewritten only if something
    changed.

    Args:
        subtitle_file: Path of the SRT file to check (rewritten in place).
        video_script: Full script text.
    """
    subtitle_items = file_to_subtitles(subtitle_file)
    script_lines = utils.split_string_by_punctuations(video_script)

    def split_times(time_line):
        """Split 'start --> end' into (start, end)."""
        begin, _, finish = time_line.partition(" --> ")
        return begin, finish

    corrected = False
    new_subtitle_items = []
    script_index = 0
    subtitle_index = 0

    while script_index < len(script_lines) and subtitle_index < len(subtitle_items):
        script_line = script_lines[script_index].strip()
        subtitle_line = subtitle_items[subtitle_index][2].strip()

        if script_line == subtitle_line:
            new_subtitle_items.append(subtitle_items[subtitle_index])
            script_index += 1
            subtitle_index += 1
            continue

        combined_subtitle = subtitle_line
        start_time, end_time = split_times(subtitle_items[subtitle_index][1])
        next_subtitle_index = subtitle_index + 1

        # Greedily absorb following entries while that improves the match.
        while next_subtitle_index < len(subtitle_items):
            next_subtitle = subtitle_items[next_subtitle_index][2].strip()
            candidate = combined_subtitle + " " + next_subtitle
            if similarity(script_line, candidate) > similarity(script_line, combined_subtitle):
                combined_subtitle = candidate
                end_time = split_times(subtitle_items[next_subtitle_index][1])[1]
                next_subtitle_index += 1
            else:
                break

        # Both outcomes replace the text with the script line; only the log
        # message differs.
        if similarity(script_line, combined_subtitle) > 0.8:
            logger.warning(
                f"Merged/Corrected - Script: {script_line}, Subtitle: {combined_subtitle}"
            )
        else:
            logger.warning(
                f"Mismatch - Script: {script_line}, Subtitle: {combined_subtitle}"
            )
        new_subtitle_items.append(
            (
                len(new_subtitle_items) + 1,
                f"{start_time} --> {end_time}",
                script_line,
            )
        )
        corrected = True

        script_index += 1
        subtitle_index = next_subtitle_index

    # Remaining script lines. The loop above only stops with script lines
    # left when the subtitle entries are exhausted, so no timing is available.
    while script_index < len(script_lines):
        logger.warning(f"Extra script line: {script_lines[script_index]}")
        new_subtitle_items.append(
            (
                len(new_subtitle_items) + 1,
                "00:00:00,000 --> 00:00:00,000",
                script_lines[script_index],
            )
        )
        script_index += 1
        corrected = True

    if corrected:
        with open(subtitle_file, "w", encoding="utf-8") as fd:
            for i, item in enumerate(new_subtitle_items):
                fd.write(f"{i + 1}\n{item[1]}\n{item[2]}\n\n")
        logger.info("Subtitle corrected")
    else:
        logger.success("Subtitle is correct")
def extract_audio_and_create_subtitle(video_file: str, subtitle_file: str = "") -> Optional[str]:
    """
    Extract the audio track of a video and generate a subtitle file from it.

    The audio is written as ``<video name>_audio.wav`` next to the video,
    passed to ``create`` and removed again afterwards.

    Args:
        video_file: Path of the video file.
        subtitle_file: Output subtitle path (optional). Defaults to
            ``<video name>.srt`` next to the video.

    Returns:
        The subtitle file path, or ``None`` if an error occurred.
    """
    audio_file = ""
    try:
        video_dir = os.path.dirname(video_file)
        video_name = os.path.splitext(os.path.basename(video_file))[0]

        # Temporary audio file next to the video.
        audio_file = os.path.join(video_dir, f"{video_name}_audio.wav")

        if not subtitle_file:
            subtitle_file = os.path.join(video_dir, f"{video_name}.srt")

        logger.info(f"开始从视频提取音频: {video_file}")

        video = VideoFileClip(video_file)
        try:
            logger.info(f"正在提取音频到: {audio_file}")
            video.audio.write_audiofile(audio_file, codec='pcm_s16le')
        finally:
            # Release the clip even when the extraction fails.
            video.close()

        logger.info("音频提取完成,开始生成字幕")

        # Transcribe the audio that was just extracted. The original passed a
        # hard-coded developer path here instead of audio_file.
        create(audio_file, subtitle_file)

        return subtitle_file

    except Exception as e:
        logger.error(f"处理视频文件时出错: {str(e)}")
        logger.error(traceback.format_exc())
        return None

    finally:
        # Remove the temporary audio on success and on failure alike.
        if audio_file and os.path.exists(audio_file):
            try:
                os.remove(audio_file)
                logger.info("已清理临时音频文件")
            except OSError as e:
                logger.warning(f"Failed to remove temporary audio file {audio_file}: {e}")
def parse_time(time_str):
    """Parse an SRT time string ``HH:MM:SS,mmm`` into a ``timedelta``.

    Raises:
        ValueError: If the string does not have the ``HH:MM:SS,mmm`` shape.
    """
    hours, minutes, seconds_ms = time_str.split(':')
    seconds, milliseconds = seconds_ms.split(',')

    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(milliseconds)
    )


def format_time(td):
    """Format a ``timedelta`` as an SRT time string ``HH:MM:SS,mmm``."""
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000

    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def _parse_clock(text):
    """Parse ``HH:MM:SS`` with optional ``,mmm``/``.mmm``; return None if malformed."""
    main, separator, fraction = text.strip().replace('.', ',').partition(',')
    fields = main.split(':')
    if len(fields) != 3:
        return None
    try:
        hours, minutes, seconds = (int(field) for field in fields)
        milliseconds = int(fraction) if separator else 0
    except ValueError:
        return None
    return timedelta(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)


def _clock_for_filename(td):
    """Format a ``timedelta`` as ``HH_MM_SS`` for use in a file name."""
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}_{minutes:02d}_{seconds:02d}"


def parse_edited_time_range(time_range_str):
    """Extract ``(start, end)`` timedeltas from an ``editedTimeRange`` string.

    The expected form is ``HH:MM:SS-HH:MM:SS``; an optional millisecond part
    (``,mmm`` or ``.mmm``) is accepted on either side.

    Returns:
        ``(start, end)``, or ``(None, None)`` when the string is empty or
        malformed. Malformed input used to raise ``ValueError``, which made
        the sort in ``merge_subtitle_files`` fail for the whole batch.
    """
    if not time_range_str:
        return None, None

    parts = time_range_str.split('-')
    if len(parts) != 2:
        return None, None

    start_time = _parse_clock(parts[0])
    end_time = _parse_clock(parts[1])
    if start_time is None or end_time is None:
        return None, None

    return start_time, end_time


def merge_subtitle_files(subtitle_items, output_file=None):
    """
    Merge several SRT subtitle files into one.

    Items are ordered by the start of their ``editedTimeRange``; every
    subtitle entry of an item is shifted by that start time and renumbered.

    Args:
        subtitle_items: List of dicts, each with a ``subtitle`` file path and
            an ``editedTimeRange`` string.
        output_file: Output path. If None, a name is generated in the
            directory of the first existing subtitle file.

    Returns:
        Path of the merged subtitle file, or None if there is no valid
        subtitle content or the file cannot be written.
    """
    # Items without a usable range sort as if they started at zero.
    sorted_items = sorted(
        subtitle_items,
        key=lambda x: parse_edited_time_range(x.get('editedTimeRange', ''))[0] or timedelta()
    )

    merged_subtitles = []
    merged_ranges = []  # (start, end) of every item that was actually read
    subtitle_index = 1
    valid_items_count = 0

    for item in sorted_items:
        if not item.get('subtitle') or not os.path.exists(item.get('subtitle')):
            print(f"跳过项目 {item.get('_id')}:字幕文件不存在或路径为空")
            continue

        # The start of editedTimeRange is the offset applied to this item.
        offset_time, range_end = parse_edited_time_range(item.get('editedTimeRange', ''))

        if offset_time is None:
            print(f"警告: 无法从项目 {item.get('_id')} 的editedTimeRange中提取时间范围,跳过该项")
            continue

        try:
            with open(item['subtitle'], 'r', encoding='utf-8') as file:
                content = file.read().strip()

            if not content:
                print(f"跳过项目 {item.get('_id')}:字幕文件内容为空")
                continue

            valid_items_count += 1
            merged_ranges.append((offset_time, range_end))

            # Entries are separated by blank lines.
            subtitle_blocks = re.split(r'\n\s*\n', content)

            for block in subtitle_blocks:
                lines = block.strip().split('\n')
                if len(lines) < 3:  # need index, time line and text
                    continue

                time_parts = lines[1].split(' --> ')
                if len(time_parts) != 2:
                    continue

                start_time = parse_time(time_parts[0])
                end_time = parse_time(time_parts[1])

                adjusted_start_time = start_time + offset_time
                adjusted_end_time = end_time + offset_time

                adjusted_time_line = f"{format_time(adjusted_start_time)} --> {format_time(adjusted_end_time)}"
                new_block = [
                    str(subtitle_index),
                    adjusted_time_line,
                    *lines[2:]
                ]

                merged_subtitles.append('\n'.join(new_block))
                subtitle_index += 1
        except Exception as e:
            print(f"处理项目 {item.get('_id')} 的字幕文件时出错: {str(e)}")
            continue

    if not merged_subtitles:
        print(f"警告: 没有找到有效的字幕内容,共检查了 {len(subtitle_items)} 个项目,其中 {valid_items_count} 个有有效文件")
        return None

    if output_file is None:
        # The first existing subtitle file decides the output directory.
        valid_item = next(
            (
                item for item in sorted_items
                if item.get('subtitle') and os.path.exists(item.get('subtitle'))
            ),
            None,
        )

        if not valid_item:
            print("错误: 无法确定输出目录,没有找到有效的字幕文件")
            return None

        dir_path = os.path.dirname(valid_item['subtitle'])

        # Name the file after the span of the items that were merged. A start
        # of 00:00:00 is a zero timedelta, which is falsy, so the check must
        # be against None.
        first_start = merged_ranges[0][0] if merged_ranges else None
        last_end = merged_ranges[-1][1] if merged_ranges else None

        if first_start is not None and last_end is not None:
            first_start_str = _clock_for_filename(first_start)
            last_end_str = _clock_for_filename(last_end)
            output_file = os.path.join(dir_path, f"merged_subtitle_{first_start_str}-{last_end_str}.srt")
        else:
            output_file = os.path.join(dir_path, f"merged_subtitle.srt")

    merged_content = '\n\n'.join(merged_subtitles)

    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(merged_content)
        print(f"字幕文件合并成功: {output_file},包含 {len(merged_subtitles)} 个字幕条目")
        return output_file
    except Exception as e:
        print(f"写入字幕文件失败: {str(e)}")
        return None
'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_01_15-00_04_40.srt', 'sourceTimeRange': '00:01:15-00:01:29', 'duration': 14, 'editedTimeRange': '00:00:26-00:00:40' }, {'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。', 'timestamp': '00:04:58-00:05:45', 'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!', 'OST': 0, '_id': 4, 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', 'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_04_58-00_05_45.srt', 'sourceTimeRange': '00:04:58-00:05:20', 'duration': 22, 'editedTimeRange': '00:00:57-00:01:19' }, {'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', 'timestamp': '00:05:45-00:06:00', 'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', 'OST': 0, '_id': 5, 'audio': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3', 'subtitle': '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_05_45-00_06_00.srt', 'sourceTimeRange': '00:05:45-00:05:53', 'duration': 8, 'editedTimeRange': '00:01:19-00:01:27' } ] output_file = merge_subtitle_files(test_data) print(f"字幕文件已合并至: {output_file}") ================================================ FILE: app/services/subtitle_text.py ================================================ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Subtitle text utilities. This module provides a shared, cross-platform way to read and normalize subtitle content. Both Short Drama Editing (混剪) and Short Drama Narration (解说) should consume subtitle content through this module to avoid platform-specific parsing issues (e.g. Windows UTF-16 SRT, timestamp separators, etc.). 
# "HH:MM:SS[,mmm] --> HH:MM:SS[,mmm]" ('.' is also accepted before the milliseconds).
_SRT_TIME_RE = re.compile(
    r"\b\d{2}:\d{2}:\d{2}(?:[,.]\d{3})?\s*-->\s*\d{2}:\d{2}:\d{2}(?:[,.]\d{3})?\b"
)
# "HH:MM:SS.mmm" — used to rewrite the '.' millisecond separator to ','.
_SRT_MS_DOT_RE = re.compile(r"(\b\d{2}:\d{2}:\d{2})\.(\d{3}\b)")


@dataclass(frozen=True)
class DecodedSubtitle:
    # Normalized subtitle text.
    text: str
    # Name of the encoding the text was decoded with.
    encoding: str


def has_timecodes(text: str) -> bool:
    """Return True if the subtitle text contains at least one SRT timecode."""
    if not text:
        return False
    return _SRT_TIME_RE.search(text) is not None


def normalize_subtitle_text(text: str) -> str:
    """
    Normalize subtitle text to improve cross-platform reliability.

    - Unifies line endings to LF
    - Removes leading BOM characters and all NUL characters
    - Normalizes millisecond separators from '.' to ',' in timecodes
    - Strips leading/trailing whitespace

    ``None`` yields an empty string.
    """
    if text is None:
        return ""

    normalized = str(text).lstrip("\ufeff")

    # NUL characters are common when UTF-16 was mis-decoded elsewhere.
    normalized = normalized.replace("\x00", "")

    normalized = normalized.replace("\r\n", "\n").replace("\r", "\n")

    # 00:00:01.000 -> 00:00:01,000
    normalized = _SRT_MS_DOT_RE.sub(r"\1,\2", normalized)

    return normalized.strip()


def decode_subtitle_bytes(
    data: bytes,
    *,
    encodings: Optional[Iterable[str]] = None,
) -> DecodedSubtitle:
    """
    Decode subtitle bytes using a small set of common encodings.

    Encodings are tried in order. The first decoding whose normalized text
    contains an SRT timecode is returned; if none does, the first decoding
    that succeeded is returned; if none succeeded, the bytes are decoded as
    UTF-8 with undecodable bytes replaced.

    Args:
        data: Raw file content. ``None`` yields empty text.
        encodings: Encodings to try instead of the built-in list. Names that
            Python does not know are skipped.

    Returns:
        The normalized text and the encoding it was decoded with.
    """
    if data is None:
        return DecodedSubtitle(text="", encoding="utf-8")

    candidates = list(encodings) if encodings else [
        "utf-8",
        "utf-8-sig",
        "utf-16",
        "utf-16-le",
        "utf-16-be",
        "gbk",
        "gb2312",
    ]

    first_success: Optional[DecodedSubtitle] = None
    for encoding in candidates:
        try:
            decoded_text = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: unknown codec name supplied by the caller. Treat
            # it like a failed decode instead of aborting the whole attempt.
            continue

        decoded = DecodedSubtitle(
            text=normalize_subtitle_text(decoded_text), encoding=encoding
        )

        # Fast path: keep the first decode that shows timecodes.
        if has_timecodes(decoded.text):
            return decoded

        if first_success is None:
            first_success = decoded

    if first_success is not None:
        # Fall back to the first successful decoding.
        return first_success

    # Last resort: replace undecodable bytes.
    return DecodedSubtitle(
        text=normalize_subtitle_text(data.decode("utf-8", errors="replace")),
        encoding="utf-8",
    )


def read_subtitle_text(file_path: str) -> DecodedSubtitle:
    """Read a subtitle file from disk, decode and normalize its text.

    An empty or blank path yields empty text with encoding ``utf-8``.
    """
    if not file_path or not str(file_path).strip():
        return DecodedSubtitle(text="", encoding="utf-8")

    normalized_path = os.path.abspath(str(file_path))
    with open(normalized_path, "rb") as f:
        data = f.read()

    return decode_subtitle_bytes(data)
严格按照脚本timestamp精确裁剪,保持原声 - OST=2: 根据TTS音频时长动态裁剪,保持原声 Args: task_id: 任务ID params: 视频参数 subclip_path_videos: 视频片段路径(可选,仅作为备用方案) """ global merged_audio_path, merged_subtitle_path logger.info(f"\n\n## 开始任务: {task_id}") sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0) """ 1. 加载剪辑脚本 """ logger.info("\n\n## 1. 加载视频脚本") video_script_path = path.join(params.video_clip_json_path) if path.exists(video_script_path): try: with open(video_script_path, "r", encoding="utf-8") as f: list_script = json.load(f) video_list = [i['narration'] for i in list_script] video_ost = [i['OST'] for i in list_script] time_list = [i['timestamp'] for i in list_script] video_script = " ".join(video_list) logger.debug(f"解说完整脚本: \n{video_script}") logger.debug(f"解说 OST 列表: \n{video_ost}") logger.debug(f"解说时间戳列表: \n{time_list}") except Exception as e: logger.error(f"无法读取视频json脚本,请检查脚本格式是否正确") raise ValueError("无法读取视频json脚本,请检查脚本格式是否正确") else: logger.error(f"解说脚本文件不存在: {video_script_path},请先点击【保存脚本】按钮保存脚本后再生成视频") raise ValueError("解说脚本文件不存在!请先点击【保存脚本】按钮保存脚本后再生成视频。") """ 2. 使用 TTS 生成音频素材 """ logger.info("\n\n## 2. 根据OST设置生成音频列表") # 只为OST=0 or 2的判断生成音频, OST=0 仅保留解说 OST=2 保留解说和原声 tts_segments = [ segment for segment in list_script if segment['OST'] in [0, 2] ] logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}") tts_results = voice.tts_multiple( task_id=task_id, list_script=tts_segments, # 只传入需要TTS的片段 tts_engine=params.tts_engine, voice_name=params.voice_name, voice_rate=params.voice_rate, voice_pitch=params.voice_pitch, ) sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20) # """ # 3. 
(可选) 使用 whisper 生成字幕 # """ # if merged_subtitle_path is None: # if audio_files: # merged_subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt") # subtitle_provider = config.app.get("subtitle_provider", "").strip().lower() # logger.info(f"\n\n使用 {subtitle_provider} 生成字幕") # # subtitle.create( # audio_file=merged_audio_path, # subtitle_file=merged_subtitle_path, # ) # subtitle_lines = subtitle.file_to_subtitles(merged_subtitle_path) # if not subtitle_lines: # logger.warning(f"字幕文件无效: {merged_subtitle_path}") # # sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40) """ 3. 统一视频裁剪 - 基于OST类型的差异化裁剪策略 """ logger.info("\n\n## 3. 统一视频裁剪(基于OST类型)") # 使用新的统一裁剪策略 video_clip_result = clip_video.clip_video_unified( video_origin_path=params.video_origin_path, script_list=list_script, tts_results=tts_results ) # 更新 list_script 中的时间戳和路径信息 tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results} subclip_clip_result = { tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results } new_script_list = update_script.update_script_timestamps(list_script, video_clip_result, tts_clip_result, subclip_clip_result) logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段") sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60) """ 4. 合并音频和字幕 """ logger.info("\n\n## 4. 
合并音频和字幕") total_duration = sum([script["duration"] for script in new_script_list]) if tts_segments: try: # 合并音频文件 merged_audio_path = audio_merger.merge_audio_files( task_id=task_id, total_duration=total_duration, list_script=new_script_list ) logger.info(f"音频文件合并成功->{merged_audio_path}") # 合并字幕文件 merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list) if merged_subtitle_path: logger.info(f"字幕文件合并成功->{merged_subtitle_path}") else: logger.warning("没有有效的字幕内容,将生成无字幕视频") merged_subtitle_path = "" except Exception as e: logger.error(f"合并音频/字幕文件失败: {str(e)}") # 确保即使合并失败也有默认值 if 'merged_audio_path' not in locals(): merged_audio_path = "" if 'merged_subtitle_path' not in locals(): merged_subtitle_path = "" else: logger.warning("没有需要合并的音频/字幕") merged_audio_path = "" merged_subtitle_path = "" """ 5. 合并视频 """ final_video_paths = [] combined_video_paths = [] combined_video_path = path.join(utils.task_dir(task_id), f"merger.mp4") logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}") # 使用统一裁剪后的视频片段 video_clips = [] for new_script in new_script_list: video_path = new_script.get('video') if video_path and os.path.exists(video_path): video_clips.append(video_path) else: logger.warning(f"片段 {new_script.get('_id')} 的视频文件不存在或未生成: {video_path}") # 如果统一裁剪失败,尝试使用备用方案(如果提供了subclip_path_videos) if subclip_path_videos and new_script.get('_id') in subclip_path_videos: backup_video = subclip_path_videos[new_script.get('_id')] if os.path.exists(backup_video): video_clips.append(backup_video) logger.info(f"使用备用视频: {backup_video}") else: logger.error(f"备用视频也不存在: {backup_video}") else: logger.error(f"无法找到片段 {new_script.get('_id')} 的视频文件") logger.info(f"准备合并 {len(video_clips)} 个视频片段") merger_video.combine_clip_videos( output_video_path=combined_video_path, video_paths=video_clips, video_ost_list=video_ost, video_aspect=params.video_aspect, threads=params.n_threads ) sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80) """ 6. 
合并字幕/BGM/配音/视频 """ output_video_path = path.join(utils.task_dir(task_id), f"combined.mp4") logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}") # bgm_path = '/Users/apple/Desktop/home/NarratoAI/resource/songs/bgm.mp3' bgm_path = utils.get_bgm_file() # 获取优化的音量配置 optimized_volumes = get_recommended_volumes_for_content('mixed') # 检查是否有OST=1的原声片段,如果有,则保持原声音量为1.0不变 has_original_audio_segments = any(segment['OST'] == 1 for segment in list_script) # 应用用户设置和优化建议的组合 # 如果用户设置了非默认值,优先使用用户设置 final_tts_volume = params.tts_volume if hasattr(params, 'tts_volume') and params.tts_volume != 1.0 else optimized_volumes['tts_volume'] # 关键修复:如果有原声片段,保持原声音量为1.0,确保与原视频音量一致 if has_original_audio_segments: final_original_volume = 1.0 # 保持原声音量不变 logger.info("检测到原声片段,原声音量设置为1.0以保持与原视频一致") else: final_original_volume = params.original_volume if hasattr(params, 'original_volume') and params.original_volume != 0.7 else optimized_volumes['original_volume'] final_bgm_volume = params.bgm_volume if hasattr(params, 'bgm_volume') and params.bgm_volume != 0.3 else optimized_volumes['bgm_volume'] logger.info(f"音量配置 - TTS: {final_tts_volume}, 原声: {final_original_volume}, BGM: {final_bgm_volume}") # 调用示例 options = { 'voice_volume': final_tts_volume, # 配音音量(优化后) 'bgm_volume': final_bgm_volume, # 背景音乐音量(优化后) 'original_audio_volume': final_original_volume, # 视频原声音量(优化后) 'keep_original_audio': True, # 是否保留原声 'subtitle_enabled': params.subtitle_enabled, # 是否启用字幕 - 修复字幕开关bug 'subtitle_font': params.font_name, # 这里使用相对字体路径,会自动在 font_dir() 目录下查找 'subtitle_font_size': params.font_size, 'subtitle_color': params.text_fore_color, 'subtitle_bg_color': None, # 直接使用None表示透明背景 'subtitle_position': params.subtitle_position, 'custom_position': params.custom_position, 'threads': params.n_threads } generate_video.merge_materials( video_path=combined_video_path, audio_path=merged_audio_path, subtitle_path=merged_subtitle_path, bgm_path=bgm_path, output_path=output_video_path, options=options ) 
def start_subclip_unified(task_id: str, params: VideoClipParams):
    """
    Run the unified clipping pipeline for one task and produce the final video.

    Steps (progress is reported through ``sm.state.update_task``):
      1. Load the JSON editing script from ``params.video_clip_json_path``.
      2. Generate TTS audio for the segments whose ``OST`` is 0 or 2.
      3. Cut the source video per segment with ``clip_video.clip_video_unified``.
      4. Merge the per-segment audio and subtitle files.
      5. Concatenate the clips into ``merger.mp4``.
      6. Mix subtitles, BGM, narration and video into ``combined.mp4``.

    Args:
        task_id: Task identifier; also selects the task output directory.
        params: Video parameters (script path, source video, TTS, subtitle
            and volume settings).

    Returns:
        dict: ``{"videos": [...], "combined_videos": [...]}`` with the final
        video path and the intermediate concatenated video path.

    Raises:
        ValueError: If the script file does not exist or cannot be read.
    """
    global merged_audio_path, merged_subtitle_path

    logger.info(f"\n\n## 开始统一视频处理任务: {task_id}")
    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=0)

    # 1. Load the editing script.
    logger.info("\n\n## 1. 加载视频脚本")
    video_script_path = path.join(params.video_clip_json_path)

    if not path.exists(video_script_path):
        logger.error(f"解说脚本文件不存在: {video_script_path},请先点击【保存脚本】按钮保存脚本后再生成视频")
        raise ValueError("解说脚本文件不存在!请先点击【保存脚本】按钮保存脚本后再生成视频。")

    try:
        with open(video_script_path, "r", encoding="utf-8") as f:
            list_script = json.load(f)
        video_list = [i['narration'] for i in list_script]
        video_ost = [i['OST'] for i in list_script]
        time_list = [i['timestamp'] for i in list_script]

        video_script = " ".join(video_list)
        logger.debug(f"解说完整脚本: \n{video_script}")
        logger.debug(f"解说 OST 列表: \n{video_ost}")
        logger.debug(f"解说时间戳列表: \n{time_list}")
    except Exception as e:
        logger.error("无法读取视频json脚本,请检查脚本格式是否正确")
        # Chain the cause so the real parsing/IO error stays in the traceback.
        raise ValueError("无法读取视频json脚本,请检查脚本格式是否正确") from e

    # 2. Generate TTS audio.
    logger.info("\n\n## 2. 根据OST设置生成音频列表")
    # Only OST 0 (narration only) and OST 2 (narration plus original audio)
    # segments need synthesized speech.
    tts_segments = [
        segment for segment in list_script
        if segment['OST'] in [0, 2]
    ]
    logger.debug(f"需要生成TTS的片段数: {len(tts_segments)}")

    tts_results = voice.tts_multiple(
        task_id=task_id,
        list_script=tts_segments,  # only the segments that need TTS
        tts_engine=params.tts_engine,
        voice_name=params.voice_name,
        voice_rate=params.voice_rate,
        voice_pitch=params.voice_pitch,
    )

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)

    # 3. Unified clipping, differentiated by OST type.
    logger.info("\n\n## 3. 统一视频裁剪(基于OST类型)")
    video_clip_result = clip_video.clip_video_unified(
        video_origin_path=params.video_origin_path,
        script_list=list_script,
        tts_results=tts_results
    )

    # Attach clip / audio / subtitle paths and durations to each segment.
    tts_clip_result = {tts_result['_id']: tts_result['audio_file'] for tts_result in tts_results}
    subclip_clip_result = {
        tts_result['_id']: tts_result['subtitle_file'] for tts_result in tts_results
    }
    new_script_list = update_script.update_script_timestamps(
        list_script, video_clip_result, tts_clip_result, subclip_clip_result
    )

    logger.info(f"统一裁剪完成,处理了 {len(video_clip_result)} 个视频片段")

    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=60)

    # 4. Merge audio and subtitles.
    logger.info("\n\n## 4. 合并音频和字幕")
    # A segment without a usable timestamp may carry no duration; count it as 0.
    total_duration = sum(script.get("duration", 0.0) for script in new_script_list)

    # Reset both paths up front. They are module globals, so the previous
    # `'name' not in locals()` fallback was always true and threw away an
    # audio track that had merged successfully when only the subtitle merge
    # failed. With the reset, whatever was produced before a failure is kept.
    merged_audio_path = ""
    merged_subtitle_path = ""
    if tts_segments:
        try:
            merged_audio_path = audio_merger.merge_audio_files(
                task_id=task_id,
                total_duration=total_duration,
                list_script=new_script_list
            )
            logger.info(f"音频文件合并成功->{merged_audio_path}")

            merged_subtitle_path = subtitle_merger.merge_subtitle_files(new_script_list)
            if merged_subtitle_path:
                logger.info(f"字幕文件合并成功->{merged_subtitle_path}")
            else:
                logger.warning("没有有效的字幕内容,将生成无字幕视频")
                merged_subtitle_path = ""
        except Exception as e:
            # Best effort: continue with whatever was merged before the failure.
            logger.error(f"合并音频/字幕文件失败: {str(e)}")
    else:
        logger.warning("没有需要合并的音频/字幕")

    # 5. Concatenate the clips.
    final_video_paths = []
    combined_video_paths = []

    combined_video_path = path.join(utils.task_dir(task_id), "merger.mp4")
    logger.info(f"\n\n## 5. 合并视频: => {combined_video_path}")

    # Keep the OST flags aligned with the clips that actually exist, so a
    # missing clip does not shift the flags of every clip after it.
    video_clips = []
    clip_ost = []
    for new_script in new_script_list:
        video_path = new_script.get('video')
        if video_path and os.path.exists(video_path):
            video_clips.append(video_path)
            clip_ost.append(new_script['OST'])
        else:
            logger.error(f"片段 {new_script.get('_id')} 的视频文件不存在: {video_path}")

    logger.info(f"准备合并 {len(video_clips)} 个视频片段")

    merger_video.combine_clip_videos(
        output_video_path=combined_video_path,
        video_paths=video_clips,
        video_ost_list=clip_ost,
        video_aspect=params.video_aspect,
        threads=params.n_threads
    )
    sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=80)

    # 6. Mix subtitles, BGM, narration and video.
    output_video_path = path.join(utils.task_dir(task_id), "combined.mp4")
    logger.info(f"\n\n## 6. 最后一步: 合并字幕/BGM/配音/视频 -> {output_video_path}")

    bgm_path = utils.get_bgm_file()

    # Recommended volumes, used wherever the user kept the comparison value.
    optimized_volumes = get_recommended_volumes_for_content('mixed')

    # Any OST=1 segment pins the original-audio volume to 1.0.
    has_original_audio_segments = any(segment['OST'] == 1 for segment in list_script)

    # A user value equal to the constant it is compared against (1.0 / 0.7 /
    # 0.3) is treated as "not customised" and replaced by the recommendation.
    final_tts_volume = params.tts_volume if hasattr(params, 'tts_volume') and params.tts_volume != 1.0 else optimized_volumes['tts_volume']

    if has_original_audio_segments:
        final_original_volume = 1.0
        logger.info("检测到原声片段,原声音量设置为1.0以保持与原视频一致")
    else:
        final_original_volume = params.original_volume if hasattr(params, 'original_volume') and params.original_volume != 0.7 else optimized_volumes['original_volume']

    final_bgm_volume = params.bgm_volume if hasattr(params, 'bgm_volume') and params.bgm_volume != 0.3 else optimized_volumes['bgm_volume']

    logger.info(f"音量配置 - TTS: {final_tts_volume}, 原声: {final_original_volume}, BGM: {final_bgm_volume}")

    options = {
        'voice_volume': final_tts_volume,
        'bgm_volume': final_bgm_volume,
        'original_audio_volume': final_original_volume,
        'keep_original_audio': True,
        'subtitle_enabled': params.subtitle_enabled,
        'subtitle_font': params.font_name,
        'subtitle_font_size': params.font_size,
        'subtitle_color': params.text_fore_color,
        'subtitle_bg_color': None,
        'subtitle_position': params.subtitle_position,
        'custom_position': params.custom_position,
        'threads': params.n_threads
    }
    generate_video.merge_materials(
        video_path=combined_video_path,
        audio_path=merged_audio_path,
        subtitle_path=merged_subtitle_path,
        bgm_path=bgm_path,
        output_path=output_video_path,
        options=options
    )

    final_video_paths.append(output_video_path)
    combined_video_paths.append(combined_video_path)

    logger.success(f"统一处理任务 {task_id} 已完成, 生成 {len(final_video_paths)} 个视频.")

    kwargs = {
        "videos": final_video_paths,
        "combined_videos": combined_video_paths
    }
    sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
    return kwargs
def validate_params(video_path, audio_path, output_file, params):
    """
    Validate the inputs of a merge job and make sure the output directory exists.

    All checks run before anything is created on disk, so an invalid call
    leaves no directory behind.

    Args:
        video_path: Path of the video file; must be non-empty and exist.
        audio_path: Path of the audio file; may be an empty string, but when
            given the file must exist.
        output_file: Path of the output file; must be non-empty.
        params: Video parameters; must be truthy.

    Raises:
        FileNotFoundError: If the video file, or a provided audio file, does
            not exist.
        ValueError: If ``video_path``, ``output_file`` or ``params`` is empty.
    """
    if not video_path:
        raise ValueError("视频路径不能为空")
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"视频文件不存在: {video_path}")

    # The audio track is optional; only check it when a path was supplied.
    if audio_path and not os.path.exists(audio_path):
        raise FileNotFoundError(f"音频文件不存在: {audio_path}")

    if not output_file:
        raise ValueError("输出文件路径不能为空")

    if not params:
        raise ValueError("视频参数不能为空")

    # A bare file name has an empty dirname, and os.makedirs("") raises, so
    # only create a directory when there is one to create. exist_ok avoids
    # the race between checking for the directory and creating it.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)


if __name__ == "__main__":
    task_id = "demo"

    # Pre-cut clips make it easier to inspect the video.
    subclip_path_videos = {
        1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-05-390@00-00-57-980.mp4',
        2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-00-28-900@00-00-43-700.mp4',
        3: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-01-17-840@00-01-27-600.mp4',
        4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-02-35-460@00-02-52-380.mp4',
        5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/113343d127b5a09d0bf84b68bd1b3b97/vid_00-06-59-520@00-07-29-500.mp4',
    }

    params = VideoClipParams(
        video_clip_json_path="/Users/apple/Desktop/home/NarratoAI/resource/scripts/2025-0507-223311.json",
        video_origin_path="/Users/apple/Desktop/home/NarratoAI/resource/videos/merged_video_4938.mp4",
    )
    start_subclip(task_id, params, subclip_path_videos)
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union


def extract_timestamp_from_video_path(video_path: str) -> str:
    """
    Extract the clip time range encoded in a clip file name.

    Two file-name layouts are recognised:
      * ``vid_HH-MM-SS-mmm@HH-MM-SS-mmm.mp4`` -> ``'HH:MM:SS,mmm-HH:MM:SS,mmm'``
      * ``vid-HH-MM-SS-HH-MM-SS.mp4``         -> ``'HH:MM:SS-HH:MM:SS'``

    Args:
        video_path: Path of the clip file; only the base name is inspected.

    Returns:
        The formatted time range, or an empty string when the file name
        matches neither layout.
    """
    filename = os.path.basename(video_path)

    # Newer layout with millisecond precision.
    match_new = re.search(
        r'vid_(\d{2})-(\d{2})-(\d{2})-(\d{3})@(\d{2})-(\d{2})-(\d{2})-(\d{3})\.mp4',
        filename,
    )
    if match_new:
        start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms = match_new.groups()
        return f"{start_h}:{start_m}:{start_s},{start_ms}-{end_h}:{end_m}:{end_s},{end_ms}"

    # Older layout with whole seconds only.
    match_old = re.search(r'vid-(\d{2}-\d{2}-\d{2})-(\d{2}-\d{2}-\d{2})\.mp4', filename)
    if match_old:
        start_time = match_old.group(1).replace('-', ':')
        end_time = match_old.group(2).replace('-', ':')
        return f"{start_time}-{end_time}"

    return ""


def _timestamp_to_seconds(point: str) -> float:
    """Convert one 'HH:MM:SS' or 'HH:MM:SS,sss' time point to seconds."""
    if ',' in point:
        parts = point.split(',')
        clock = parts[0]
        fraction = float('0.' + parts[1])
    else:
        clock = point
        fraction = 0
    hours, minutes, seconds = map(int, clock.split(':'))
    return hours * 3600 + minutes * 60 + seconds + fraction


def calculate_duration(timestamp: str) -> float:
    """
    Compute the length in seconds of a time range.

    Args:
        timestamp: Range formatted as ``'HH:MM:SS-HH:MM:SS'`` or
            ``'HH:MM:SS,sss-HH:MM:SS,sss'``.

    Returns:
        End minus start, rounded to two decimals; ``0.0`` when the value
        cannot be parsed.
    """
    try:
        start_time, end_time = timestamp.split('-')
        return round(_timestamp_to_seconds(end_time) - _timestamp_to_seconds(start_time), 2)
    except (ValueError, AttributeError):
        return 0.0


def _lookup_by_id_or_timestamp(mapping, item_id, timestamp) -> Any:
    """Return mapping[item_id], else mapping[timestamp], else an empty string."""
    if not mapping:
        return ""
    # Compare against None rather than truthiness so an `_id` of 0 is usable.
    if item_id is not None and item_id in mapping:
        return mapping[item_id]
    if timestamp in mapping:
        return mapping[timestamp]
    return ""


def update_script_timestamps(
    script_list: List[Dict[str, Any]],
    video_result: Dict[Union[str, int], str],
    audio_result: Optional[Dict[Union[str, int], str]] = None,
    subtitle_result: Optional[Dict[Union[str, int], str]] = None,
    calculate_edited_timerange: bool = True
) -> List[Dict[str, Any]]:
    """
    Enrich script items with clip, audio and subtitle paths plus timing data.

    Each returned item is a shallow copy of the input item with these keys
    set: ``audio``, ``subtitle``, ``video`` (empty string when no result
    matches), ``sourceTimeRange`` (the range parsed from the clip file name,
    falling back to the item's own ``timestamp``) and ``duration`` (seconds,
    ``0.0`` when the range cannot be parsed). Items with a positive duration
    also get ``editedTimeRange``, their ``HH:MM:SS-HH:MM:SS`` position in the
    concatenated output.

    Results are matched by the item's ``_id`` first and by its original
    ``timestamp`` second.

    Args:
        script_list: Original script items; not modified.
        video_result: Clip paths keyed by ``_id`` or original timestamp.
        audio_result: Audio paths keyed the same way (optional).
        subtitle_result: Subtitle paths keyed the same way (optional).
        calculate_edited_timerange: Whether to add ``editedTimeRange``.

    Returns:
        The list of updated item copies, in the input order.
    """
    # Time ranges recovered from clip file names, keyed like video_result.
    new_timestamps = {}
    for key, video_path in video_result.items():
        new_timestamp = extract_timestamp_from_video_path(video_path)
        if new_timestamp:
            new_timestamps[key] = new_timestamp

    updated_script = []
    # Running length of the output video, used for editedTimeRange.
    accumulated_duration = 0.0

    for item in script_list:
        item_copy = item.copy()
        item_id = item_copy.get('_id')
        orig_timestamp = item_copy.get('timestamp', '')

        item_copy['audio'] = _lookup_by_id_or_timestamp(audio_result, item_id, orig_timestamp)
        item_copy['subtitle'] = _lookup_by_id_or_timestamp(subtitle_result, item_id, orig_timestamp)
        item_copy['video'] = _lookup_by_id_or_timestamp(video_result, item_id, orig_timestamp)

        # Prefer the range encoded in the clip file name; otherwise keep the
        # script's own timestamp. Both keys are always set because callers
        # index them unconditionally.
        source_range = (
            _lookup_by_id_or_timestamp(new_timestamps, item_id, orig_timestamp)
            or orig_timestamp
            or ""
        )
        current_duration = calculate_duration(source_range)
        item_copy['sourceTimeRange'] = source_range
        item_copy['duration'] = current_duration

        if calculate_edited_timerange and current_duration > 0:
            start_time_seconds = accumulated_duration
            end_time_seconds = accumulated_duration + current_duration

            # Whole-second HH:MM:SS; fractions are truncated in the label only.
            start_h = int(start_time_seconds // 3600)
            start_m = int((start_time_seconds % 3600) // 60)
            start_s = int(start_time_seconds % 60)

            end_h = int(end_time_seconds // 3600)
            end_m = int((end_time_seconds % 3600) // 60)
            end_s = int(end_time_seconds % 60)

            item_copy['editedTimeRange'] = (
                f"{start_h:02d}:{start_m:02d}:{start_s:02d}-{end_h:02d}:{end_m:02d}:{end_s:02d}"
            )

            accumulated_duration = end_time_seconds

        updated_script.append(item_copy)

    return updated_script
'【解说】上一集我们看到,范闲在北齐遭遇了惊天变故,生死不明!', 'timestamp': '00:01:15,001-00:04:40,001', 'narration': '但我们都知道,他绝不可能就这么轻易退场!第二集一开场,范闲就已经秘密回到了京都。他的生死传闻,可不像我们想象中那样只是小范围流传,而是…', 'OST': 0, '_id': 2 }, { 'picture': '画面切到王启年小心翼翼地向范闲汇报。', 'timestamp': '00:04:41,001-00:04:58,001', 'narration': '我发现大人的死讯不光是在民间,在官场上也它传开了,所以呢,所以啊,可不是什么好事,将来您跟陛下怎么交代,这可是欺君之罪', 'OST': 1, '_id': 3 }, { 'picture': '【解说】"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。', 'timestamp': '00:04:58,001-00:05:45,001', 'narration': '"欺君之罪"!在封建王朝,这可是抄家灭族的大罪!搁一般人,肯定脚底抹油溜之大吉了。但范闲是谁啊?他偏要反其道而行之!他竟然决定,直接去见庆帝!冒着天大的风险,用"假死"这个事实去赌庆帝的态度!', 'OST': 0, '_id': 4 }, { 'picture': '【解说】但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', 'timestamp': '00:05:45,001-00:06:00,001', 'narration': '但想见庆帝,哪有那么容易?范闲艺高人胆大,竟然选择了最激进的方式——闯宫!', 'OST': 0, '_id': 5 }, { 'picture': '画面切换到范闲蒙面闯入皇宫,被侍卫包围的场景。', 'timestamp': '00:06:00,001-00:06:03,001', 'narration': '抓刺客', 'OST': 1, '_id': 6 }] video_res = { 1: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-00-000@00-00-20-250.mp4', 2: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-00-30-000@00-00-48-950.mp4', 4: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-00-000@00-01-15-688.mp4', 5: '/Users/apple/Desktop/home/NarratoAI/storage/temp/clip_video/fc3db5844d1ba7d7d838be52c0dac1bd/vid_00-01-30-000@00-01-49-512.mp4'} audio_res = { 1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_00_00-00_01_15.mp3', 2: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_01_15-00_04_40.mp3', 4: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_04_58-00_05_45.mp3', 5: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/audio_00_05_45-00_06_00.mp3'} sub_res = { 1: '/Users/apple/Desktop/home/NarratoAI/storage/tasks/qyn2-2-demo/subtitle_00_00_00-00_01_15.srt', 2: 
import os
from typing import Optional, Tuple


class InputValidationError(ValueError):
    """Raised when a required user input (path or content) is missing or invalid."""
    pass


def ensure_existing_file(
    file_path: str,
    *,
    label: str = "文件",
    allowed_exts: Optional[Tuple[str, ...]] = None,
) -> str:
    """
    Check that a path points to an existing regular file of an allowed type.

    Args:
        file_path: Path to validate.
        label: Name of the file kind, used in the error messages.
        allowed_exts: Allowed extensions, e.g. ``('.srt', '.txt')``; the
            comparison is case-insensitive. ``None`` or empty disables the
            extension check.

    Returns:
        str: The absolute path of the file.

    Raises:
        InputValidationError: If the path is blank, does not exist, is not a
            regular file, or has an extension that is not allowed.
    """
    if not file_path or not str(file_path).strip():
        raise InputValidationError(f"{label}不能为空,请先上传{label}")

    normalized = os.path.abspath(str(file_path))

    if not os.path.exists(normalized):
        raise InputValidationError(f"{label}文件不存在: {normalized}")

    if not os.path.isfile(normalized):
        raise InputValidationError(f"{label}不是有效文件: {normalized}")

    if allowed_exts:
        ext = os.path.splitext(normalized)[1].lower()
        allowed = tuple(e.lower() for e in allowed_exts)
        if ext not in allowed:
            raise InputValidationError(
                f"{label}格式不支持: {ext},仅支持: {', '.join(allowed_exts)}"
            )

    return normalized


def _is_blank(value) -> bool:
    """Return True for None and for values whose string form is only whitespace."""
    return value is None or not str(value).strip()


def resolve_subtitle_input(
    *,
    subtitle_content: Optional[str] = None,
    subtitle_file_path: Optional[str] = None,
    srt_path: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """
    Resolve the subtitle source, requiring exactly one of content or file.

    Args:
        subtitle_content: Subtitle text.
        subtitle_file_path: Subtitle file path (preferred).
        srt_path: Subtitle file path (legacy parameter name); used when
            ``subtitle_file_path`` is missing or blank.

    Returns:
        Tuple[Optional[str], Optional[str]]: ``(content, None)`` when text was
        supplied, or ``(None, absolute_path)`` when a file was supplied.

    Raises:
        InputValidationError: If neither or both sources are supplied, or the
            file is missing or not an ``.srt`` file.
    """
    # Take the first non-blank path. A plain `a or b` would pick a
    # whitespace-only subtitle_file_path and hide a valid srt_path.
    file_path = next(
        (candidate for candidate in (subtitle_file_path, srt_path) if not _is_blank(candidate)),
        None,
    )

    has_content = not _is_blank(subtitle_content)
    has_file = file_path is not None

    if has_content and has_file:
        raise InputValidationError("只能提供字幕内容或字幕文件路径之一")

    if not has_content and not has_file:
        raise InputValidationError("必须提供字幕内容或字幕文件路径")

    if has_content:
        # Content was already verified to be non-blank above.
        return str(subtitle_content), None

    resolved_path = ensure_existing_file(
        str(file_path),
        label="字幕",
        allowed_exts=(".srt",),
    )
    return None, resolved_path
@contextmanager
def manage_clip(clip):
    """
    Context manager that closes a clip when the ``with`` block exits.

    Args:
        clip: Any object exposing ``close()``.

    Yields:
        The same clip object; ``close()`` is called even if the block raises.
    """
    try:
        yield clip
    finally:
        clip.close()


def resize_video_with_padding(clip, target_width: int, target_height: int):
    """
    Fit a clip into a target frame, letterboxing with black when needed.

    NOTE(review): ``resize`` / ``set_duration`` / ``set_position`` look like
    MoviePy 1.x method names; confirm they exist in the MoviePy version the
    project pins.

    Args:
        clip: Video clip exposing ``w``, ``h`` and ``duration``.
        target_width: Output frame width.
        target_height: Output frame height.

    Returns:
        The resized clip when the aspect ratios match exactly, otherwise a
        ``CompositeVideoClip`` with the scaled clip centred on a black
        background.
    """
    clip_ratio = clip.w / clip.h
    target_ratio = target_width / target_height

    if clip_ratio == target_ratio:
        return clip.resize((target_width, target_height))

    # Scale along the dimension that fills the frame first.
    if clip_ratio > target_ratio:
        scale_factor = target_width / clip.w
    else:
        scale_factor = target_height / clip.h

    new_width = int(clip.w * scale_factor)
    new_height = int(clip.h * scale_factor)
    clip_resized = clip.resize(newsize=(new_width, new_height))

    background = ColorClip(
        size=(target_width, target_height),
        color=(0, 0, 0)
    ).set_duration(clip.duration)

    return CompositeVideoClip([
        background,
        clip_resized.set_position("center")
    ])


def loop_audio_clip(audio_clip: "AudioFileClip", target_duration: float) -> "AudioFileClip":
    """
    Repeat an audio clip until it covers a target duration, then trim it.

    Args:
        audio_clip: Source audio clip.
        target_duration: Desired length in seconds.

    Returns:
        The looped audio trimmed to ``target_duration``.

    Raises:
        ValueError: If the source clip has no positive duration (it cannot be
            looped, and the loop count would divide by zero).
    """
    clip_duration = audio_clip.duration
    if not clip_duration or clip_duration <= 0:
        raise ValueError("音频时长必须大于0,无法循环")

    loops_needed = int(target_duration / clip_duration) + 1

    if loops_needed == 1:
        extended_audio = audio_clip
    else:
        # One flat composite of shifted copies, instead of wrapping the
        # previous composite in a new one on every iteration.
        copies = [audio_clip] + [
            audio_clip.set_start(index * clip_duration)
            for index in range(1, loops_needed)
        ]
        extended_audio = CompositeAudioClip(copies)

    return extended_audio.subclip(0, target_duration)


def calculate_subtitle_position(position, video_height: int, text_height: int = 0) -> tuple:
    """
    Compute where a subtitle should be placed in the frame.

    Args:
        position: A ``SubtitlePosition`` value, or a number that is multiplied
            by the video height to give the distance from the top.
        video_height: Video height in pixels.
        text_height: Height of the rendered subtitle text; only used for the
            bottom placement.

    Returns:
        tuple: ``('center', y)``.
    """
    margin = 50  # gap between the subtitle and the frame edge

    if isinstance(position, (int, float)):
        return ('center', int(video_height * position))

    if position == SubtitlePosition.TOP:
        return ('center', margin)
    if position == SubtitlePosition.CENTER:
        return ('center', video_height // 2)

    # BOTTOM and any unrecognised value both anchor to the bottom margin.
    return ('center', video_height - margin - text_height)
font_path: 字体文件路径(.ttf/.otf 等格式) """ # 检查视频文件是否存在 if not os.path.exists(video_path): raise FileNotFoundError(f"视频文件不存在: {video_path}") # 加载视频 video = VideoFileClip(video_path) subtitle_clips = [] # 处理字幕(如果启用且提供)- 修复字幕开关bug if subtitle_enabled and subtitle_path: if os.path.exists(subtitle_path): # 检查字体文件 if font_path and not os.path.exists(font_path): logger.warning(f"警告:字体文件不存在: {font_path}") try: subs = pysrt.open(subtitle_path) logger.info(f"读取到 {len(subs)} 条字幕") for index, sub in enumerate(subs): start_time = sub.start.ordinal / 1000 end_time = sub.end.ordinal / 1000 try: # 检查字幕文本是否为空 if not sub.text or sub.text.strip() == '': logger.info(f"警告:第 {index + 1} 条字幕内容为空,已跳过") continue # 处理字幕文本:确保是字符串,并处理可能的列表情况 if isinstance(sub.text, (list, tuple)): subtitle_text = ' '.join(str(item) for item in sub.text if item is not None) else: subtitle_text = str(sub.text) subtitle_text = subtitle_text.strip() if not subtitle_text: logger.info(f"警告:第 {index + 1} 条字幕处理后为空,已跳过") continue # 创建临时 TextClip 来获取文本高度 temp_clip = TextClip( subtitle_text, font=font_path, fontsize=subtitle_style['fontsize'], color=subtitle_style['color'] ) text_height = temp_clip.h temp_clip.close() # 计算字幕位置 position = calculate_subtitle_position( subtitle_style['position'], video.h, text_height ) # 创建最终的 TextClip text_clip = (TextClip( subtitle_text, font=font_path, fontsize=subtitle_style['fontsize'], color=subtitle_style['color'] ) .set_position(position) .set_duration(end_time - start_time) .set_start(start_time)) subtitle_clips.append(text_clip) except Exception as e: logger.error(f"警告:创建第 {index + 1} 条字幕时出错: {traceback.format_exc()}") logger.info(f"成功创建 {len(subtitle_clips)} 条字幕剪辑") except Exception as e: logger.info(f"警告:处理字幕文件时出错: {str(e)}") else: logger.warning(f"字幕文件不存在: {subtitle_path}") elif not subtitle_enabled: logger.info("字幕已禁用,跳过字幕处理") elif not subtitle_path: logger.info("未提供字幕文件路径,跳过字幕处理") # 合并音频 audio_clips = [] # 添加原声(设置音量) logger.info(f"音量配置详情: {volume_config}") if video.audio is not 
None: original_volume = volume_config['original'] logger.info(f"应用原声音量: {original_volume}") original_audio = video.audio.volumex(original_volume) audio_clips.append(original_audio) logger.info("原声音频已添加到合成列表") else: logger.warning("视频没有音轨,无法添加原声") # 添加BGM(如果提供) if bgm_path: logger.info(f"添加背景音乐: {bgm_path}") bgm = AudioFileClip(bgm_path) if bgm.duration < video.duration: bgm = loop_audio_clip(bgm, video.duration) else: bgm = bgm.subclip(0, video.duration) bgm_volume = volume_config['bgm'] logger.info(f"应用BGM音量: {bgm_volume}") bgm = bgm.volumex(bgm_volume) audio_clips.append(bgm) # 添加解说音频(如果提供) if narration_path: logger.info(f"添加解说音频: {narration_path}") narration_volume = volume_config['narration'] logger.info(f"应用解说音量: {narration_volume}") narration = AudioFileClip(narration_path).volumex(narration_volume) audio_clips.append(narration) # 合成最终视频(包含字幕) if subtitle_clips: final_video = CompositeVideoClip([video] + subtitle_clips, size=video.size) else: logger.info("警告:没有字幕被添加到视频中") final_video = video if audio_clips: logger.info(f"合成音频轨道,共 {len(audio_clips)} 个音频片段") final_audio = CompositeAudioClip(audio_clips) final_video = final_video.set_audio(final_audio) logger.info("音频合成完成") else: logger.warning("没有音频轨道需要合成") # 导出视频 - 使用优化的编码器 logger.info("开始导出视频...") # 获取最优编码器 from app.utils import ffmpeg_utils optimal_encoder = ffmpeg_utils.get_optimal_ffmpeg_encoder() # 根据编码器类型设置参数 ffmpeg_params = [] if "nvenc" in optimal_encoder: ffmpeg_params = ['-preset', 'medium', '-profile:v', 'high'] elif "videotoolbox" in optimal_encoder: ffmpeg_params = ['-profile:v', 'high'] elif "qsv" in optimal_encoder: ffmpeg_params = ['-preset', 'medium'] elif "vaapi" in optimal_encoder: ffmpeg_params = ['-profile', '100'] elif optimal_encoder == "libx264": ffmpeg_params = ['-preset', 'medium', '-crf', '23'] try: final_video.write_videofile( output_path, codec=optimal_encoder, audio_codec='aac', fps=video.fps, ffmpeg_params=ffmpeg_params ) logger.info(f"视频已导出到: {output_path} (使用编码器: 
class VideoService:
    """Service facade for cutting a source video into script-driven clips."""

    @staticmethod
    async def crop_video(
        video_path: str,
        video_script: List[dict]
    ) -> Tuple[str, Dict[str, str]]:
        """
        Cut ``video_path`` into one clip per script scene.

        A fresh task id is generated, every scene's ``timestamp`` is handed to
        ``material.clip_videos`` and, on success, each scene dict in
        ``video_script`` is updated in place with a ``path`` entry pointing at
        its clip.

        Args:
            video_path: Path of the source video file.
            video_script: Scene dicts; each must contain a ``timestamp`` key.

        Returns:
            Tuple[str, Dict[str, str]]: ``(task_id, clips)`` where ``clips`` is
            the mapping returned by ``material.clip_videos``, indexed by the
            scene timestamp.

        Raises:
            ValueError: If ``material.clip_videos`` returns ``None``.
            Exception: Any other failure is logged with its traceback and
                re-raised unchanged.
        """
        try:
            task_id = str(uuid4())

            # Collect the timestamp of every scene in script order.
            time_list = [scene['timestamp'] for scene in video_script]

            subclip_videos = material.clip_videos(
                task_id=task_id,
                timestamp_terms=time_list,
                origin_video=video_path
            )

            if subclip_videos is None:
                raise ValueError("裁剪视频失败")

            # Attach the clip path to each scene. A scene whose clip is
            # missing is only logged so the remaining scenes still get theirs.
            for scene in video_script:
                try:
                    scene['path'] = subclip_videos[scene['timestamp']]
                except KeyError as err:
                    logger.error(f"更新视频路径失败: {err}")

            # Report the number of clips actually produced, not the number
            # requested: the two differ when some timestamps yield no clip.
            logger.debug(f"裁剪视频成功,共生成 {len(subclip_videos)} 个视频片段")
            logger.debug(f"视频片段路径: {subclip_videos}")

            return task_id, subclip_videos

        except Exception:
            logger.exception("裁剪视频失败")
            raise
def mktimestamp(time_seconds: float) -> str:
    """
    Convert a duration in seconds to an ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a rounding carry propagates correctly (``59.9996`` becomes
    ``00:01:00.000`` rather than the invalid ``00:00:60.000``). Negative
    input is clamped to zero because a subtitle timestamp cannot be negative.

    Args:
        time_seconds: Time in seconds.

    Returns:
        str: Timestamp such as ``"00:01:23.456"``. The separator before the
        milliseconds is a dot; replace it with a comma for strict SRT syntax.
    """
    total_ms = max(0, int(round(time_seconds * 1000)))
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    seconds, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
Gender: Female Name: ar-JO-SanaNeural Gender: Female Name: ar-JO-TaimNeural Gender: Male Name: ar-KW-FahedNeural Gender: Male Name: ar-KW-NouraNeural Gender: Female Name: ar-LB-LaylaNeural Gender: Female Name: ar-LB-RamiNeural Gender: Male Name: ar-LY-ImanNeural Gender: Female Name: ar-LY-OmarNeural Gender: Male Name: ar-MA-JamalNeural Gender: Male Name: ar-MA-MounaNeural Gender: Female Name: ar-OM-AbdullahNeural Gender: Male Name: ar-OM-AyshaNeural Gender: Female Name: ar-QA-AmalNeural Gender: Female Name: ar-QA-MoazNeural Gender: Male Name: ar-SA-HamedNeural Gender: Male Name: ar-SA-ZariyahNeural Gender: Female Name: ar-SY-AmanyNeural Gender: Female Name: ar-SY-LaithNeural Gender: Male Name: ar-TN-HediNeural Gender: Male Name: ar-TN-ReemNeural Gender: Female Name: ar-YE-MaryamNeural Gender: Female Name: ar-YE-SalehNeural Gender: Male Name: az-AZ-BabekNeural Gender: Male Name: az-AZ-BanuNeural Gender: Female Name: bg-BG-BorislavNeural Gender: Male Name: bg-BG-KalinaNeural Gender: Female Name: bn-BD-NabanitaNeural Gender: Female Name: bn-BD-PradeepNeural Gender: Male Name: bn-IN-BashkarNeural Gender: Male Name: bn-IN-TanishaaNeural Gender: Female Name: bs-BA-GoranNeural Gender: Male Name: bs-BA-VesnaNeural Gender: Female Name: ca-ES-EnricNeural Gender: Male Name: ca-ES-JoanaNeural Gender: Female Name: cs-CZ-AntoninNeural Gender: Male Name: cs-CZ-VlastaNeural Gender: Female Name: cy-GB-AledNeural Gender: Male Name: cy-GB-NiaNeural Gender: Female Name: da-DK-ChristelNeural Gender: Female Name: da-DK-JeppeNeural Gender: Male Name: de-AT-IngridNeural Gender: Female Name: de-AT-JonasNeural Gender: Male Name: de-CH-JanNeural Gender: Male Name: de-CH-LeniNeural Gender: Female Name: de-DE-AmalaNeural Gender: Female Name: de-DE-ConradNeural Gender: Male Name: de-DE-FlorianMultilingualNeural Gender: Male Name: de-DE-KatjaNeural Gender: Female Name: de-DE-KillianNeural Gender: Male Name: de-DE-SeraphinaMultilingualNeural Gender: Female Name: el-GR-AthinaNeural Gender: Female 
Name: el-GR-NestorasNeural Gender: Male Name: en-AU-NatashaNeural Gender: Female Name: en-AU-WilliamNeural Gender: Male Name: en-CA-ClaraNeural Gender: Female Name: en-CA-LiamNeural Gender: Male Name: en-GB-LibbyNeural Gender: Female Name: en-GB-MaisieNeural Gender: Female Name: en-GB-RyanNeural Gender: Male Name: en-GB-SoniaNeural Gender: Female Name: en-GB-ThomasNeural Gender: Male Name: en-HK-SamNeural Gender: Male Name: en-HK-YanNeural Gender: Female Name: en-IE-ConnorNeural Gender: Male Name: en-IE-EmilyNeural Gender: Female Name: en-IN-NeerjaExpressiveNeural Gender: Female Name: en-IN-NeerjaNeural Gender: Female Name: en-IN-PrabhatNeural Gender: Male Name: en-KE-AsiliaNeural Gender: Female Name: en-KE-ChilembaNeural Gender: Male Name: en-NG-AbeoNeural Gender: Male Name: en-NG-EzinneNeural Gender: Female Name: en-NZ-MitchellNeural Gender: Male Name: en-NZ-MollyNeural Gender: Female Name: en-PH-JamesNeural Gender: Male Name: en-PH-RosaNeural Gender: Female Name: en-SG-LunaNeural Gender: Female Name: en-SG-WayneNeural Gender: Male Name: en-TZ-ElimuNeural Gender: Male Name: en-TZ-ImaniNeural Gender: Female Name: en-US-AnaNeural Gender: Female Name: en-US-AndrewNeural Gender: Male Name: en-US-AriaNeural Gender: Female Name: en-US-AvaNeural Gender: Female Name: en-US-BrianNeural Gender: Male Name: en-US-ChristopherNeural Gender: Male Name: en-US-EmmaNeural Gender: Female Name: en-US-EricNeural Gender: Male Name: en-US-GuyNeural Gender: Male Name: en-US-JennyNeural Gender: Female Name: en-US-MichelleNeural Gender: Female Name: en-US-RogerNeural Gender: Male Name: en-US-SteffanNeural Gender: Male Name: en-ZA-LeahNeural Gender: Female Name: en-ZA-LukeNeural Gender: Male Name: es-AR-ElenaNeural Gender: Female Name: es-AR-TomasNeural Gender: Male Name: es-BO-MarceloNeural Gender: Male Name: es-BO-SofiaNeural Gender: Female Name: es-CL-CatalinaNeural Gender: Female Name: es-CL-LorenzoNeural Gender: Male Name: es-CO-GonzaloNeural Gender: Male Name: es-CO-SalomeNeural 
Gender: Female Name: es-CR-JuanNeural Gender: Male Name: es-CR-MariaNeural Gender: Female Name: es-CU-BelkysNeural Gender: Female Name: es-CU-ManuelNeural Gender: Male Name: es-DO-EmilioNeural Gender: Male Name: es-DO-RamonaNeural Gender: Female Name: es-EC-AndreaNeural Gender: Female Name: es-EC-LuisNeural Gender: Male Name: es-ES-AlvaroNeural Gender: Male Name: es-ES-ElviraNeural Gender: Female Name: es-ES-XimenaNeural Gender: Female Name: es-GQ-JavierNeural Gender: Male Name: es-GQ-TeresaNeural Gender: Female Name: es-GT-AndresNeural Gender: Male Name: es-GT-MartaNeural Gender: Female Name: es-HN-CarlosNeural Gender: Male Name: es-HN-KarlaNeural Gender: Female Name: es-MX-DaliaNeural Gender: Female Name: es-MX-JorgeNeural Gender: Male Name: es-NI-FedericoNeural Gender: Male Name: es-NI-YolandaNeural Gender: Female Name: es-PA-MargaritaNeural Gender: Female Name: es-PA-RobertoNeural Gender: Male Name: es-PE-AlexNeural Gender: Male Name: es-PE-CamilaNeural Gender: Female Name: es-PR-KarinaNeural Gender: Female Name: es-PR-VictorNeural Gender: Male Name: es-PY-MarioNeural Gender: Male Name: es-PY-TaniaNeural Gender: Female Name: es-SV-LorenaNeural Gender: Female Name: es-SV-RodrigoNeural Gender: Male Name: es-US-AlonsoNeural Gender: Male Name: es-US-PalomaNeural Gender: Female Name: es-UY-MateoNeural Gender: Male Name: es-UY-ValentinaNeural Gender: Female Name: es-VE-PaolaNeural Gender: Female Name: es-VE-SebastianNeural Gender: Male Name: et-EE-AnuNeural Gender: Female Name: et-EE-KertNeural Gender: Male Name: fa-IR-DilaraNeural Gender: Female Name: fa-IR-FaridNeural Gender: Male Name: fi-FI-HarriNeural Gender: Male Name: fi-FI-NooraNeural Gender: Female Name: fil-PH-AngeloNeural Gender: Male Name: fil-PH-BlessicaNeural Gender: Female Name: fr-BE-CharlineNeural Gender: Female Name: fr-BE-GerardNeural Gender: Male Name: fr-CA-AntoineNeural Gender: Male Name: fr-CA-JeanNeural Gender: Male Name: fr-CA-SylvieNeural Gender: Female Name: fr-CA-ThierryNeural Gender: Male 
Name: fr-CH-ArianeNeural Gender: Female Name: fr-CH-FabriceNeural Gender: Male Name: fr-FR-DeniseNeural Gender: Female Name: fr-FR-EloiseNeural Gender: Female Name: fr-FR-HenriNeural Gender: Male Name: fr-FR-RemyMultilingualNeural Gender: Male Name: fr-FR-VivienneMultilingualNeural Gender: Female Name: ga-IE-ColmNeural Gender: Male Name: ga-IE-OrlaNeural Gender: Female Name: gl-ES-RoiNeural Gender: Male Name: gl-ES-SabelaNeural Gender: Female Name: gu-IN-DhwaniNeural Gender: Female Name: gu-IN-NiranjanNeural Gender: Male Name: he-IL-AvriNeural Gender: Male Name: he-IL-HilaNeural Gender: Female Name: hi-IN-MadhurNeural Gender: Male Name: hi-IN-SwaraNeural Gender: Female Name: hr-HR-GabrijelaNeural Gender: Female Name: hr-HR-SreckoNeural Gender: Male Name: hu-HU-NoemiNeural Gender: Female Name: hu-HU-TamasNeural Gender: Male Name: id-ID-ArdiNeural Gender: Male Name: id-ID-GadisNeural Gender: Female Name: is-IS-GudrunNeural Gender: Female Name: is-IS-GunnarNeural Gender: Male Name: it-IT-DiegoNeural Gender: Male Name: it-IT-ElsaNeural Gender: Female Name: it-IT-GiuseppeNeural Gender: Male Name: it-IT-IsabellaNeural Gender: Female Name: ja-JP-KeitaNeural Gender: Male Name: ja-JP-NanamiNeural Gender: Female Name: jv-ID-DimasNeural Gender: Male Name: jv-ID-SitiNeural Gender: Female Name: ka-GE-EkaNeural Gender: Female Name: ka-GE-GiorgiNeural Gender: Male Name: kk-KZ-AigulNeural Gender: Female Name: kk-KZ-DauletNeural Gender: Male Name: km-KH-PisethNeural Gender: Male Name: km-KH-SreymomNeural Gender: Female Name: kn-IN-GaganNeural Gender: Male Name: kn-IN-SapnaNeural Gender: Female Name: ko-KR-HyunsuNeural Gender: Male Name: ko-KR-InJoonNeural Gender: Male Name: ko-KR-SunHiNeural Gender: Female Name: lo-LA-ChanthavongNeural Gender: Male Name: lo-LA-KeomanyNeural Gender: Female Name: lt-LT-LeonasNeural Gender: Male Name: lt-LT-OnaNeural Gender: Female Name: lv-LV-EveritaNeural Gender: Female Name: lv-LV-NilsNeural Gender: Male Name: mk-MK-AleksandarNeural Gender: Male 
Name: mk-MK-MarijaNeural Gender: Female Name: ml-IN-MidhunNeural Gender: Male Name: ml-IN-SobhanaNeural Gender: Female Name: mn-MN-BataaNeural Gender: Male Name: mn-MN-YesuiNeural Gender: Female Name: mr-IN-AarohiNeural Gender: Female Name: mr-IN-ManoharNeural Gender: Male Name: ms-MY-OsmanNeural Gender: Male Name: ms-MY-YasminNeural Gender: Female Name: mt-MT-GraceNeural Gender: Female Name: mt-MT-JosephNeural Gender: Male Name: my-MM-NilarNeural Gender: Female Name: my-MM-ThihaNeural Gender: Male Name: nb-NO-FinnNeural Gender: Male Name: nb-NO-PernilleNeural Gender: Female Name: ne-NP-HemkalaNeural Gender: Female Name: ne-NP-SagarNeural Gender: Male Name: nl-BE-ArnaudNeural Gender: Male Name: nl-BE-DenaNeural Gender: Female Name: nl-NL-ColetteNeural Gender: Female Name: nl-NL-FennaNeural Gender: Female Name: nl-NL-MaartenNeural Gender: Male Name: pl-PL-MarekNeural Gender: Male Name: pl-PL-ZofiaNeural Gender: Female Name: ps-AF-GulNawazNeural Gender: Male Name: ps-AF-LatifaNeural Gender: Female Name: pt-BR-AntonioNeural Gender: Male Name: pt-BR-FranciscaNeural Gender: Female Name: pt-BR-ThalitaNeural Gender: Female Name: pt-PT-DuarteNeural Gender: Male Name: pt-PT-RaquelNeural Gender: Female Name: ro-RO-AlinaNeural Gender: Female Name: ro-RO-EmilNeural Gender: Male Name: ru-RU-DmitryNeural Gender: Male Name: ru-RU-SvetlanaNeural Gender: Female Name: si-LK-SameeraNeural Gender: Male Name: si-LK-ThiliniNeural Gender: Female Name: sk-SK-LukasNeural Gender: Male Name: sk-SK-ViktoriaNeural Gender: Female Name: sl-SI-PetraNeural Gender: Female Name: sl-SI-RokNeural Gender: Male Name: so-SO-MuuseNeural Gender: Male Name: so-SO-UbaxNeural Gender: Female Name: sq-AL-AnilaNeural Gender: Female Name: sq-AL-IlirNeural Gender: Male Name: sr-RS-NicholasNeural Gender: Male Name: sr-RS-SophieNeural Gender: Female Name: su-ID-JajangNeural Gender: Male Name: su-ID-TutiNeural Gender: Female Name: sv-SE-MattiasNeural Gender: Male Name: sv-SE-SofieNeural Gender: Female Name: 
sw-KE-RafikiNeural Gender: Male Name: sw-KE-ZuriNeural Gender: Female Name: sw-TZ-DaudiNeural Gender: Male Name: sw-TZ-RehemaNeural Gender: Female Name: ta-IN-PallaviNeural Gender: Female Name: ta-IN-ValluvarNeural Gender: Male Name: ta-LK-KumarNeural Gender: Male Name: ta-LK-SaranyaNeural Gender: Female Name: ta-MY-KaniNeural Gender: Female Name: ta-MY-SuryaNeural Gender: Male Name: ta-SG-AnbuNeural Gender: Male Name: ta-SG-VenbaNeural Gender: Female Name: te-IN-MohanNeural Gender: Male Name: te-IN-ShrutiNeural Gender: Female Name: th-TH-NiwatNeural Gender: Male Name: th-TH-PremwadeeNeural Gender: Female Name: tr-TR-AhmetNeural Gender: Male Name: tr-TR-EmelNeural Gender: Female Name: uk-UA-OstapNeural Gender: Male Name: uk-UA-PolinaNeural Gender: Female Name: ur-IN-GulNeural Gender: Female Name: ur-IN-SalmanNeural Gender: Male Name: ur-PK-AsadNeural Gender: Male Name: ur-PK-UzmaNeural Gender: Female Name: uz-UZ-MadinaNeural Gender: Female Name: uz-UZ-SardorNeural Gender: Male Name: vi-VN-HoaiMyNeural Gender: Female Name: vi-VN-NamMinhNeural Gender: Male Name: zh-CN-XiaoxiaoNeural Gender: Female Name: zh-CN-XiaoyiNeural Gender: Female Name: zh-CN-YunjianNeural Gender: Male Name: zh-CN-YunxiNeural Gender: Male Name: zh-CN-YunxiaNeural Gender: Male Name: zh-CN-YunyangNeural Gender: Male Name: zh-CN-liaoning-XiaobeiNeural Gender: Female Name: zh-CN-shaanxi-XiaoniNeural Gender: Female Name: zh-HK-HiuGaaiNeural Gender: Female Name: zh-HK-HiuMaanNeural Gender: Female Name: zh-HK-WanLungNeural Gender: Male Name: zh-TW-HsiaoChenNeural Gender: Female Name: zh-TW-HsiaoYuNeural Gender: Female Name: zh-TW-YunJheNeural Gender: Male Name: zu-ZA-ThandoNeural Gender: Female Name: zu-ZA-ThembaNeural Gender: Male Name: en-US-AvaMultilingualNeural-V2 Gender: Female Name: en-US-AndrewMultilingualNeural-V2 Gender: Male Name: en-US-EmmaMultilingualNeural-V2 Gender: Female Name: en-US-BrianMultilingualNeural-V2 Gender: Male Name: de-DE-FlorianMultilingualNeural-V2 Gender: Male Name: 
def parse_voice_name(name: str):
    """
    Strip the trailing gender tag from a display voice name.

    Examples of accepted input:
        zh-CN-XiaoyiNeural-Female
        zh-CN-YunxiNeural-Male
        zh-CN-XiaoxiaoMultilingualNeural-V2-Female

    Only a *trailing* ``-Female`` / ``-Male`` is removed. ``tts_multiple``
    passes every engine's voice identifier through this function, so an
    occurrence in the middle of an identifier must be left untouched.

    Args:
        name: Voice name, optionally surrounded by whitespace.

    Returns:
        The voice name without the gender suffix and surrounding whitespace.
    """
    return name.strip().removesuffix("-Female").removesuffix("-Male").strip()


def is_azure_v2_voice(voice_name: str):
    """
    Return the base voice name when ``voice_name`` is tagged as a V2 voice.

    Despite the ``is_`` prefix this returns a string, not a bool: the voice
    name with the gender tag and the trailing ``-V2`` removed, or ``""`` when
    the name carries no ``-V2`` suffix.

    Args:
        voice_name: Voice name, with or without a gender suffix.

    Returns:
        str: Base voice name for V2 voices, otherwise an empty string.
    """
    voice_name = parse_voice_name(voice_name)
    if voice_name.endswith("-V2"):
        # Drop only the suffix; "-V2" elsewhere in the name is preserved.
        return voice_name.removesuffix("-V2").strip()
    return ""
def convert_rate_to_percent(rate: float) -> str:
    """
    Convert a speed multiplier to a signed percentage string.

    ``1.0`` means "unchanged" and maps to ``"+0%"``; ``1.2`` maps to
    ``"+20%"`` and ``0.8`` to ``"-20%"``. The sign is always emitted, also
    when the value merely *rounds* to zero (e.g. ``1.004``), because edge-tts
    rejects a rate without an explicit sign.

    Args:
        rate: Speed multiplier relative to normal speed.

    Returns:
        str: Signed integer percentage, e.g. ``"+20%"``.
    """
    percent = int(round((rate - 1.0) * 100))
    return f"{percent:+d}%"


def convert_pitch_to_percent(rate: float) -> str:
    """
    Convert a pitch multiplier to a signed ``Hz`` offset string.

    Uses the same mapping as the rate conversion (``1.0`` -> ``"+0Hz"``,
    ``1.5`` -> ``"+50Hz"``, ``0.5`` -> ``"-50Hz"``) and always emits the
    sign, including for values that round to zero.

    Args:
        rate: Pitch multiplier relative to the default pitch.

    Returns:
        str: Signed integer offset, e.g. ``"+50Hz"``.
    """
    offset = int(round((rate - 1.0) * 100))
    return f"{offset:+d}Hz"
edge_tts.Communicate( text, voice_name, rate=rate_str, pitch=pitch_str, boundary="WordBoundary", proxy=get_edge_tts_proxy(), connect_timeout=10, receive_timeout=60, ) sub_maker = new_sub_maker() audio_data = bytes() # 用于存储音频数据 async for chunk in communicate.stream(): if chunk["type"] == "audio": audio_data += chunk["data"] elif chunk["type"] in {"WordBoundary", "SentenceBoundary"}: add_subtitle_event( sub_maker, start_offset=chunk["offset"], end_offset=chunk["offset"] + chunk["duration"], text=chunk["text"], boundary_type=chunk["type"], ) return sub_maker, audio_data # 获取音频数据和字幕信息 sub_maker, audio_data = asyncio.run(_do()) # 验证数据是否有效 if not audio_data: logger.warning("failed, no audio data generated") if i < 2: time.sleep(1) continue if not sub_maker.subs: logger.warning("edge_tts returned audio without boundary events; subtitle timing may be unavailable") # 数据有效,写入文件 with open(voice_file, "wb") as file: file.write(audio_data) return sub_maker except Exception as e: logger.exception(f"生成音频文件时出错: {type(e).__name__}: {str(e)}") if i < 2: time.sleep(1) return None def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]: # 直接使用官方音色名称,不需要V2后缀验证 # Azure Speech Services 的音色名称如: zh-CN-YunzeNeural, en-US-AvaMultilingualNeural processed_voice_name = voice_name.strip() if not processed_voice_name: logger.error(f"invalid voice name: {voice_name} (empty)") raise ValueError(f"invalid voice name: {voice_name} (empty)") text = text.strip() # 检查Azure Speech SDK是否可用 try: import azure.cognitiveservices.speech as speechsdk except ImportError as e: logger.error("Azure Speech SDK 未安装。请运行: pip install azure-cognitiveservices-speech") logger.error("或者使用 Edge TTS 引擎作为替代方案") return None def _format_duration_to_offset(duration) -> int: if isinstance(duration, str): time_obj = datetime.strptime(duration, "%H:%M:%S.%f") milliseconds = ( (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (time_obj.microsecond // 1000) ) return 
milliseconds * 10000 if isinstance(duration, int): return duration return 0 for i in range(3): try: logger.info(f"start, voice name: {processed_voice_name}, try: {i + 1}") sub_maker = new_sub_maker() def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs): duration = _format_duration_to_offset(str(evt.duration)) offset = _format_duration_to_offset(evt.audio_offset) add_subtitle_event(sub_maker, offset, offset + duration, evt.text) # Creates an instance of a speech config with specified subscription key and service region. speech_key = config.azure.get("speech_key", "") service_region = config.azure.get("speech_region", "") audio_config = speechsdk.audio.AudioOutputConfig( filename=voice_file, use_default_speaker=True ) speech_config = speechsdk.SpeechConfig( subscription=speech_key, region=service_region ) speech_config.speech_synthesis_voice_name = processed_voice_name # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary, # value='true') speech_config.set_property( property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary, value="true", ) speech_config.set_speech_synthesis_output_format( speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3 ) speech_synthesizer = speechsdk.SpeechSynthesizer( audio_config=audio_config, speech_config=speech_config ) speech_synthesizer.synthesis_word_boundary.connect( speech_synthesizer_word_boundary_cb ) result = speech_synthesizer.speak_text_async(text).get() if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: logger.success(f"azure v2 speech synthesis succeeded: {voice_file}") return sub_maker elif result.reason == speechsdk.ResultReason.Canceled: cancellation_details = result.cancellation_details logger.error( f"azure v2 speech synthesis canceled: {cancellation_details.reason}" ) if cancellation_details.reason == speechsdk.CancellationReason.Error: logger.error( f"azure v2 speech synthesis error: 
{cancellation_details.error_details}" ) if i < 2: # 如果不是最后一次重试,则等待1秒 time.sleep(1) logger.info(f"completed, output file: {voice_file}") except Exception as e: logger.error(f"failed, error: {str(e)}") if i < 2: # 如果不是最后一次重试,则等待1秒 time.sleep(3) return None def _format_text(text: str) -> str: text = text.replace("\n", " ") text = text.replace("\"", " ") text = text.replace("[", " ") text = text.replace("]", " ") text = text.replace("(", " ") text = text.replace(")", " ") text = text.replace(")", " ") text = text.replace("(", " ") text = text.replace("{", " ") text = text.replace("}", " ") text = text.strip() return text def create_subtitle_from_multiple(text: str, sub_maker_list: List[SubMaker], list_script: List[dict], subtitle_file: str): """ 根据多个 SubMaker 对象、完整文本和原始脚本创建优化的字幕文件 1. 使用原始脚本中的时间戳 2. 跳过 OST 为 true 的部分 3. 将字幕文件按照标点符号分割成多行 4. 根据完整文本分段,保持原文的语句结构 5. 生成新的字幕文件,时间戳包含小时单位 """ text = _format_text(text) sentences = utils.split_string_by_punctuations(text) def formatter(idx: int, start_time: str, end_time: str, sub_text: str) -> str: return f"{idx}\n{start_time.replace('.', ',')} --> {end_time.replace('.', ',')}\n{sub_text}\n" sub_items = [] sub_index = 0 sentence_index = 0 try: sub_maker_index = 0 for script_item in list_script: if script_item['OST']: continue start_time, end_time = script_item['timestamp'].split('-') if sub_maker_index >= len(sub_maker_list): logger.error(f"Sub maker list index out of range: {sub_maker_index}") break sub_maker = sub_maker_list[sub_maker_index] sub_maker_index += 1 script_duration = utils.time_to_seconds(end_time) - utils.time_to_seconds(start_time) audio_duration = get_audio_duration(sub_maker) time_ratio = script_duration / audio_duration if audio_duration > 0 else 1 current_sub = "" current_start = None current_end = None for offset, sub in zip(sub_maker.offset, sub_maker.subs): sub = unescape(sub).strip() sub_start = utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[0] / 10000000 * time_ratio) sub_end = 
utils.seconds_to_time(utils.time_to_seconds(start_time) + offset[1] / 10000000 * time_ratio) if current_start is None: current_start = sub_start current_end = sub_end current_sub += sub # 检查当前累积的字幕是否匹配下一个句子 while sentence_index < len(sentences) and sentences[sentence_index] in current_sub: sub_index += 1 line = formatter( idx=sub_index, start_time=current_start, end_time=current_end, sub_text=sentences[sentence_index].strip(), ) sub_items.append(line) current_sub = current_sub.replace(sentences[sentence_index], "", 1).strip() current_start = current_end sentence_index += 1 # 如果当前字幕长度超过15个字符,也生成一个新的字幕项 if len(current_sub) > 15: sub_index += 1 line = formatter( idx=sub_index, start_time=current_start, end_time=current_end, sub_text=current_sub.strip(), ) sub_items.append(line) current_sub = "" current_start = current_end # 处理剩余的文本 if current_sub.strip(): sub_index += 1 line = formatter( idx=sub_index, start_time=current_start, end_time=current_end, sub_text=current_sub.strip(), ) sub_items.append(line) if len(sub_items) == 0: logger.error("No subtitle items generated") return with open(subtitle_file, "w", encoding="utf-8") as file: file.write("\n".join(sub_items)) logger.info(f"completed, subtitle file created: {subtitle_file}") except Exception as e: logger.error(f"failed, error: {str(e)}") traceback.print_exc() def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str): """ 优化字幕文件 1. 将字幕文件按照标点符号分割成多行 2. 逐行匹配字幕文件中的文本 3. 
生成新的字幕文件 """ text = _format_text(text) def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str: """ 1 00:00:00,000 --> 00:00:02,360 跑步是一项简单易行的运动 """ start_t = mktimestamp(start_time).replace(".", ",") end_t = mktimestamp(end_time).replace(".", ",") return f"{idx}\n" f"{start_t} --> {end_t}\n" f"{sub_text}\n" start_time = -1.0 sub_items = [] sub_index = 0 script_lines = utils.split_string_by_punctuations(text) def match_line(_sub_line: str, _sub_index: int): if len(script_lines) <= _sub_index: return "" _line = script_lines[_sub_index] if _sub_line == _line: return script_lines[_sub_index].strip() _sub_line_ = re.sub(r"[^\w\s]", "", _sub_line) _line_ = re.sub(r"[^\w\s]", "", _line) if _sub_line_ == _line_: return _line_.strip() _sub_line_ = re.sub(r"\W+", "", _sub_line) _line_ = re.sub(r"\W+", "", _line) if _sub_line_ == _line_: return _line.strip() return "" sub_line = "" try: for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)): _start_time, end_time = offset if start_time < 0: start_time = _start_time # 将 100纳秒单位转换为秒 start_time_seconds = start_time / 10000000 end_time_seconds = end_time / 10000000 sub = unescape(sub) sub_line += sub sub_text = match_line(sub_line, sub_index) if sub_text: sub_index += 1 line = formatter( idx=sub_index, start_time=start_time_seconds, end_time=end_time_seconds, sub_text=sub_text, ) sub_items.append(line) start_time = -1.0 sub_line = "" if len(sub_items) == len(script_lines): with open(subtitle_file, "w", encoding="utf-8") as file: file.write("\n".join(sub_items) + "\n") try: sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8") duration = max([tb for ((ta, tb), txt) in sbs]) logger.info( f"已创建字幕文件: {subtitle_file}, duration: {duration}" ) return subtitle_file, duration except Exception as e: logger.error(f"failed, error: {str(e)}") os.remove(subtitle_file) else: logger.error( f"字幕创建失败, 字幕长度: {len(sub_items)}, script_lines len: {len(script_lines)}" 
def tts_multiple(task_id: str, list_script: list, voice_name: str, voice_rate: float, voice_pitch: float, tts_engine: str = "azure"):
    """
    Synthesize speech for every narrated item of a script.

    Items whose ``OST`` field equals 1 are skipped. For each remaining item
    an ``audio_<timestamp>.mp3`` file is produced in the task directory; a
    matching ``subtitle_<timestamp>.srt`` is created as well unless the voice
    or engine is one of those handled without subtitles (SoulVoice voices,
    Qwen engines, ``indextts2``). Items for which synthesis fails are logged
    and left out of the result.

    :param task_id: Task id used to resolve the output directory
    :param list_script: Script items; each provides ``_id``, ``OST``,
        ``timestamp`` and ``narration``
    :param voice_name: Voice name (a trailing gender tag is stripped)
    :param voice_rate: Speech rate multiplier
    :param voice_pitch: Pitch multiplier
    :param tts_engine: Name of the TTS engine to dispatch to
    :return: List of dicts with ``_id``, ``timestamp``, ``audio_file``,
        ``subtitle_file``, ``duration`` and ``text``
    """
    voice_name = parse_voice_name(voice_name)
    output_dir = utils.task_dir(task_id)
    tts_results = []

    for item in list_script:
        if item['OST'] == 1:
            continue

        # Colons are not used in file names; swap them for underscores.
        timestamp = item['timestamp'].replace(':', '_')
        audio_file = os.path.join(output_dir, f"audio_{timestamp}.mp3")
        subtitle_file = os.path.join(output_dir, f"subtitle_{timestamp}.srt")
        text = item['narration']

        sub_maker = tts(
            text=text,
            voice_name=voice_name,
            voice_rate=voice_rate,
            voice_pitch=voice_pitch,
            voice_file=audio_file,
            tts_engine=tts_engine,
        )
        if sub_maker is None:
            logger.error(f"无法为时间戳 {timestamp} 生成音频; "
                         f"如果您在中国,请使用VPN; "
                         f"或者使用其他 tts 引擎")
            continue

        # SoulVoice, Qwen3 and IndexTTS2 output gets no subtitle file.
        skip_subtitle = (
            is_soulvoice_voice(voice_name)
            or is_qwen_engine(tts_engine)
            or tts_engine == "indextts2"
        )
        if skip_subtitle:
            # Preferred source: the duration of the audio file itself.
            duration = get_audio_duration_from_file(audio_file)
            if duration <= 0:
                # Second choice: the timing recorded in the SubMaker.
                duration = get_audio_duration(sub_maker)
            if duration <= 0:
                # Last resort: estimate from the length of the text.
                duration = max(1.0, len(text) / 3.0)
                logger.warning(f"无法获取音频时长,使用文本估算: {duration:.2f}秒")
            subtitle_file = ""
        else:
            _, duration = create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)

        result_entry = {
            "_id": item['_id'],
            "timestamp": item['timestamp'],
            "audio_file": audio_file,
            "subtitle_file": subtitle_file,
            "duration": duration,
            "text": text,
        }
        tts_results.append(result_entry)
        logger.info(f"已生成音频文件: {audio_file}")

    return tts_results
def tencent_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Generate speech with Tencent Cloud TTS and write it to ``voice_file``.

    Args:
        text: text to synthesize
        voice_name: voice identifier; a ``tencent:`` prefix is stripped by
            ``parse_tencent_voice``
        voice_file: path of the MP3 file to write
        speed: speed multiplier, mapped to ``(speed - 1.0) * 2`` and clamped
            to [-2.0, 2.0] before being sent

    Returns:
        A SubMaker filled from the subtitles in the response (or from a
        text-length estimate when none are returned); None when the SDK is
        missing, credentials are incomplete, or all three attempts fail.
    """
    try:
        # The SDK is imported lazily so the module loads without it.
        from tencentcloud.common import credential
        from tencentcloud.common.profile.client_profile import ClientProfile
        from tencentcloud.common.profile.http_profile import HttpProfile
        from tencentcloud.tts.v20190823 import tts_client, models
        import base64
    except ImportError as e:
        logger.error(f"腾讯云 SDK 未安装: {e}")
        return None

    settings = config.tencent
    secret_id = settings.get("secret_id")
    secret_key = settings.get("secret_key")
    region = settings.get("region", "ap-beijing")

    if not (secret_id and secret_key):
        logger.error("腾讯云 TTS 配置不完整,请检查 secret_id 和 secret_key")
        return None

    voice_type = parse_tencent_voice(voice_name)
    speed_value = max(-2.0, min(2.0, (speed - 1.0) * 2))

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        has_retry_left = attempt < max_attempts
        try:
            logger.info(f"第 {attempt} 次使用腾讯云 TTS 生成音频")

            cred = credential.Credential(secret_id, secret_key)

            http_profile = HttpProfile()
            http_profile.endpoint = "tts.tencentcloudapi.com"

            client_profile = ClientProfile()
            client_profile.httpProfile = http_profile

            client = tts_client.TtsClient(cred, region, client_profile)

            req = models.TextToVoiceRequest()
            req.Text = text
            req.SessionId = str(uuid.uuid4())
            # Non-numeric voice names fall back to voice 101001.
            req.VoiceType = int(voice_type) if voice_type.isdigit() else 101001
            req.Speed = speed_value
            req.SampleRate = 16000
            req.Codec = "mp3"
            req.ProjectId = 0
            req.ModelType = 1
            req.PrimaryLanguage = 1
            req.EnableSubtitle = True

            resp = client.TextToVoice(req)

            if not resp.Audio:
                logger.warning("腾讯云 TTS 返回空音频数据")
                if has_retry_left:
                    time.sleep(1)
                continue

            audio_data = base64.b64decode(resp.Audio)
            with open(voice_file, "wb") as out:
                out.write(audio_data)

            sub_maker = new_sub_maker()
            if resp.Subtitles:
                # Response times are multiplied by 10000 (ms -> 100 ns units).
                for entry in resp.Subtitles:
                    add_subtitle_event(
                        sub_maker,
                        entry.BeginTime * 10000,
                        entry.EndTime * 10000,
                        entry.Text,
                    )
            else:
                # No subtitles returned: estimate 200 ms per character.
                estimated_ms = len(text) * 200
                add_subtitle_event(sub_maker, 0, estimated_ms * 10000, text)

            logger.info(f"腾讯云 TTS 生成成功,文件大小: {len(audio_data)} 字节")
            return sub_maker

        except Exception as e:
            logger.error(f"腾讯云 TTS 生成音频时出错: {str(e)}")
            if has_retry_left:
                time.sleep(1)

    return None
= 1.0) -> Union[SubMaker, None]: """ 使用 SoulVoice API 进行文本转语音 Args: text: 要转换的文本 voice_name: 语音名称 voice_file: 输出音频文件路径 speed: 语音速度 Returns: SubMaker: 包含时间戳信息的字幕制作器,失败时返回 None """ # 获取配置 api_key = config.soulvoice.get("api_key", "") api_url = config.soulvoice.get("api_url", "https://tts.scsmtech.cn/tts") default_model = config.soulvoice.get("model", "FunAudioLLM/CosyVoice2-0.5B") if not api_key: logger.error("SoulVoice API key 未配置") return None # 解析语音名称 parsed_voice = parse_soulvoice_voice(voice_name) # 准备请求数据 headers = { 'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json' } data = { 'text': text.strip(), 'model': default_model, 'voice': parsed_voice, 'speed': speed } # 重试机制 for attempt in range(3): try: logger.info(f"第 {attempt + 1} 次调用 SoulVoice API") # 设置代理 proxies = {} if config.proxy.get("http"): proxies = { 'http': config.proxy.get("http"), 'https': config.proxy.get("https", config.proxy.get("http")) } # 调用 API response = requests.post( api_url, headers=headers, json=data, proxies=proxies, timeout=60 ) if response.status_code == 200: # 保存音频文件 with open(voice_file, 'wb') as f: f.write(response.content) logger.info(f"SoulVoice TTS 成功生成音频: {voice_file}") # SoulVoice 不支持精确字幕生成,返回简单的 SubMaker 对象 sub_maker = new_sub_maker() sub_maker.subs = [text] # 整个文本作为一个段落 sub_maker.offset = [(0, 0)] # 占位时间戳 return sub_maker else: logger.error(f"SoulVoice API 调用失败: {response.status_code} - {response.text}") except requests.exceptions.Timeout: logger.error(f"SoulVoice API 调用超时 (尝试 {attempt + 1}/3)") except requests.exceptions.RequestException as e: logger.error(f"SoulVoice API 网络错误: {str(e)} (尝试 {attempt + 1}/3)") except Exception as e: logger.error(f"SoulVoice TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)") if attempt < 2: # 不是最后一次尝试 time.sleep(2) # 等待2秒后重试 logger.error("SoulVoice TTS 生成失败,已达到最大重试次数") return None def is_soulvoice_voice(voice_name: str) -> bool: """ 检查是否为 SoulVoice 语音 """ return voice_name.startswith("soulvoice:") or 
def indextts2_tts(text: str, voice_name: str, voice_file: str, speed: float = 1.0) -> Union[SubMaker, None]:
    """
    Clone a voice with the IndexTTS2 HTTP API and write the audio to disk.

    The reference audio is opened with a context manager inside each
    attempt, so the handle is always closed, every retry uploads the file
    from its first byte, and a failure to open it is handled like any other
    failed attempt instead of escaping the function.

    Args:
        text: text to synthesize
        voice_name: reference audio path, optionally prefixed with
            ``indextts2:``
        voice_file: path of the audio file to write
        speed: accepted for signature parity with the other engines; it is
            not sent to the API

    Returns:
        SubMaker with a single estimated subtitle event on success, None if
        the reference audio is missing or all three attempts fail.
    """
    settings = config.indextts2
    api_url = settings.get("api_url", "http://192.168.3.6:8081/tts")

    reference_audio_path = parse_indextts2_voice(voice_name)
    if not reference_audio_path or not os.path.exists(reference_audio_path):
        logger.error(f"IndexTTS2 参考音频文件不存在: {reference_audio_path}")
        return None

    data = {
        'text': text.strip(),
        'infer_mode': settings.get("infer_mode", "普通推理"),
        'temperature': settings.get("temperature", 1.0),
        'top_p': settings.get("top_p", 0.8),
        'top_k': settings.get("top_k", 30),
        'do_sample': settings.get("do_sample", True),
        'num_beams': settings.get("num_beams", 3),
        'repetition_penalty': settings.get("repetition_penalty", 10.0),
    }

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            logger.info(f"第 {attempt + 1} 次调用 IndexTTS2 API")

            proxies = {}
            if config.proxy.get("http"):
                proxies = {
                    'http': config.proxy.get("http"),
                    'https': config.proxy.get("https", config.proxy.get("http"))
                }

            # A fresh handle per attempt: closed on every exit path.
            with open(reference_audio_path, 'rb') as prompt_audio:
                response = requests.post(
                    api_url,
                    files={'prompt_audio': prompt_audio},
                    data=data,
                    proxies=proxies,
                    timeout=120  # inference can take a while
                )

            if response.status_code == 200:
                with open(voice_file, 'wb') as f:
                    f.write(response.content)

                logger.info(f"IndexTTS2 成功生成音频: {voice_file}, 大小: {len(response.content)} 字节")

                # No timing data comes back from the API; estimate 200 ms
                # per character (at least 1 s), expressed in 100 ns units.
                sub_maker = new_sub_maker()
                estimated_duration_ms = max(1000, int(len(text) * 200))
                add_subtitle_event(sub_maker, 0, estimated_duration_ms * 10000, text)
                return sub_maker

            logger.error(f"IndexTTS2 API 调用失败: {response.status_code} - {response.text}")

        except requests.exceptions.Timeout:
            logger.error(f"IndexTTS2 API 调用超时 (尝试 {attempt + 1}/3)")
        except requests.exceptions.RequestException as e:
            logger.error(f"IndexTTS2 API 网络错误: {str(e)} (尝试 {attempt + 1}/3)")
        except Exception as e:
            # Also covers failures to open/read the reference audio.
            logger.error(f"IndexTTS2 TTS 处理错误: {str(e)} (尝试 {attempt + 1}/3)")

        if attempt < max_attempts - 1:
            time.sleep(2)  # back off before the next attempt

    logger.error("IndexTTS2 TTS 生成失败,已达到最大重试次数")
    return None
= info.get('formats', []) format_list = [] for f in formats: format_info = { 'format_id': f.get('format_id', 'N/A'), 'ext': f.get('ext', 'N/A'), 'resolution': f.get('format_note', 'N/A'), 'filesize': f.get('filesize', 'N/A'), 'vcodec': f.get('vcodec', 'N/A'), 'acodec': f.get('acodec', 'N/A') } format_list.append(format_info) return format_list except Exception as e: logger.error(f"获取视频格式失败: {str(e)}") raise def _validate_format(self, output_format: str) -> None: """验证输出格式是否支持""" if output_format.lower() not in self.supported_formats: raise ValueError( f"不支持的视频格式: {output_format}。" f"支持的格式: {', '.join(self.supported_formats)}" ) async def download_video( self, url: str, resolution: str, output_format: str = 'mp4', rename: Optional[str] = None ) -> Tuple[str, str, str]: """ 下载指定分辨率的视频 Args: url: YouTube视频URL resolution: 目标分辨率 ('2160p', '1440p', '1080p', '720p' etc.) 注意:对于类似'1080p60'的输入会被处理为'1080p' output_format: 输出视频格式 rename: 可选的重命名 Returns: Tuple[str, str, str]: (task_id, output_path, filename) """ try: task_id = str(uuid4()) self._validate_format(output_format) # 标准化分辨率格式 base_resolution = resolution.split('p')[0] + 'p' # 获取所有可用格式 formats = self._get_video_formats(url) # 查找指定分辨率的最佳视频格式 target_format = None for fmt in formats: fmt_resolution = fmt['resolution'] # 将格式的分辨率也标准化后进行比较 if fmt_resolution != 'N/A': fmt_base_resolution = fmt_resolution.split('p')[0] + 'p' if fmt_base_resolution == base_resolution and fmt['vcodec'] != 'none': target_format = fmt break if target_format is None: # 收集可用分辨率时也进行标准化 available_resolutions = set( fmt['resolution'].split('p')[0] + 'p' for fmt in formats if fmt['resolution'] != 'N/A' and fmt['vcodec'] != 'none' ) raise ValueError( f"未找到 {base_resolution} 分辨率的视频。" f"可用分辨率: {', '.join(sorted(available_resolutions))}" ) # 创建输出目录 output_dir = utils.video_dir() os.makedirs(output_dir, exist_ok=True) # 设置下载选项 if rename: # 如果指定了重命名,直接使用新名字 filename = f"{rename}.{output_format}" output_template = os.path.join(output_dir, filename) else: # 
def check_format(script_content: str) -> Dict[str, Any]:
    """Validate the JSON text of a narration script.

    The script must be a non-empty JSON array of objects, each providing
    ``_id`` (positive integer), ``timestamp``
    (``HH:MM:SS,mmm-HH:MM:SS,mmm``), ``picture`` and ``narration``
    (non-blank strings) and ``OST`` (integer). Validation stops at the
    first problem found.

    JSON booleans are rejected for ``_id`` and ``OST`` (``bool`` is a
    subclass of ``int`` in Python), and the timestamp must match the
    pattern in full, so a trailing newline is not accepted.

    Args:
        script_content: script content as a JSON string

    Returns:
        Dict: {'success': bool, 'message': str, 'details': str}
    """
    required_fields = ['_id', 'timestamp', 'picture', 'narration', 'OST']
    timestamp_re = re.compile(r'\d{2}:\d{2}:\d{2},\d{3}-\d{2}:\d{2}:\d{2},\d{3}')

    def _fail(message: str, details: str) -> Dict[str, Any]:
        return {'success': False, 'message': message, 'details': details}

    def _is_int(value: Any) -> bool:
        # Exclude bool explicitly: isinstance(True, int) is True.
        return isinstance(value, int) and not isinstance(value, bool)

    def _is_non_blank_str(value: Any) -> bool:
        return isinstance(value, str) and bool(value.strip())

    try:
        data = json.loads(script_content)

        if not isinstance(data, list):
            return _fail(
                '脚本必须是JSON数组格式',
                '正确格式应该是: [{"_id": 1, "timestamp": "...", ...}, ...]'
            )

        if not data:
            return _fail('脚本数组不能为空', '至少需要包含一个脚本片段')

        for i, clip in enumerate(data):
            if not isinstance(clip, dict):
                return _fail(
                    f'第{i+1}个元素必须是对象类型',
                    f'当前类型: {type(clip).__name__}'
                )

            for field in required_fields:
                if field not in clip:
                    return _fail(
                        f'第{i+1}个片段缺少必需字段: {field}',
                        f'必需字段: {", ".join(required_fields)}'
                    )

            if not _is_int(clip['_id']) or clip['_id'] <= 0:
                return _fail(
                    f'第{i+1}个片段的_id必须是正整数',
                    f'当前值: {clip["_id"]} (类型: {type(clip["_id"]).__name__})'
                )

            if not isinstance(clip['timestamp'], str) or not timestamp_re.fullmatch(clip['timestamp']):
                return _fail(
                    f'第{i+1}个片段的timestamp格式错误',
                    f'正确格式: "HH:MM:SS,mmm-HH:MM:SS,mmm",示例: "00:00:00,600-00:00:07,559"'
                )

            if not _is_non_blank_str(clip['picture']):
                return _fail(
                    f'第{i+1}个片段的picture必须是非空字符串',
                    f'当前值: {clip.get("picture", "未定义")}'
                )

            if not _is_non_blank_str(clip['narration']):
                return _fail(
                    f'第{i+1}个片段的narration必须是非空字符串',
                    f'当前值: {clip.get("narration", "未定义")}'
                )

            if not _is_int(clip['OST']):
                return _fail(
                    f'第{i+1}个片段的OST必须是整数',
                    f'当前值: {clip["OST"]} (类型: {type(clip["OST"]).__name__}),常用值: 0, 1, 2'
                )

        return {
            'success': True,
            'message': '脚本格式检查通过',
            'details': f'共验证 {len(data)} 个脚本片段,格式正确'
        }

    except json.JSONDecodeError as e:
        return _fail(
            f'JSON格式错误: {str(e)}',
            '请检查JSON语法,确保所有括号、引号、逗号正确'
        )
    except Exception as e:
        return _fail(
            f'检查过程中发生错误: {str(e)}',
            '请联系技术支持'
        )
"platform": None, # 平台信息 "gpu_vendor": None, # GPU厂商 "tested_methods": [] # 已测试的方法 } # 硬件加速优先级配置(按平台和GPU类型) HWACCEL_PRIORITY = { "windows": { "nvidia": ["cuda", "nvenc", "d3d11va", "dxva2"], "amd": ["d3d11va", "dxva2", "amf"], # 不再完全禁用AMD "intel": ["qsv", "d3d11va", "dxva2"], "unknown": ["d3d11va", "dxva2"] }, "darwin": { "apple": ["videotoolbox"], "nvidia": ["cuda", "videotoolbox"], "amd": ["videotoolbox"], "intel": ["videotoolbox"], "unknown": ["videotoolbox"] }, "linux": { "nvidia": ["cuda", "nvenc", "vaapi"], "amd": ["vaapi", "amf"], "intel": ["qsv", "vaapi"], "unknown": ["vaapi"] } } # 编码器映射 ENCODER_MAPPING = { "cuda": "h264_nvenc", "nvenc": "h264_nvenc", "videotoolbox": "h264_videotoolbox", "qsv": "h264_qsv", "vaapi": "h264_vaapi", "amf": "h264_amf", "d3d11va": "libx264", # D3D11VA只用于解码 "dxva2": "libx264", # DXVA2只用于解码 "software": "libx264" } def get_null_input() -> str: """ 获取平台特定的空输入文件路径 Returns: str: 平台特定的空输入路径 """ system = platform.system().lower() if system == "windows": return "NUL" else: return "/dev/null" def create_test_video() -> str: """ 创建一个临时的测试视频文件,用于硬件加速测试 Returns: str: 临时测试视频文件路径 """ try: # 创建临时文件 temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) temp_path = temp_file.name temp_file.close() # 生成一个简单的测试视频(1秒,黑色画面) cmd = [ 'ffmpeg', '-y', '-f', 'lavfi', '-i', 'color=black:size=320x240:duration=1', '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-t', '1', temp_path ] subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) return temp_path except Exception as e: logger.debug(f"创建测试视频失败: {str(e)}") return get_null_input() def cleanup_test_video(path: str) -> None: """ 清理测试视频文件 Args: path: 测试视频文件路径 """ try: if path != get_null_input() and os.path.exists(path): os.unlink(path) except Exception as e: logger.debug(f"清理测试视频失败: {str(e)}") def check_ffmpeg_installation() -> bool: """ 检查ffmpeg是否已安装 Returns: bool: 如果安装则返回True,否则返回False """ try: # 在Windows系统上使用UTF-8编码 is_windows = os.name == 'nt' if is_windows: 
def detect_gpu_vendor() -> str:
    """
    Guess the GPU vendor from the platform-specific GPU description.

    Returns:
        str: one of "nvidia", "amd", "intel", "apple" or "unknown"
    """
    def _first_vendor(description, rules):
        # Rules are checked in order; the first vendor with any keyword
        # present in the description wins.
        for vendor, keywords in rules:
            if any(keyword in description for keyword in keywords):
                return vendor
        return None

    nvidia_amd = (
        ("nvidia", ("nvidia",)),
        ("amd", ("amd", "radeon")),
    )
    os_name = platform.system().lower()

    try:
        if os_name == "windows":
            windows_rules = (
                ("nvidia", ("nvidia", "geforce", "quadro")),
                ("amd", ("amd", "radeon")),
                ("intel", ("intel",)),
            )
            vendor = _first_vendor(_get_windows_gpu_info().lower(), windows_rules)
            if vendor is not None:
                return vendor

        elif os_name == "darwin":
            # ARM machines are reported as Apple GPUs without further probing.
            if platform.machine().lower() in ['arm64', 'aarch64']:
                return "apple"
            # Other Macs: anything that is not NVIDIA or AMD is treated as Intel.
            vendor = _first_vendor(_get_macos_gpu_info().lower(), nvidia_amd)
            return vendor if vendor is not None else "intel"

        elif os_name == "linux":
            linux_rules = nvidia_amd + (("intel", ("intel",)),)
            vendor = _first_vendor(_get_linux_gpu_info().lower(), linux_rules)
            if vendor is not None:
                return vendor

    except Exception as e:
        logger.debug(f"检测GPU厂商失败: {str(e)}")

    return "unknown"
def detect_hardware_acceleration() -> Dict[str, Union[bool, str, List[str], None]]:
    """
    Detect the hardware acceleration FFmpeg can use on this machine.

    Candidate methods are taken from ``HWACCEL_PRIORITY`` for the current
    platform and GPU vendor and tried in order with
    ``test_hwaccel_method``; the first one that works is recorded in the
    module-level ``_FFMPEG_HW_ACCEL_INFO`` dict.

    The result is cached through ``_FFMPEG_HW_ACCEL_INFO["type"]``. When no
    method works, the fallback records ``type="software"`` and
    ``encoder="libx264"`` (the same values ``_detect_windows_acceleration``
    uses) with ``available=False``. Without this the cache guard never
    fired on machines lacking hardware acceleration, so every accessor call
    re-ran the whole probe. A missing FFmpeg is deliberately not cached.

    Returns:
        Dict: the shared hardware acceleration info dictionary
    """
    global _FFMPEG_HW_ACCEL_INFO
    info = _FFMPEG_HW_ACCEL_INFO

    # Already detected (hardware or software fallback): reuse the result.
    if info["type"] is not None:
        return info

    if not check_ffmpeg_installation():
        info["message"] = "FFmpeg未安装或不在系统PATH中"
        return info

    system = platform.system().lower()
    gpu_vendor = detect_gpu_vendor()
    info["platform"] = system
    info["gpu_vendor"] = gpu_vendor

    logger.debug(f"检测硬件加速 - 平台: {system}, GPU厂商: {gpu_vendor}")

    # Ask FFmpeg which hwaccels it was built with.
    try:
        hwaccels_cmd = subprocess.run(
            ['ffmpeg', '-hide_banner', '-hwaccels'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False
        )
        supported_hwaccels = hwaccels_cmd.stdout.lower() if hwaccels_cmd.returncode == 0 else ""
        logger.debug(f"FFmpeg支持的硬件加速器: {supported_hwaccels}")
    except Exception as e:
        logger.warning(f"获取FFmpeg硬件加速器列表失败: {str(e)}")
        supported_hwaccels = ""

    # Arguments for methods that need no device lookup.
    fixed_args = {
        "cuda": ["-hwaccel", "cuda", "-hwaccel_output_format", "cuda"],
        "nvenc": ["-hwaccel", "cuda"],
        "videotoolbox": ["-hwaccel", "videotoolbox"],
        "qsv": ["-hwaccel", "qsv"],
        "d3d11va": ["-hwaccel", "d3d11va"],
        "dxva2": ["-hwaccel", "dxva2"],
        "amf": ["-hwaccel", "auto"],
    }

    test_input = create_test_video()

    try:
        priority_list = HWACCEL_PRIORITY.get(system, {}).get(gpu_vendor, [])
        if not priority_list:
            priority_list = HWACCEL_PRIORITY.get(system, {}).get("unknown", [])

        logger.debug(f"硬件加速测试优先级: {priority_list}")

        for method in priority_list:
            # "nvenc" is always tried even if absent from the -hwaccels output.
            if method not in supported_hwaccels and method != "nvenc":
                logger.debug(f"跳过不支持的硬件加速方法: {method}")
                continue

            info["tested_methods"].append(method)

            if not test_hwaccel_method(method, test_input):
                continue

            info["available"] = True
            info["type"] = method
            info["encoder"] = ENCODER_MAPPING.get(method, "libx264")

            if method == "vaapi":
                render_device = _find_vaapi_device()
                if render_device:
                    info["hwaccel_args"] = ["-hwaccel", "vaapi", "-vaapi_device", render_device]
                else:
                    info["hwaccel_args"] = ["-hwaccel", "vaapi"]
            elif method in fixed_args:
                info["hwaccel_args"] = list(fixed_args[method])

            # NVIDIA/AMD are treated as dedicated; Intel only when the GPU
            # description mentions "arc".
            info["is_dedicated_gpu"] = gpu_vendor in ["nvidia", "amd"] or (
                gpu_vendor == "intel" and "arc" in _get_gpu_info().lower()
            )

            info["message"] = f"使用 {method} 硬件加速 ({gpu_vendor} GPU)"
            logger.debug(f"硬件加速检测成功: {method} ({gpu_vendor})")
            break

        if not info["available"]:
            # Record the software fallback so the cache guard above fires
            # on later calls instead of probing FFmpeg again.
            info["type"] = "software"
            info["encoder"] = "libx264"
            info["hwaccel_args"] = []
            info["fallback_available"] = True
            info["fallback_encoder"] = "libx264"
            info["message"] = f"未找到可用的硬件加速,将使用软件编码 (平台: {system}, GPU: {gpu_vendor})"
            logger.debug("未检测到硬件加速,将使用软件编码")

    finally:
        cleanup_test_video(test_input)

    return info
def _detect_windows_acceleration(supported_hwaccels: str) -> None:
    """
    Detect hardware acceleration on Windows and record it in
    ``_FFMPEG_HW_ACCEL_INFO``.

    Probes are tried in this order, returning at the first success:
    NVIDIA (pure NVENC encoder, then CUDA decode + NVENC), AMD (AMF
    encoder), Intel (QSV encoder). If none succeeds, software encoding
    (libx264) is recorded.

    Note from the original author: CUDA hardware decoding caused
    filter-chain errors when clipping video, so the pure NVENC encoder
    path is preferred for both performance and compatibility.

    NOTE(review): no caller of this function is visible in this part of
    the file — confirm whether it is still used.

    Args:
        supported_hwaccels: the hwaccel list reported by FFmpeg
    """
    global _FFMPEG_HW_ACCEL_INFO

    # On Windows, start from the GPU description.
    gpu_info = _get_windows_gpu_info()
    logger.debug(f"Windows GPU信息: {gpu_info}")

    # Flag Intel integrated graphics ("HD Graphics" / "UHD Graphics").
    is_intel_integrated = False
    if 'intel' in gpu_info.lower() and ('hd graphics' in gpu_info.lower() or 'uhd graphics' in gpu_info.lower()):
        logger.info("检测到Intel集成显卡")
        is_intel_integrated = True

    # 1. NVIDIA first.
    if 'nvidia' in gpu_info.lower() or 'geforce' in gpu_info.lower() or 'quadro' in gpu_info.lower():
        logger.info("检测到NVIDIA显卡,开始测试硬件加速")

        # Check whether the NVENC encoder is compiled into FFmpeg.
        try:
            encoders_cmd = subprocess.run(
                ["ffmpeg", "-hide_banner", "-encoders"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
            )
            has_nvenc = "h264_nvenc" in encoders_cmd.stdout.lower()
            logger.debug(f"NVENC编码器检测结果: {'可用' if has_nvenc else '不可用'}")

            if has_nvenc:
                # Preferred: NVENC encoding only, with no hardware decoding.
                logger.debug("测试纯NVENC编码器(推荐方案,避免滤镜链问题)")
                test_cmd = subprocess.run([
                    "ffmpeg", "-hide_banner", "-loglevel", "error",
                    "-f", "lavfi", "-i", "testsrc=duration=0.1:size=640x480:rate=30",
                    "-c:v", "h264_nvenc", "-preset", "medium", "-cq", "23",
                    "-pix_fmt", "yuv420p", "-f", "null", "-"
                ], stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False)

                if test_cmd.returncode == 0:
                    _FFMPEG_HW_ACCEL_INFO["available"] = True
                    _FFMPEG_HW_ACCEL_INFO["type"] = "nvenc"  # "nvenc" marks the encoder-only path
                    _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
                    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = []  # no hardware-decode arguments
                    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
                    _FFMPEG_HW_ACCEL_INFO["message"] = "纯NVENC编码器(最佳兼容性)"
                    logger.info("✓ 纯NVENC编码器测试成功")
                    return

                # Fallback: CUDA hardware decoding + NVENC (not recommended
                # by the original author for clipping).
                if 'cuda' in supported_hwaccels:
                    logger.debug("测试CUDA硬件解码(仅用于非裁剪场景)")
                    test_cmd = subprocess.run([
                        "ffmpeg", "-hide_banner", "-loglevel", "error",
                        "-hwaccel", "cuda", "-hwaccel_output_format", "cuda",
                        "-f", "lavfi", "-i", "testsrc=duration=0.1:size=640x480:rate=30",
                        "-c:v", "h264_nvenc", "-preset", "medium", "-cq", "23",
                        "-pix_fmt", "yuv420p", "-f", "null", "-"
                    ], stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False)

                    if test_cmd.returncode == 0:
                        _FFMPEG_HW_ACCEL_INFO["available"] = True
                        _FFMPEG_HW_ACCEL_INFO["type"] = "cuda"  # "cuda" type kept for special cases
                        _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_nvenc"
                        _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "cuda", "-hwaccel_output_format", "cuda"]
                        _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
                        _FFMPEG_HW_ACCEL_INFO["message"] = "CUDA+NVENC(限特殊场景使用)"
                        _FFMPEG_HW_ACCEL_INFO["fallback_available"] = True
                        _FFMPEG_HW_ACCEL_INFO["fallback_encoder"] = "h264_nvenc"
                        logger.info("✓ CUDA+NVENC硬件加速测试成功(备用方案)")
                        return

        except Exception as e:
            # A failed probe is logged; the checks below still run.
            logger.debug(f"NVIDIA硬件加速测试失败: {str(e)}")

    # 2. AMD.
    if 'amd' in gpu_info.lower() or 'radeon' in gpu_info.lower():
        logger.info("检测到AMD显卡,开始测试硬件加速")

        # Check whether the AMF encoder is compiled into FFmpeg.
        try:
            encoders_cmd = subprocess.run(
                ["ffmpeg", "-hide_banner", "-encoders"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
            )
            has_amf = "h264_amf" in encoders_cmd.stdout.lower()
            logger.debug(f"AMF编码器检测结果: {'可用' if has_amf else '不可用'}")

            if has_amf:
                # Encode a short synthetic clip with AMF.
                logger.debug("测试AMF编码器")
                test_cmd = subprocess.run([
                    "ffmpeg", "-hide_banner", "-loglevel", "error",
                    "-f", "lavfi", "-i", "testsrc=duration=0.1:size=640x480:rate=30",
                    "-c:v", "h264_amf", "-quality", "balanced", "-qp_i", "23",
                    "-pix_fmt", "yuv420p", "-f", "null", "-"
                ], stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False)

                if test_cmd.returncode == 0:
                    _FFMPEG_HW_ACCEL_INFO["available"] = True
                    _FFMPEG_HW_ACCEL_INFO["type"] = "amf"
                    _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_amf"
                    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = []
                    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = True
                    _FFMPEG_HW_ACCEL_INFO["message"] = "AMD AMF编码器"
                    logger.info("✓ AMD AMF编码器测试成功")
                    return

        except Exception as e:
            logger.debug(f"AMD硬件加速测试失败: {str(e)}")

    # 3. Intel (only when FFmpeg lists the qsv hwaccel).
    if 'intel' in gpu_info.lower() and 'qsv' in supported_hwaccels:
        logger.info("检测到Intel显卡,开始测试硬件加速")

        try:
            encoders_cmd = subprocess.run(
                ["ffmpeg", "-hide_banner", "-encoders"],
                stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False
            )
            has_qsv = "h264_qsv" in encoders_cmd.stdout.lower()
            logger.debug(f"QSV编码器检测结果: {'可用' if has_qsv else '不可用'}")

            if has_qsv:
                # Encode a short synthetic clip with QSV.
                logger.debug("测试QSV编码器")
                test_cmd = subprocess.run([
                    "ffmpeg", "-hide_banner", "-loglevel", "error",
                    "-f", "lavfi", "-i", "testsrc=duration=0.1:size=640x480:rate=30",
                    "-c:v", "h264_qsv", "-preset", "medium", "-global_quality", "23",
                    "-pix_fmt", "yuv420p", "-f", "null", "-"
                ], stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8', text=True, check=False)

                if test_cmd.returncode == 0:
                    _FFMPEG_HW_ACCEL_INFO["available"] = True
                    _FFMPEG_HW_ACCEL_INFO["type"] = "qsv"
                    _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_qsv"
                    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = []
                    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = not is_intel_integrated
                    _FFMPEG_HW_ACCEL_INFO["message"] = "Intel QSV编码器"
                    logger.info("✓ Intel QSV编码器测试成功")
                    return

        except Exception as e:
            logger.debug(f"Intel硬件加速测试失败: {str(e)}")

    # 4. No hardware encoder worked: record software encoding.
    logger.info("未检测到可用的硬件编码器,使用软件编码")
    _FFMPEG_HW_ACCEL_INFO["available"] = False
    _FFMPEG_HW_ACCEL_INFO["type"] = "software"
    _FFMPEG_HW_ACCEL_INFO["encoder"] = "libx264"
    _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = []
    _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = False
    _FFMPEG_HW_ACCEL_INFO["message"] = "使用软件编码"
_FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = is_nvidia or (is_amd and not is_intel) return except Exception as e: logger.debug(f"测试VAAPI失败: {str(e)}") # 检测Intel QSV支持 if 'qsv' in supported_hwaccels and is_intel: try: test_cmd = subprocess.run( ["ffmpeg", "-hwaccel", "qsv", "-i", "/dev/null", "-f", "null", "-"], stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, check=False ) if test_cmd.returncode == 0: _FFMPEG_HW_ACCEL_INFO["available"] = True _FFMPEG_HW_ACCEL_INFO["type"] = "qsv" _FFMPEG_HW_ACCEL_INFO["encoder"] = "h264_qsv" _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] = ["-hwaccel", "qsv"] _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] = False # Intel QSV通常是集成GPU return except Exception as e: logger.debug(f"测试QSV失败: {str(e)}") _FFMPEG_HW_ACCEL_INFO["message"] = f"Linux系统未检测到可用的硬件加速,显卡信息: {gpu_info}" def _get_windows_gpu_info() -> str: """ 获取Windows系统的显卡信息 Returns: str: 显卡信息字符串 """ try: # 使用PowerShell获取更可靠的显卡信息,并使用UTF-8编码 gpu_info = subprocess.run( ['powershell', '-Command', "Get-WmiObject Win32_VideoController | Select-Object Name | Format-List"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', text=True, check=False ) # 如果PowerShell失败,尝试使用wmic if not gpu_info.stdout.strip(): gpu_info = subprocess.run( ['wmic', 'path', 'win32_VideoController', 'get', 'name'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', text=True, check=False ) # 记录详细的显卡信息以便调试 logger.debug(f"Windows显卡信息: {gpu_info.stdout}") return gpu_info.stdout except Exception as e: logger.warning(f"获取Windows显卡信息失败: {str(e)}") return "Unknown GPU" def _get_linux_gpu_info() -> str: """ 获取Linux系统的显卡信息 Returns: str: 显卡信息字符串 """ try: # 尝试使用lspci命令 gpu_info = subprocess.run( ['lspci', '-v', '-nn', '|', 'grep', '-i', 'vga\\|display'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True, check=False ) if gpu_info.stdout: return gpu_info.stdout # 如果lspci命令失败,尝试使用glxinfo gpu_info = subprocess.run( ['glxinfo', '|', 'grep', '-i', 'vendor\\|renderer'], 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True, check=False ) if gpu_info.stdout: return gpu_info.stdout return "Unknown GPU" except Exception as e: logger.warning(f"获取Linux显卡信息失败: {str(e)}") return "Unknown GPU" def get_ffmpeg_hwaccel_args() -> List[str]: """ 获取FFmpeg硬件加速参数 Returns: List[str]: FFmpeg硬件加速参数列表 """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] def get_ffmpeg_hwaccel_type() -> Optional[str]: """ 获取FFmpeg硬件加速类型 Returns: Optional[str]: 硬件加速类型,如果不支持则返回None """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO["type"] if _FFMPEG_HW_ACCEL_INFO["available"] else None def get_ffmpeg_hwaccel_encoder() -> Optional[str]: """ 获取FFmpeg硬件加速编码器 Returns: Optional[str]: 硬件加速编码器,如果不支持则返回None """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO["encoder"] if _FFMPEG_HW_ACCEL_INFO["available"] else None def get_ffmpeg_hwaccel_info() -> Dict[str, Union[bool, str, List[str], None]]: """ 获取FFmpeg硬件加速信息 Returns: Dict: 包含硬件加速信息的字典 """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO def is_ffmpeg_hwaccel_available() -> bool: """ 检查是否有可用的FFmpeg硬件加速 Returns: bool: 如果有可用的硬件加速则返回True,否则返回False """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO["available"] def is_dedicated_gpu() -> bool: """ 检查是否使用独立显卡进行硬件加速 Returns: bool: 如果使用独立显卡则返回True,否则返回False """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() return _FFMPEG_HW_ACCEL_INFO["is_dedicated_gpu"] def get_optimal_ffmpeg_encoder() -> str: """ 获取最优的FFmpeg编码器 Returns: str: 编码器名称 """ # 如果还没有检测过,先进行检测 if _FFMPEG_HW_ACCEL_INFO["type"] is None: detect_hardware_acceleration() if 
def get_ffmpeg_command_with_hwaccel(input_path: str, output_path: str, **kwargs) -> List[str]:
    """
    Build an FFmpeg command line that uses the detected hardware acceleration.

    Args:
        input_path: Path of the input file.
        output_path: Path of the output file.
        **kwargs: Extra FFmpeg options. Keys starting with ``_`` are ignored,
            list values are appended verbatim, and any other value is emitted
            as ``-<key> <str(value)>``.

    Returns:
        List[str]: The FFmpeg argument list, starting with ``ffmpeg -y``.
    """
    # Run detection lazily the first time the cached result is needed.
    if _FFMPEG_HW_ACCEL_INFO["type"] is None:
        detect_hardware_acceleration()

    # Decoder-side acceleration flags are only used when acceleration is available.
    decode_flags = _FFMPEG_HW_ACCEL_INFO["hwaccel_args"] if _FFMPEG_HW_ACCEL_INFO["available"] else []

    command = ["ffmpeg", "-y", *decode_flags, "-i", input_path, "-c:v", get_optimal_ffmpeg_encoder()]

    for option, setting in kwargs.items():
        if option.startswith("_"):
            # Internal parameter, not an FFmpeg option.
            continue
        command += setting if isinstance(setting, list) else [f"-{option}", str(setting)]

    command.append(output_path)
    return command
def force_software_encoding() -> None:
    """
    Switch the cached acceleration state to software (libx264) encoding.

    Overwrites the relevant entries of the module-level
    ``_FFMPEG_HW_ACCEL_INFO`` dictionary in place; other entries are left
    untouched.
    """
    software_profile = dict(
        available=False,
        type="software",
        encoder="libx264",
        hwaccel_args=[],
        message="强制使用软件编码",
        is_dedicated_gpu=False,
        fallback_available=True,
        fallback_encoder="libx264",
    )
    for field, value in software_profile.items():
        _FFMPEG_HW_ACCEL_INFO[field] = value

    logger.info("已强制切换到软件编码模式")
def test_nvenc_directly() -> bool:
    """
    Probe the h264_nvenc encoder on its own, without hardware decoding.

    Encodes a one-second synthetic ``testsrc`` clip to the null muxer.

    Returns:
        bool: True when FFmpeg exits with status 0; False when it exits with
            a non-zero status or when running the probe raises.
    """
    probe_args = [
        "ffmpeg", "-hide_banner", "-loglevel", "error",
        "-f", "lavfi", "-i", "testsrc=duration=1:size=640x480:rate=30",
        "-c:v", "h264_nvenc", "-preset", "fast", "-profile:v", "main",
        "-pix_fmt", "yuv420p", "-t", "1", "-f", "null", "-"
    ]

    try:
        logger.info("🧪 直接测试NVENC编码器...")
        outcome = subprocess.run(
            probe_args,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            encoding='utf-8',
            text=True,
            check=False
        )

        if outcome.returncode != 0:
            logger.warning(f"❌ NVENC编码器测试失败: {outcome.stderr}")
            return False

        logger.info("✅ NVENC编码器测试成功!")
        return True
    except Exception as e:
        logger.error(f"NVENC测试异常: {str(e)}")
        return False
"platform": platform.system(), "gpu_vendor": detect_gpu_vendor(), "ffmpeg_available": check_ffmpeg_installation() } return status # 自动重置检测(在模块导入时执行) def _auto_reset_on_import(): """模块导入时自动重置硬件加速检测""" try: # 只在平台真正改变时才重置,而不是初始化时 current_platform = platform.system() cached_platform = _FFMPEG_HW_ACCEL_INFO.get("platform") # 只有当已经有缓存的平台信息,且平台改变了,才需要重置 if cached_platform is not None and cached_platform != current_platform: reset_hwaccel_detection() except Exception as e: logger.debug(f"自动重置检测失败: {str(e)}") # 执行自动重置 _auto_reset_on_import() ================================================ FILE: app/utils/gemini_analyzer.py ================================================ import json from typing import List, Union, Dict import os from pathlib import Path from loguru import logger from tqdm import tqdm import asyncio from tenacity import retry, stop_after_attempt, retry_if_exception_type, wait_exponential import requests import PIL.Image import traceback import base64 import io from app.utils import utils class VisionAnalyzer: """原生Gemini视觉分析器类""" def __init__(self, model_name: str = "gemini-2.0-flash-exp", api_key: str = None, base_url: str = None): """初始化视觉分析器""" if not api_key: raise ValueError("必须提供API密钥") self.model_name = model_name self.api_key = api_key self.base_url = base_url or "https://generativelanguage.googleapis.com/v1beta" # 初始化配置 self._configure_client() def _configure_client(self): """配置原生Gemini API客户端""" # 使用原生Gemini REST API self.client = None logger.info(f"配置原生Gemini API,端点: {self.base_url}, 模型: {self.model_name}") @retry( stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10), retry=retry_if_exception_type(requests.exceptions.RequestException) ) async def _generate_content_with_retry(self, prompt, batch): """使用重试机制调用原生Gemini API""" try: return await self._generate_with_gemini_api(prompt, batch) except requests.exceptions.RequestException as e: logger.warning(f"Gemini API请求异常: {str(e)}") raise except Exception as e: 
logger.error(f"Gemini API生成内容时发生错误: {str(e)}") raise async def _generate_with_gemini_api(self, prompt, batch): """使用原生Gemini REST API生成内容""" # 将PIL图片转换为base64编码 image_parts = [] for img in batch: # 将PIL图片转换为字节流 img_buffer = io.BytesIO() img.save(img_buffer, format='JPEG', quality=85) # 优化图片质量 img_bytes = img_buffer.getvalue() # 转换为base64 img_base64 = base64.b64encode(img_bytes).decode('utf-8') image_parts.append({ "inline_data": { "mime_type": "image/jpeg", "data": img_base64 } }) # 构建符合官方文档的请求数据 request_data = { "contents": [{ "parts": [ {"text": prompt}, *image_parts ] }], "generationConfig": { "temperature": 1.0, "topK": 40, "topP": 0.95, "maxOutputTokens": 8192, "candidateCount": 1, "stopSequences": [] }, "safetySettings": [ { "category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE" }, { "category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE" }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE" }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE" } ] } # 构建请求URL url = f"{self.base_url}/models/{self.model_name}:generateContent" # 发送请求 response = await asyncio.to_thread( requests.post, url, json=request_data, headers={ "Content-Type": "application/json", "x-goog-api-key": self.api_key }, timeout=120 # 增加超时时间 ) # 处理HTTP错误 if response.status_code == 429: raise requests.exceptions.RequestException(f"API配额限制: {response.text}") elif response.status_code == 400: raise Exception(f"请求参数错误: {response.text}") elif response.status_code == 403: raise Exception(f"API密钥无效或权限不足: {response.text}") elif response.status_code != 200: raise Exception(f"Gemini API请求失败: {response.status_code} - {response.text}") response_data = response.json() # 检查响应格式 if "candidates" not in response_data or not response_data["candidates"]: raise Exception("Gemini API返回无效响应,可能触发了安全过滤") candidate = response_data["candidates"][0] # 检查是否被安全过滤阻止 if "finishReason" in candidate and candidate["finishReason"] == "SAFETY": raise 
Exception("内容被Gemini安全过滤器阻止") if "content" not in candidate or "parts" not in candidate["content"]: raise Exception("Gemini API返回内容格式错误") # 提取文本内容 text_content = "" for part in candidate["content"]["parts"]: if "text" in part: text_content += part["text"] if not text_content.strip(): raise Exception("Gemini API返回空内容") # 创建兼容的响应对象 class CompatibleResponse: def __init__(self, text): self.text = text return CompatibleResponse(text_content) async def analyze_images(self, images: Union[List[str], List[PIL.Image.Image]], prompt: str, batch_size: int) -> List[Dict]: """批量分析多张图片""" try: # 加载图片 if isinstance(images[0], str): images = self.load_images(images) # 验证图片列表 if not images: raise ValueError("图片列表为空") # 验证每个图片对象 valid_images = [] for i, img in enumerate(images): if not isinstance(img, PIL.Image.Image): logger.error(f"无效的图片对象,索引 {i}: {type(img)}") continue valid_images.append(img) if not valid_images: raise ValueError("没有有效的图片对象") images = valid_images results = [] # 视频帧总数除以批量处理大小,如果有小数则+1 batches_needed = len(images) // batch_size if len(images) % batch_size > 0: batches_needed += 1 logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次") with tqdm(total=batches_needed, desc="分析进度") as pbar: for i in range(0, len(images), batch_size): batch = images[i:i + batch_size] retry_count = 0 while retry_count < 3: try: # 在每个批次处理前添加小延迟 # if i > 0: # await asyncio.sleep(2) # 确保每个批次的图片都是有效的 valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)] if not valid_batch: raise ValueError(f"批次 {i // batch_size} 中没有有效的图片") response = await self._generate_content_with_retry(prompt, valid_batch) results.append({ 'batch_index': i // batch_size, 'images_processed': len(valid_batch), 'response': response.text, 'model_used': self.model_name }) break except Exception as e: retry_count += 1 error_msg = f"批次 {i // batch_size} 处理出错: {str(e)}" logger.error(error_msg) if retry_count >= 3: results.append({ 'batch_index': i // batch_size, 
'images_processed': len(batch), 'error': error_msg, 'model_used': self.model_name }) else: logger.info(f"批次 {i // batch_size} 处理失败,等待60秒后重试当前批次...") await asyncio.sleep(60) pbar.update(1) return results except Exception as e: error_msg = f"图片分析过程中发生错误: {str(e)}\n{traceback.format_exc()}" logger.error(error_msg) raise Exception(error_msg) def save_results_to_txt(self, results: List[Dict], output_dir: str): """将分析结果保存到txt文件""" # 确保输出目录存在 os.makedirs(output_dir, exist_ok=True) for result in results: if not result.get('image_paths'): continue response_text = result['response'] image_paths = result['image_paths'] # 从文件名中提取时间戳并转换为标准格式 def format_timestamp(img_path): # 从文件名中提取时间部分 timestamp = Path(img_path).stem.split('_')[-1] try: # 将时间转换为秒 seconds = utils.time_to_seconds(timestamp.replace('_', ':')) # 转换为 HH:MM:SS,mmm 格式 hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) seconds_remainder = seconds % 60 whole_seconds = int(seconds_remainder) milliseconds = int((seconds_remainder - whole_seconds) * 1000) return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}" except Exception as e: logger.error(f"时间戳格式转换错误: {timestamp}, {str(e)}") return timestamp start_timestamp = format_timestamp(image_paths[0]) end_timestamp = format_timestamp(image_paths[-1]) txt_path = os.path.join(output_dir, f"frame_{start_timestamp}_{end_timestamp}.txt") # 保存结果到txt文件 with open(txt_path, 'w', encoding='utf-8') as f: f.write(response_text.strip()) logger.info(f"已保存分析结果到: {txt_path}") def load_images(self, image_paths: List[str]) -> List[PIL.Image.Image]: """ 加载多张图片 Args: image_paths: 图片路径列表 Returns: 加载后的PIL Image对象列表 """ images = [] failed_images = [] for img_path in image_paths: try: if not os.path.exists(img_path): logger.error(f"图片文件不存在: {img_path}") failed_images.append(img_path) continue img = PIL.Image.open(img_path) # 确保图片被完全加载 img.load() # 转换为RGB模式 if img.mode != 'RGB': img = img.convert('RGB') images.append(img) except Exception as e: 
class GeminiOpenAIAnalyzer:
    """Vision analyzer that reaches Gemini through an OpenAI-compatible proxy."""

    def __init__(self, model_name: str = "gemini-2.0-flash-exp", api_key: str = None, base_url: str = None):
        """
        Create the analyzer and its OpenAI client.

        Args:
            model_name: Model identifier sent with every request.
            api_key: API key for the proxy; required.
            base_url: Base URL of the OpenAI-compatible endpoint; required.
                A trailing ``/`` is stripped.

        Raises:
            ValueError: If ``api_key`` or ``base_url`` is missing.
        """
        if not api_key:
            raise ValueError("必须提供API密钥")

        if not base_url:
            raise ValueError("必须提供OpenAI兼容的代理端点URL")

        self.model_name = model_name
        self.api_key = api_key
        self.base_url = base_url.rstrip('/')

        self._configure_client()

    def _configure_client(self):
        """Instantiate the OpenAI client pointed at the proxy endpoint."""
        from openai import OpenAI

        self.client = OpenAI(
            api_key=self.api_key,
            base_url=self.base_url
        )
        logger.info(f"配置OpenAI兼容Gemini代理,端点: {self.base_url}, 模型: {self.model_name}")

    # Every exception type is retried. reraise=True makes the final failure
    # surface as the underlying error rather than tenacity's RetryError, so
    # the message recorded by analyze_images is meaningful.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),
        retry=retry_if_exception_type(Exception),
        reraise=True
    )
    async def _generate_content_with_retry(self, prompt, batch):
        """Call the proxy for one batch, retrying up to three attempts."""
        try:
            return await self._generate_with_openai_api(prompt, batch)
        except Exception as e:
            logger.warning(f"OpenAI兼容Gemini代理请求异常: {str(e)}")
            raise

    async def _generate_with_openai_api(self, prompt, batch):
        """
        Send one prompt plus a batch of images as a chat completion.

        Args:
            prompt: Text prompt placed before the images.
            batch: PIL images; each is JPEG-encoded and embedded as a
                base64 data URL.

        Returns:
            An object whose ``text`` attribute holds the completion text.

        Raises:
            ValueError: If the completion has no choices or no content.
        """
        image_contents = []
        for img in batch:
            # JPEG cannot store alpha or palette data; Pillow raises on such
            # modes, so normalise to RGB before encoding.
            if img.mode != 'RGB':
                img = img.convert('RGB')

            img_buffer = io.BytesIO()
            img.save(img_buffer, format='JPEG', quality=85)
            img_base64 = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
            image_contents.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{img_base64}"
                }
            })

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    *image_contents
                ]
            }
        ]

        # The OpenAI client call is blocking; run it off the event loop.
        response = await asyncio.to_thread(
            self.client.chat.completions.create,
            model=self.model_name,
            messages=messages,
            max_tokens=4000,
            temperature=1.0
        )

        if not response.choices or response.choices[0].message.content is None:
            raise ValueError("OpenAI兼容Gemini代理返回空响应")

        class CompatibleResponse:
            """Minimal wrapper exposing the completion as ``.text``."""

            def __init__(self, text):
                self.text = text

        return CompatibleResponse(response.choices[0].message.content)

    async def analyze_images(self,
                             images: List[Union[str, Path, PIL.Image.Image]],
                             prompt: str,
                             batch_size: int = 10) -> List[str]:
        """
        Analyze images in batches and return one result string per batch.

        Args:
            images: Image paths and/or PIL image objects. Paths that cannot
                be loaded and unsupported item types are logged and skipped.
            prompt: Analysis prompt sent with every batch.
            batch_size: Number of images per request; must be positive.

        Returns:
            List[str]: One entry per batch: the model text, or a string
                starting with ``分析失败`` when the batch failed.

        Raises:
            ValueError: If ``batch_size`` is not positive or no image could
                be loaded.
        """
        if batch_size <= 0:
            raise ValueError("batch_size 必须大于0")

        logger.info(f"开始分析 {len(images)} 张图片,使用OpenAI兼容Gemini代理")

        loaded_images = []
        for img in images:
            if isinstance(img, (str, Path)):
                try:
                    # convert() returns a fully loaded copy, so the file
                    # handle can be closed as soon as the block exits.
                    with PIL.Image.open(img) as opened:
                        pil_img = opened.convert('RGB')
                    # Downscale large frames to limit the request size.
                    if pil_img.size[0] > 1024 or pil_img.size[1] > 1024:
                        pil_img.thumbnail((1024, 1024), PIL.Image.Resampling.LANCZOS)
                    loaded_images.append(pil_img)
                except Exception as e:
                    logger.error(f"加载图片失败 {img}: {str(e)}")
                    continue
            elif isinstance(img, PIL.Image.Image):
                loaded_images.append(img)
            else:
                logger.warning(f"不支持的图片类型: {type(img)}")
                continue

        if not loaded_images:
            raise ValueError("没有有效的图片可以分析")

        results = []
        total_batches = (len(loaded_images) + batch_size - 1) // batch_size

        for i in tqdm(range(0, len(loaded_images), batch_size),
                      desc="分析图片批次", total=total_batches):
            batch = loaded_images[i:i + batch_size]

            try:
                response = await self._generate_content_with_retry(prompt, batch)
                results.append(response.text)

                # Pause between batches (not after the last one) to stay
                # under the proxy's rate limit.
                if i + batch_size < len(loaded_images):
                    await asyncio.sleep(1)

            except Exception as e:
                logger.error(f"分析批次 {i//batch_size + 1} 失败: {str(e)}")
                results.append(f"分析失败: {str(e)}")

        logger.info(f"完成图片分析,共处理 {len(results)} 个批次")
        return results

    def analyze_images_sync(self,
                            images: List[Union[str, Path, PIL.Image.Image]],
                            prompt: str,
                            batch_size: int = 10) -> List[str]:
        """
        Synchronous wrapper around :meth:`analyze_images`.

        Starts its own event loop with ``asyncio.run``, so it must not be
        called from code that is already running inside an event loop.
        """
        return asyncio.run(self.analyze_images(images, prompt, batch_size))
content.append({ "type": "text", "text": prompt % (len(content), len(content), len(content)) }) # 调用API response = await asyncio.to_thread( self.client.chat.completions.create, model=self.model_name, messages=[{ "role": "user", "content": content }] ) return response.choices[0].message.content except Exception as e: logger.error(f"API调用错误: {str(e)}") raise RetryError("API调用失败") async def analyze_images(self, images: Union[List[str], List[PIL.Image.Image]], prompt: str, batch_size: int) -> List[Dict]: """ 批量分析多张图片 Args: images: 图片路径列表或PIL图片对象列表 prompt: 分析提示词 batch_size: 批处理大小 Returns: 分析结果列表 """ try: # 保存原始图片路径(如果是路径列表的话) original_paths = images if isinstance(images[0], str) else None # 加载图片 if isinstance(images[0], str): images = self.load_images(images) # 验证图片列表 if not images: raise ValueError("图片列表为空") # 验证每个图片对象 valid_images = [] valid_paths = [] for i, img in enumerate(images): if not isinstance(img, PIL.Image.Image): logger.error(f"无效的图片对象,索引 {i}: {type(img)}") continue valid_images.append(img) if original_paths: valid_paths.append(original_paths[i]) if not valid_images: raise ValueError("没有有效的图片对象") images = valid_images results = [] # 视频帧总数除以批量处理大小,如果有小数则+1 batches_needed = len(images) // batch_size if len(images) % batch_size > 0: batches_needed += 1 logger.debug(f"视频帧总数:{len(images)}, 每批处理 {batch_size} 帧, 需要访问 VLM {batches_needed} 次") with tqdm(total=batches_needed, desc="分析进度") as pbar: for i in range(0, len(images), batch_size): batch = images[i:i + batch_size] batch_paths = valid_paths[i:i + batch_size] if valid_paths else None retry_count = 0 while retry_count < 3: try: # 在每个批次处理前添加小延迟 # if i > 0: # await asyncio.sleep(0.5) # 确保每个批次的图片都是有效的 valid_batch = [img for img in batch if isinstance(img, PIL.Image.Image)] if not valid_batch: raise ValueError(f"批次 {i // batch_size} 中没有有效的图片") response = await self._generate_content_with_retry(prompt, valid_batch) result_dict = { 'batch_index': i // batch_size, 'images_processed': len(valid_batch), 'response': 
def save_results_to_txt(self, results: List[Dict], output_dir: str):
    """
    Write each successful batch result to its own ``.txt`` file.

    The file is named ``frame_<start>_<end>.txt`` from the last
    ``_``-separated token of the first and last image file stems when the
    result carries ``image_paths``; otherwise ``batch_<batch_index>.txt``.

    Args:
        results: Batch result dictionaries as produced by ``analyze_images``.
        output_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    for result in results:
        # Failed batches carry an 'error' entry and no 'response'; indexing
        # them would raise KeyError and stop the remaining batches from being
        # saved. The failure was already logged when the batch was processed.
        response_text = result.get('response')
        if response_text is None:
            continue

        image_paths = result.get('image_paths')
        if image_paths:
            img_name_start = Path(image_paths[0]).stem.split('_')[-1]
            img_name_end = Path(image_paths[-1]).stem.split('_')[-1]
            file_name = f"frame_{img_name_start}_{img_name_end}.txt"
        else:
            # No path information: fall back to the batch index.
            file_name = f"batch_{result['batch_index']}.txt"

        txt_path = os.path.join(output_dir, file_name)

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(response_text.strip())
        logger.info(f"已保存分析结果到: {txt_path}")
failed_images: logger.warning(f"以下图片加载失败:\n{json.dumps(failed_images, indent=2, ensure_ascii=False)}") if not images: raise ValueError("没有成功加载任何图片") return images ================================================ FILE: app/utils/script_generator.py ================================================ import os import json import traceback from loguru import logger # import tiktoken from typing import List, Dict from datetime import datetime from openai import OpenAI import requests import time class BaseGenerator: def __init__(self, model_name: str, api_key: str, prompt: str): self.model_name = model_name self.api_key = api_key self.base_prompt = prompt self.conversation_history = [] self.chunk_overlap = 50 self.last_chunk_ending = "" self.default_params = { "temperature": 0.7, "max_tokens": 500, "top_p": 0.9, "frequency_penalty": 0.3, "presence_penalty": 0.5 } def _try_generate(self, messages: list, params: dict = None) -> str: max_attempts = 3 tolerance = 5 for attempt in range(max_attempts): try: response = self._generate(messages, params or self.default_params) return self._process_response(response) except Exception as e: if attempt == max_attempts - 1: raise logger.warning(f"Generation attempt {attempt + 1} failed: {str(e)}") continue return "" def _generate(self, messages: list, params: dict) -> any: raise NotImplementedError def _process_response(self, response: any) -> str: return response def generate_script(self, scene_description: str, word_count: int) -> str: """生成脚本的通用方法""" prompt = f"""{self.base_prompt} 上一段文案的结尾:{self.last_chunk_ending if self.last_chunk_ending else "这是第一段,无需考虑上文"} 当前画面描述:{scene_description} 请确保新生成的文案与上文自然衔接,保持叙事的连贯性和趣味性。 不要出现除了文案以外的其他任何内容; 严格字数要求:{word_count}字,允许误差±5字。""" messages = [ {"role": "system", "content": self.base_prompt}, {"role": "user", "content": prompt} ] try: generated_script = self._try_generate(messages, self.default_params) # 更新上下文 if generated_script: self.last_chunk_ending = generated_script[-self.chunk_overlap:] if 
class GeminiGenerator(BaseGenerator):
    """Script generator that calls the native Gemini REST API."""

    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str = None):
        """
        Args:
            model_name: Gemini model identifier used in the request URL.
            api_key: Key sent in the ``x-goog-api-key`` header.
            prompt: Base prompt stored by ``BaseGenerator``.
            base_url: REST endpoint root; defaults to the public v1beta URL.
        """
        super().__init__(model_name, api_key, prompt)

        self.base_url = base_url or "https://generativelanguage.googleapis.com/v1beta"
        self.client = None

        # Parameters in the native Gemini ``generationConfig`` naming.
        self.default_params = {
            "temperature": self.default_params["temperature"],
            "topP": self.default_params["top_p"],
            "topK": 40,
            "maxOutputTokens": 4000,
            "candidateCount": 1,
            "stopSequences": []
        }

    def _generate(self, messages: list, params: dict) -> any:
        """
        Send the messages to the native Gemini ``generateContent`` endpoint.

        All message contents are joined with newlines into a single text
        part. Rate limiting (HTTP 429), network errors and responses without
        candidates are retried, up to three attempts in total.

        Args:
            messages: Chat messages; only each item's ``"content"`` is used.
            params: Dictionary sent as ``generationConfig``.

        Returns:
            An object exposing the raw payload as ``data`` and the
            concatenated candidate text as ``text``.

        Raises:
            requests.exceptions.RequestException: When the final attempt
                fails at the network level.
            Exception: For non-200 responses, safety blocks, or when the
                retries are exhausted.
        """
        max_retries = 3
        blocked_categories = (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            try:
                # Flatten the chat messages into one Gemini text part.
                prompt = "\n".join([m["content"] for m in messages])

                request_data = {
                    "contents": [{
                        "parts": [{"text": prompt}]
                    }],
                    "generationConfig": params,
                    "safetySettings": [
                        {"category": category, "threshold": "BLOCK_NONE"}
                        for category in blocked_categories
                    ]
                }

                url = f"{self.base_url}/models/{self.model_name}:generateContent"

                response = requests.post(
                    url,
                    json=request_data,
                    headers={
                        "Content-Type": "application/json",
                        "x-goog-api-key": self.api_key
                    },
                    timeout=120
                )

                if response.status_code == 429:
                    # Sleeping after the final attempt would only delay a
                    # silent ``None`` return, so fail explicitly instead.
                    if is_last_attempt:
                        raise Exception(f"原生Gemini API 触发限流,重试次数已用尽: {response.text}")
                    wait_time = 65 if attempt == 0 else 30
                    logger.warning(f"原生Gemini API 触发限流,等待{wait_time}秒后重试...")
                    time.sleep(wait_time)
                    continue

                if response.status_code == 400:
                    raise Exception(f"请求参数错误: {response.text}")
                elif response.status_code == 403:
                    raise Exception(f"API密钥无效或权限不足: {response.text}")
                elif response.status_code != 200:
                    raise Exception(f"原生Gemini API请求失败: {response.status_code} - {response.text}")

                response_data = response.json()

                if "candidates" not in response_data or not response_data["candidates"]:
                    if not is_last_attempt:
                        logger.warning("原生Gemini API 返回无效响应,等待30秒后重试...")
                        time.sleep(30)
                        continue
                    else:
                        raise Exception("原生Gemini API返回无效响应,可能触发了安全过滤")

                candidate = response_data["candidates"][0]

                if "finishReason" in candidate and candidate["finishReason"] == "SAFETY":
                    raise Exception("内容被Gemini安全过滤器阻止")

                class CompatibleResponse:
                    """Expose the first candidate's text parts as ``.text``."""

                    def __init__(self, data):
                        self.data = data
                        candidate = data["candidates"][0]
                        self.text = ""
                        if "content" in candidate and "parts" in candidate["content"]:
                            for part in candidate["content"]["parts"]:
                                if "text" in part:
                                    self.text += part["text"]

                return CompatibleResponse(response_data)

            except requests.exceptions.RequestException as e:
                if not is_last_attempt:
                    logger.warning(f"网络请求失败,等待30秒后重试: {str(e)}")
                    time.sleep(30)
                    continue
                else:
                    logger.error(f"原生Gemini API请求失败: {str(e)}")
                    raise
            except Exception as e:
                if not is_last_attempt and "429" in str(e):
                    logger.warning("原生Gemini API 触发限流,等待65秒后重试...")
                    time.sleep(65)
                    continue
                else:
                    logger.error(f"原生Gemini 生成文案错误: {str(e)}")
                    raise

    def _process_response(self, response: any) -> str:
        """Return the stripped text of a native Gemini response.

        Raises:
            ValueError: If the response is missing or has no text.
        """
        if not response or not response.text:
            raise ValueError("原生Gemini API返回无效响应")
        return response.text.strip()


class GeminiOpenAIGenerator(BaseGenerator):
    """Script generator that reaches Gemini through an OpenAI-compatible proxy."""

    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str = None):
        """
        Args:
            model_name: Model identifier sent with every request.
            api_key: API key for the proxy.
            prompt: Base prompt stored by ``BaseGenerator``.
            base_url: Proxy endpoint; required. A trailing ``/`` is stripped.

        Raises:
            ValueError: If ``base_url`` is missing.
        """
        super().__init__(model_name, api_key, prompt)

        if not base_url:
            raise ValueError("OpenAI兼容的Gemini代理必须提供base_url")

        self.base_url = base_url.rstrip('/')

        self.client = OpenAI(
            api_key=api_key,
            base_url=self.base_url
        )

        # Parameters in the OpenAI chat-completions naming.
        self.default_params = {
            "temperature": self.default_params["temperature"],
            "max_tokens": 4000,
            "stream": False
        }

    def _generate(self, messages: list, params: dict) -> any:
        """Create a chat completion through the OpenAI-compatible proxy."""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                **params
            )
            return response
        except Exception as e:
            logger.error(f"OpenAI兼容Gemini代理生成错误: {str(e)}")
            raise

    def _process_response(self, response: any) -> str:
        """Return the stripped content of the first completion choice.

        Raises:
            ValueError: If the response is missing or has no choices.
        """
        if not response or not response.choices:
            raise ValueError("OpenAI兼容Gemini代理返回无效响应")
        return response.choices[0].message.content.strip()
class MoonshotGenerator(BaseGenerator):
    """Script generator backed by the Moonshot API (OpenAI-compatible client)."""

    # Upper bound on consecutive rate-limit retries. Without a cap, a
    # persistent 429 would keep the caller blocked forever.
    max_rate_limit_retries = 5

    def __init__(self, model_name: str, api_key: str, prompt: str, base_url: str):
        """Create the client and extend the inherited default parameters.

        Args:
            model_name: Model identifier sent with every request.
            api_key: API key handed to the client.
            prompt: Prompt forwarded to the base generator.
            base_url: API endpoint; a falsy value selects the public Moonshot URL.
        """
        super().__init__(model_name, api_key, prompt)
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url or "https://api.moonshot.cn/v1"
        )

        # Moonshot-specific request parameters layered on the base defaults.
        self.default_params = {
            **self.default_params,
            "stream": False,
            "stop": None,
            "user": "script_generator",
            "tools": None
        }

    def _generate(self, messages: list, params: dict) -> any:
        """Request a completion, retrying a bounded number of times on HTTP 429.

        Args:
            messages: Chat messages in OpenAI format.
            params: Extra keyword arguments for ``chat.completions.create``.

        Returns:
            The raw completion response.

        Raises:
            Exception: The client error, re-raised after logging, when it is not
                a rate-limit error or the retry budget is exhausted.
        """
        retries = 0
        while True:
            try:
                return self.client.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    **params
                )
            except Exception as e:
                error_str = str(e)
                rate_limited = "Error code: 429" in error_str
                if rate_limited and retries < self.max_rate_limit_retries:
                    retries += 1
                    logger.warning("Moonshot API 触发限流,等待65秒后重试...")
                    time.sleep(65)
                    continue
                logger.error(f"Moonshot generation error: {error_str}")
                raise

    def _process_response(self, response: any) -> str:
        """Extract the stripped text of the first choice.

        Raises:
            ValueError: If the response is empty, has no choices, or the first
                choice carries no text content.
        """
        if not response or not response.choices:
            raise ValueError("Invalid response from Moonshot API")
        content = response.choices[0].message.content
        if content is None:
            # Avoid an unrelated AttributeError from `.strip()` on None.
            raise ValueError("Invalid response from Moonshot API")
        return content.strip()
def __init__(self, model_name: str, api_key: str = None, base_url: str = None, prompt: str = None,
             video_theme: str = ""):
    """Store the configuration and pick the generator matching the model name.

    The generator is selected by case-insensitive substring match on
    ``model_name``, checked in this order: gemini, qwen, moonshot, deepseek.
    Any other name uses the OpenAI generator.

    Args:
        model_name: Name of the text model.
        api_key: API key passed to the selected generator.
        base_url: Endpoint passed to the selected generator.
        prompt: Custom prompt; a falsy value selects the built-in default.
        video_theme: Theme interpolated into the default prompt.
    """
    self.model_name = model_name
    self.api_key = api_key
    self.base_url = base_url
    self.video_theme = video_theme
    # The default prompt embeds video_theme, so it is resolved after that is set.
    self.prompt = prompt if prompt else self._get_default_prompt()

    logger.info(f"文本 LLM 提供商: {model_name}")

    lowered_name = model_name.lower()
    keyword_to_generator = (
        ('gemini', GeminiGenerator),
        ('qwen', QwenGenerator),
        ('moonshot', MoonshotGenerator),
        ('deepseek', DeepSeekGenerator),
    )
    generator_cls = OpenAIGenerator
    for keyword, candidate_cls in keyword_to_generator:
        if keyword in lowered_name:
            generator_cls = candidate_cls
            break

    self.generator = generator_cls(model_name, self.api_key, self.prompt, self.base_url)
def calculate_duration_and_word_count(self, time_range: str) -> int:
    """Estimate how many narration characters fit into a time range.

    The estimate is one character per 0.4 seconds of duration, clamped to the
    range 10..500.

    Args:
        time_range: Range formatted as "HH:MM:SS,mmm-HH:MM:SS,mmm",
            for example "00:00:50,100-00:01:21,500".

    Returns:
        int: The clamped estimate, or 100 when the calculation raises.
    """
    def _to_seconds(stamp: str) -> float:
        """Parse "HH:MM:SS,mmm" into seconds; log and return 0.0 on ValueError."""
        try:
            clock, millis = stamp.split(',')
            hh, mm, ss = (int(field) for field in clock.split(':'))
            return (hh * 3600) + (mm * 60) + ss + (int(millis) / 1000)
        except ValueError as e:
            logger.warning(f"时间格式解析错误: {stamp}, error: {e}")
            return 0.0

    try:
        begin, finish = time_range.split('-')

        begin_seconds = _to_seconds(begin)
        finish_seconds = _to_seconds(finish)
        duration = finish_seconds - begin_seconds

        # One character per 0.4 seconds, then clamp into 10..500.
        estimate = int(duration / 0.4)
        if estimate < 10:
            word_count = 10
        elif estimate > 500:
            word_count = 500
        else:
            word_count = estimate

        logger.debug(f"时间范围 {time_range} 的持续时间为 {duration:.3f}秒, 估算字数: {word_count}")
        return word_count

    except Exception:
        logger.warning(f"字数计算错误: {traceback.format_exc()}")
        return 100  # default when anything above raised
def _save_results(self, frame_content_list: List[Dict]):
    """Add back-to-back ``new_timestamp`` ranges to each frame and dump to JSON.

    Each frame keeps its original duration, but the new ranges are laid out
    consecutively starting at 00:00:00,000. The list is then written to
    ``storage/json/step2_<timestamp>.json`` (relative to the working directory).

    Args:
        frame_content_list: Frame dicts with a ``timestamp`` of the form
            "start-end"; each dict gains a ``new_timestamp`` key in place.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    def format_timestamp(seconds: float) -> str:
        """Render seconds as HH:MM:SS,mmm, rounded to the nearest millisecond."""
        # Work in integer milliseconds: truncating the float remainder turned
        # values such as 2.3 s into ",299".
        total_ms = int(round(seconds * 1000))
        hours, remainder = divmod(total_ms, 3600 * 1000)
        minutes, remainder = divmod(remainder, 60 * 1000)
        whole_seconds, milliseconds = divmod(remainder, 1000)
        return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

    def time_to_seconds(time_str: str) -> float:
        """Parse "HH:MM:SS[,mmm]", "MM:SS[,mmm]" or "SS[,mmm]"; 0.0 on error."""
        try:
            if ',' in time_str:
                time_part, ms_part = time_str.split(',')
                ms = float(ms_part) / 1000
            else:
                time_part = time_str
                ms = 0

            parts = time_part.split(':')
            if len(parts) == 3:  # HH:MM:SS
                h, m, s = map(float, parts)
                seconds = h * 3600 + m * 60 + s
            elif len(parts) == 2:  # MM:SS
                m, s = map(float, parts)
                seconds = m * 60 + s
            else:  # SS
                seconds = float(parts[0])

            return seconds + ms
        except Exception as e:
            logger.error(f"时间格式转换错误 {time_str}: {str(e)}")
            return 0.0

    try:
        current_time = 0.0  # running position in the new timeline, in seconds

        for frame in frame_content_list:
            start_str, end_str = frame['timestamp'].split('-')

            start_seconds = time_to_seconds(start_str)
            end_seconds = time_to_seconds(end_str)
            duration = end_seconds - start_seconds

            new_start = format_timestamp(current_time)
            new_end = format_timestamp(current_time + duration)
            frame['new_timestamp'] = f"{new_start}-{new_end}"

            current_time += duration

        file_name = f"storage/json/step2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        os.makedirs(os.path.dirname(file_name), exist_ok=True)

        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(frame_content_list, file, ensure_ascii=False, indent=4)

        logger.info(f"保存脚本成功,总时长: {format_timestamp(current_time)}")

    except Exception as e:
        logger.error(f"保存结果时发生错误: {str(e)}\n{traceback.format_exc()}")
        raise
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
    """Resolve the background-music file to use.

    Args:
        bgm_type: "random" picks a random track from the song directory;
            a falsy value means no background music.
        bgm_file: Explicit file path; if it exists it is returned whenever
            ``bgm_type`` is truthy.

    Returns:
        str: Path of the chosen file, or "" when nothing applies.
    """
    import glob
    import random

    # No music requested at all.
    if not bgm_type:
        return ""

    # An explicit, existing file takes precedence over random selection.
    if bgm_file and os.path.exists(bgm_file):
        return bgm_file

    if bgm_type != "random":
        return ""

    music_root = song_dir()
    if not os.path.exists(music_root):
        logger.warning(f"背景音乐目录不存在: {music_root}")
        return ""

    # MP3 candidates first, then FLAC.
    candidates = [
        *glob.glob(os.path.join(music_root, "*.mp3")),
        *glob.glob(os.path.join(music_root, "*.flac")),
    ]
    if not candidates:
        logger.warning(f"在目录 {music_root} 中没有找到 MP3 或 FLAC 文件")
        return ""

    return random.choice(candidates)
def format_time(seconds: float) -> str:
    """Convert a number of seconds to an "HH:MM:SS,mmm" string.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the float remainder instead loses a millisecond for
    values such as 2.3, which would render as "00:00:02,299".

    Args:
        seconds: Duration in seconds, as an int or a float.

    Returns:
        str: The formatted time, e.g. "01:01:01,500".
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, milliseconds = divmod(remainder, 1000)
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, milliseconds)
def split_timestamp(timestamp):
    """Split an "MM:SS-MM:SS" range into two "00:MM:SS" strings.

    Args:
        timestamp: Range whose two sides each hold exactly two
            colon-separated integers (minutes and seconds).

    Returns:
        tuple: ``(start_time, end_time)``, each zero-padded as "00:MM:SS".

    Raises:
        ValueError: If the range or either side does not have that shape.
    """
    def _pad(clock):
        """Zero-pad one "MM:SS" value and prefix the hour field."""
        minutes, seconds = (int(piece) for piece in clock.split(':'))
        return '00:{:02d}:{:02d}'.format(minutes, seconds)

    begin, finish = timestamp.split('-')
    return _pad(begin), _pad(finish)
def seconds_to_time(seconds: float) -> str:
    """Convert a number of seconds to an "HH:MM:SS.mmm" string.

    The value is rounded to the nearest millisecond first, so the rounding
    carries into minutes and hours. Formatting the float seconds field
    directly could yield an invalid "60.000" (e.g. 59.9996 became
    "00:00:60.000").

    Args:
        seconds: Duration in seconds.

    Returns:
        str: The formatted time, using a dot before the milliseconds.
    """
    total_ms = int(round(seconds * 1000))
    h, remainder = divmod(total_ms, 3600 * 1000)
    m, remainder = divmod(remainder, 60 * 1000)
    s, ms = divmod(remainder, 1000)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
def clean_model_output(output):
    """Remove a surrounding Markdown code fence from model output.

    Handles an opening fence with or without a ``json`` tag and a closing
    fence, then trims whitespace. Only the fence markers are removed.
    ``str.strip('```json')`` would instead treat its argument as a set of
    characters and also eat leading or trailing ``j``, ``s``, ``o``, ``n``
    characters of the real content.

    Args:
        output: Raw text returned by the model.

    Returns:
        str: The text without the code fence and surrounding whitespace.
    """
    fence = "```"
    text = output.strip()

    if text.startswith(fence):
        text = text[len(fence):]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]

    if text.endswith(fence):
        text = text[:-len(fence)]

    return text.strip()
def clear_keyframes_cache(video_path: str = None):
    """Delete cached keyframes, for one video or for all of them.

    Errors are logged and never propagated.

    Args:
        video_path: Video whose cache should be removed. The cache folder name
            is the MD5 of the path plus the file's modification time. A falsy
            value removes the whole keyframes cache directory.
    """
    import shutil

    try:
        cache_root = os.path.join(temp_dir(), "keyframes")
        if not os.path.exists(cache_root):
            return

        if not video_path:
            # No specific video requested: drop the entire cache.
            shutil.rmtree(cache_root)
            logger.info("已清理所有关键帧缓存")
            return

        # Only this video's cache folder.
        cache_key = md5(video_path + str(os.path.getmtime(video_path)))
        video_cache_dir = os.path.join(cache_root, cache_key)
        if os.path.exists(video_cache_dir):
            shutil.rmtree(video_cache_dir)
            logger.info(f"已清理视频关键帧缓存: {video_path}")

    except Exception as e:
        logger.error(f"清理关键帧缓存失败: {e}")
def init_imagemagick():
    """Check that ImageMagick is callable and register its binary name.

    Runs ``magick -version``. On a zero exit code the ``IMAGEMAGICK_BINARY``
    environment variable is set to ``magick``.

    Returns:
        bool: True on success; False when the command exits non-zero or
        raises (both cases are logged).
    """
    import subprocess

    try:
        probe = subprocess.run(['magick', '-version'], capture_output=True, text=True)
        if probe.returncode == 0:
            os.environ['IMAGEMAGICK_BINARY'] = 'magick'
            return True

        logger.error("ImageMagick 未安装或配置不正确")
        return False
    except Exception as e:
        logger.error(f"初始化 ImageMagick 失败: {str(e)}")
        return False
def extract_frames_by_interval(self, output_dir: str, interval_seconds: float = 5.0,
                               use_hw_accel: bool = True) -> List[int]:
    """Extract one frame every ``interval_seconds`` from the video.

    Frames are written to ``output_dir`` as
    ``keyframe_<frame number>_<HHMMSSmmm>.jpg``.

    Args:
        output_dir: Directory that receives the JPEG files (created if missing).
        interval_seconds: Spacing between extracted frames, in seconds.
            Must be greater than zero.
        use_hw_accel: Whether hardware acceleration may be tried.

    Returns:
        List[int]: Frame numbers of all attempted extraction points; empty
        when the video duration yields no extraction point.

    Raises:
        ValueError: If ``interval_seconds`` is not positive.
        Exception: If no ``.jpg`` file exists in ``output_dir`` afterwards.
    """
    # A zero or negative step would never reach the end of the video.
    if interval_seconds <= 0:
        raise ValueError(f"interval_seconds must be positive, got {interval_seconds}")

    os.makedirs(output_dir, exist_ok=True)

    # Derive each point from its index rather than accumulating the step, so
    # float error cannot drift or add a spurious point near the end.
    extraction_times = []
    step_index = 0
    while step_index * interval_seconds < self.duration:
        extraction_times.append(step_index * interval_seconds)
        step_index += 1

    if not extraction_times:
        logger.warning("未找到需要提取的帧")
        return []

    hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info()
    hwaccel_type = hwaccel_info.get("type", "software")

    frame_numbers = []
    successful_extractions = 0
    failed_extractions = 0

    logger.info(f"开始提取 {len(extraction_times)} 个关键帧,使用 {hwaccel_type} 加速")

    with tqdm(total=len(extraction_times), desc="🎬 提取视频帧", unit="帧",
              bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]") as pbar:

        for timestamp in extraction_times:
            frame_number = int(timestamp * self.fps)
            frame_numbers.append(frame_number)

            # Timestamp part of the file name, formatted as HHMMSSmmm.
            hours = int(timestamp // 3600)
            minutes = int((timestamp % 3600) // 60)
            seconds = int(timestamp % 60)
            milliseconds = int((timestamp % 1) * 1000)
            time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"

            output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")

            success = self._extract_single_frame_optimized(
                timestamp, output_path, use_hw_accel, hwaccel_type
            )

            if success:
                successful_extractions += 1
            else:
                failed_extractions += 1

            pbar.set_postfix({
                "✅": successful_extractions,
                "❌": failed_extractions,
                "时间": f"{timestamp:.1f}s"
            })
            pbar.update(1)

    total_attempts = len(extraction_times)
    success_rate = (successful_extractions / total_attempts) * 100 if total_attempts > 0 else 0

    logger.info(f"关键帧提取完成: 成功 {successful_extractions}/{total_attempts} 帧 ({success_rate:.1f}%)")

    if failed_extractions > 0:
        logger.warning(f"有 {failed_extractions} 帧提取失败,可能是硬件加速兼容性问题")

    # Check what is actually on disk rather than trusting the counters.
    actual_files = [f for f in os.listdir(output_dir) if f.endswith('.jpg')]
    logger.info(f"实际生成文件数量: {len(actual_files)} 个")

    if len(actual_files) == 0:
        logger.error("未生成任何关键帧文件,可能需要禁用硬件加速")
        raise Exception("关键帧提取完全失败,请检查视频文件和 FFmpeg 配置")

    return frame_numbers
def _try_extract_with_hwaccel(self, timestamp: float, output_path: str, hw_accel: List[str]) -> bool:
    """Extract a single frame with the given hardware-acceleration arguments.

    Args:
        timestamp: Position in the video, in seconds.
        output_path: Destination image path.
        hw_accel: FFmpeg arguments inserted before the input options.

    Returns:
        bool: Result of running the assembled FFmpeg command.
    """
    # Global flags, then the acceleration arguments, then seek/input/output.
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "error",
        *hw_accel,
        "-ss", str(timestamp),
        "-i", self.video_path,
        "-vframes", "1",
        "-q:v", "2",
        "-pix_fmt", "yuv420p",
        "-y",
        output_path,
    ]
    label = f"硬件加速提取帧 {timestamp:.1f}s"
    return self._execute_ffmpeg_command(command, label)
"-i", self.video_path, "-vframes", "1", "-f", "image2", # 明确指定图片格式 "-y", png_output ] if self._execute_ffmpeg_command(cmd1, f"PNG格式提取帧 {timestamp:.1f}s"): # 如果 PNG 成功,转换为 JPG try: from PIL import Image with Image.open(png_output) as img: # 转换为 RGB 模式(去除 alpha 通道) if img.mode in ('RGBA', 'LA'): background = Image.new('RGB', img.size, (255, 255, 255)) background.paste(img, mask=img.split()[-1] if img.mode == 'RGBA' else None) img = background img.save(output_path, 'JPEG', quality=90) # 删除临时 PNG 文件 os.remove(png_output) return True except Exception as e: logger.debug(f"PNG 转 JPG 失败: {e}") # 如果转换失败,直接重命名 PNG 为 JPG try: os.rename(png_output, output_path) return True except Exception: pass # 方案2: 使用最简单的参数 cmd2 = [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-i", self.video_path, "-ss", str(timestamp), # 把 -ss 放在 -i 后面 "-vframes", "1", "-f", "mjpeg", # 明确指定 MJPEG 格式 "-q:v", "5", # 降低质量要求 "-y", output_path ] if self._execute_ffmpeg_command(cmd2, f"MJPEG格式提取帧 {timestamp:.1f}s"): return True # 方案3: 最后的尝试 - 使用 BMP 格式 bmp_output = output_path.replace('.jpg', '.bmp') cmd3 = [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-i", self.video_path, "-ss", str(timestamp), "-vframes", "1", "-f", "bmp", "-y", bmp_output ] if self._execute_ffmpeg_command(cmd3, f"BMP格式提取帧 {timestamp:.1f}s"): # 尝试转换 BMP 为 JPG try: from PIL import Image with Image.open(bmp_output) as img: img.save(output_path, 'JPEG', quality=90) os.remove(bmp_output) return True except Exception: # 如果转换失败,直接重命名 try: os.rename(bmp_output, output_path) return True except Exception: pass return False def _execute_ffmpeg_command(self, cmd: List[str], description: str) -> bool: """ 执行 FFmpeg 命令并处理结果 参考 clip_video.py 中的错误处理机制 Args: cmd: FFmpeg 命令列表 description: 操作描述 Returns: bool: 是否成功 """ try: # 参考 clip_video.py 中的 Windows 处理方式 is_windows = os.name == 'nt' process_kwargs = { "stdout": subprocess.PIPE, "stderr": subprocess.PIPE, "text": True, "check": True, "timeout": 30 # 30秒超时 } if is_windows: 
def extract_frames_by_interval_ultra_compatible(self, output_dir: str, interval_seconds: float = 5.0) -> List[int]:
    """
    Extract one frame every ``interval_seconds`` using the ultra-compatible path.

    Every frame is produced by ``_extract_frame_ultra_compatible`` and stored in
    ``output_dir`` as ``keyframe_<frame number>_<HHMMSSmmm>.jpg``.

    Args:
        output_dir: Directory that receives the frames; created when missing.
        interval_seconds: Sampling interval in seconds; must be greater than 0.

    Returns:
        List[int]: Frame number (``int(timestamp * fps)``) of every sampled
        timestamp, including timestamps whose extraction failed. Empty when the
        video duration yields no sampling point.

    Raises:
        ValueError: If ``interval_seconds`` is not a positive number.
        Exception: If ``output_dir`` contains no ``.jpg`` file after all attempts.
    """
    # A zero, negative or NaN interval never advances past the end of the video,
    # which previously made the sampling loop below run forever.
    if not interval_seconds > 0:
        raise ValueError(f"interval_seconds 必须大于 0,当前值: {interval_seconds}")

    os.makedirs(output_dir, exist_ok=True)

    # Sampling points start at 0. Multiplying by the index (instead of repeatedly
    # adding the interval) keeps float rounding error from accumulating.
    end_time = self.duration
    extraction_times = []
    index = 0
    while index * interval_seconds < end_time:
        extraction_times.append(index * interval_seconds)
        index += 1

    if not extraction_times:
        logger.warning("未找到需要提取的帧")
        return []

    frame_numbers = []
    successful_extractions = 0
    failed_extractions = 0

    logger.info(f"开始提取 {len(extraction_times)} 个关键帧,使用超级兼容性方案")

    with tqdm(total=len(extraction_times), desc="🎬 提取关键帧", unit="帧",
              bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]") as pbar:

        for timestamp in extraction_times:
            frame_number = int(timestamp * self.fps)
            frame_numbers.append(frame_number)

            # Timestamp encoded as HHMMSSmmm for the file name.
            hours = int(timestamp // 3600)
            minutes = int((timestamp % 3600) // 60)
            seconds = int(timestamp % 60)
            milliseconds = int((timestamp % 1) * 1000)
            time_str = f"{hours:02d}{minutes:02d}{seconds:02d}{milliseconds:03d}"

            output_path = os.path.join(output_dir, f"keyframe_{frame_number:06d}_{time_str}.jpg")

            if self._extract_frame_ultra_compatible(timestamp, output_path):
                successful_extractions += 1
            else:
                failed_extractions += 1

            pbar.set_postfix({
                "✅": successful_extractions,
                "❌": failed_extractions,
                "时间": f"{timestamp:.1f}s"
            })
            pbar.update(1)

    # Summary of the run.
    total_attempts = len(extraction_times)
    success_rate = (successful_extractions / total_attempts) * 100 if total_attempts > 0 else 0

    logger.info(f"关键帧提取完成: 成功 {successful_extractions}/{total_attempts} 帧 ({success_rate:.1f}%)")

    if failed_extractions > 0:
        logger.warning(f"有 {failed_extractions} 帧提取失败")

    # Check what is actually on disk. This counts every .jpg in the directory,
    # so files left over from an earlier run are included as well.
    actual_files = [f for f in os.listdir(output_dir) if f.endswith('.jpg')]
    logger.info(f"实际生成文件数量: {len(actual_files)} 个")

    if len(actual_files) == 0:
        logger.error("未生成任何关键帧文件")
        raise Exception("关键帧提取完全失败,请检查视频文件")

    return frame_numbers
if __name__ == "__main__":
    import time

    # Manual smoke run: sample one frame every 3 seconds from the bundled test
    # video into ./output and report the wall-clock time taken.
    began_at = time.time()

    demo = VideoProcessor("./resource/videos/test.mp4")
    demo.process_video_pipeline(
        output_dir="output",
        interval_seconds=3.0,
        use_hw_accel=True,
    )

    elapsed = time.time() - began_at
    print(f"处理完成!总耗时: {elapsed:.2f} 秒")
vision_llm_provider = "litellm" # 模型格式:provider/model_name # 常用视觉模型示例: # - Gemini: gemini/gemini-2.0-flash-lite (推荐,速度快成本低) # - Gemini: gemini/gemini-1.5-pro (高精度) # - OpenAI: gpt-4o, gpt-4o-mini # - Qwen: qwen/qwen2.5-vl-32b-instruct # - SiliconFlow: siliconflow/Qwen/Qwen2.5-VL-32B-Instruct vision_litellm_model_name = "gemini/gemini-2.0-flash-lite" vision_litellm_api_key = "" # 填入对应 provider 的 API key vision_litellm_base_url = "" # 可选:自定义 API base URL # ===== 文本模型配置 ===== text_llm_provider = "litellm" # 常用文本模型示例: # - DeepSeek: deepseek/deepseek-chat (推荐,性价比高) # - DeepSeek: deepseek/deepseek-reasoner (推理能力强) # - Gemini: gemini/gemini-2.0-flash (速度快) # - OpenAI: gpt-4o, gpt-4o-mini, gpt-4-turbo # - Qwen: qwen/qwen-plus, qwen/qwen-turbo # - SiliconFlow: siliconflow/deepseek-ai/DeepSeek-R1 # - Moonshot: moonshot/moonshot-v1-8k text_litellm_model_name = "deepseek/deepseek-chat" text_litellm_api_key = "" # 填入对应 provider 的 API key text_litellm_base_url = "" # 可选:自定义 API base URL # ===== API Keys 参考 ===== # 主流 LLM Providers API Key 获取地址: # # OpenAI: https://platform.openai.com/api-keys # Gemini: https://makersuite.google.com/app/apikey # DeepSeek: https://platform.deepseek.com/api_keys # Qwen (阿里): https://bailian.console.aliyun.com/?tab=model#/api-key # SiliconFlow: https://cloud.siliconflow.cn/account/ak (手机号注册) # Moonshot: https://platform.moonshot.cn/console/api-keys # Anthropic: https://console.anthropic.com/settings/keys # Cohere: https://dashboard.cohere.com/api-keys # Together AI: https://api.together.xyz/settings/api-keys ########################################## # 🔧 高级配置(可选) ########################################## # WebUI 界面是否显示配置项 hide_config = true ########################################## # 📚 传统配置示例(仅供参考,不推荐使用) ########################################## # 如果需要使用传统的单独 provider 实现,可以参考以下配置 # 但强烈推荐使用上面的 LiteLLM 配置 # # 传统视觉模型配置示例: # vision_llm_provider = "gemini" # 可选:gemini, qwenvl, siliconflow # vision_gemini_api_key = "" # vision_gemini_model_name = 
"gemini-2.0-flash-lite" # # 传统文本模型配置示例: # text_llm_provider = "openai" # 可选:openai, gemini, qwen, deepseek, siliconflow, moonshot # text_openai_api_key = "" # text_openai_model_name = "gpt-4o-mini" # text_openai_base_url = "https://api.openai.com/v1" ########################################## # TTS (文本转语音) 配置 ########################################## [azure] # Azure TTS 配置 # 获取密钥:https://portal.azure.com speech_key = "" speech_region = "" [tencent] # 腾讯云 TTS 配置 # 访问 https://console.cloud.tencent.com/cam/capi 获取密钥 secret_id = "" secret_key = "" region = "ap-beijing" # 地域配置 [soulvoice] # SoulVoice TTS API 配置 api_key = "" voice_uri = "speech:mcg3fdnx:clzkyf4vy00e5qr6hywum4u84:bzznlkuhcjzpbosexitr" api_url = "https://tts.scsmtech.cn/tts" model = "FunAudioLLM/CosyVoice2-0.5B" [tts_qwen] # 通义千问 Qwen3 TTS 配置 # 访问 https://bailian.console.aliyun.com/?tab=model#/api-key 获取你的 API 密钥 api_key = "" model_name = "qwen3-tts-flash" [indextts2] # IndexTTS2 语音克隆配置 # 这是一个开源的零样本语音克隆项目,需要自行部署 # 项目地址:https://github.com/index-tts/index-tts # 默认 API 地址(本地部署) api_url = "http://127.0.0.1:8081/tts" # 默认参考音频路径(可选) # reference_audio = "/path/to/reference_audio.wav" # 推理模式:普通推理 / 快速推理 infer_mode = "普通推理" # 高级参数 temperature = 1.0 top_p = 0.8 top_k = 30 do_sample = true num_beams = 3 repetition_penalty = 10.0 [ui] # TTS引擎选择 (edge_tts, azure_speech, soulvoice, tencent_tts, tts_qwen) tts_engine = "edge_tts" # Edge TTS 配置 edge_voice_name = "zh-CN-XiaoyiNeural-Female" edge_volume = 80 edge_rate = 1.0 edge_pitch = 0 # Azure Speech Services 配置 azure_voice_name = "zh-CN-XiaoyiNeural-Female" azure_volume = 80 azure_rate = 1.0 azure_pitch = 0 ########################################## # 代理和网络配置 ########################################## [proxy] # HTTP/HTTPS 代理配置(如需要) # clash 默认地址:http://127.0.0.1:7890 http = "" https = "" enabled = false ########################################## # 视频处理配置 ########################################## [frames] # 提取关键帧的间隔时间(秒) frame_interval_input = 3 # 大模型单次处理的关键帧数量 
# Verify that Docker, a Compose implementation and a running daemon are available.
# Logs an error and exits with status 1 on the first missing prerequisite.
check_requirements() {
    log_info "检查系统要求..."

    # The docker CLI itself must be installed.
    command -v docker >/dev/null 2>&1 || {
        log_error "Docker 未安装,请先安装 Docker"
        exit 1
    }

    # Either the standalone docker-compose binary or the `docker compose` plugin is accepted.
    # NOTE(review): the other functions in this script invoke `docker-compose` directly,
    # so a host that only has the plugin passes this check but fails later -- confirm intent.
    command -v docker-compose >/dev/null 2>&1 \
        || docker compose version >/dev/null 2>&1 \
        || {
            log_error "Docker Compose 未安装,请先安装 Docker Compose"
            exit 1
        }

    # The Docker daemon must be reachable.
    docker info >/dev/null 2>&1 || {
        log_error "Docker 服务未运行,请启动 Docker"
        exit 1
    }
}
# Install Python dependencies at container start when requirements.txt is newer
# than the marker file written by the last successful installation.
# Tries a system-wide install through passwordless sudo first and falls back to
# a per-user install. The marker is only written when the installation succeeded,
# so a failed attempt is retried on the next start.
install_runtime_dependencies() {
    log "检查并安装运行时依赖..."

    local requirements_file="requirements.txt"
    local installed_packages_file="/tmp/installed_packages.txt"
    local install_result=1   # non-zero triggers the user-level install
    local line

    if [ ! -f "$requirements_file" ]; then
        log "未找到 requirements.txt 文件"
        return 0
    fi

    # Skip when the marker exists and requirements.txt has not changed since.
    if [ -f "$installed_packages_file" ] && [ ! "$requirements_file" -nt "$installed_packages_file" ]; then
        log "依赖已是最新版本,跳过安装"
        return 0
    fi

    log "发现新的依赖需求,开始安装..."

    # Prefer a system-wide install when sudo works without a password prompt.
    if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then
        log "尝试使用sudo安装依赖..."
        sudo pip install --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done
        # PIPESTATUS must be read immediately: it holds pip's status, not the loop's.
        install_result=${PIPESTATUS[0]}
    fi

    # Fall back to a per-user install.
    if [ "$install_result" -ne 0 ]; then
        log "尝试用户级安装依赖..."
        pip install --user --no-cache-dir -r "$requirements_file" 2>&1 | while IFS= read -r line; do
            log "pip: $line"
        done
        install_result=${PIPESTATUS[0]}

        # Make console scripts of user-level packages reachable.
        export PATH="$HOME/.local/bin:$PATH"
    fi

    # Make sure the Tencent Cloud SDK is present.
    log "确保腾讯云SDK已安装..."
    if ! pip list | grep -q "tencentcloud-sdk-python"; then
        log "安装腾讯云SDK..."
        # The requirement must be quoted: unquoted, ">=3.0.1200" is parsed by the
        # shell as a redirection into a file named "=3.0.1200".
        pip install --user "tencentcloud-sdk-python>=3.0.1200"
    else
        log "腾讯云SDK已安装"
    fi

    if [ "$install_result" -eq 0 ]; then
        # Record the installation time for the freshness check above.
        touch "$installed_packages_file"
        log "依赖安装完成"
    else
        log "警告: 依赖安装失败,下次启动时将重试"
    fi
}
if curl -f http://localhost:8501/_stcore/health >/dev/null 2>&1; then log "健康检查通过" exit 0 else log "健康检查失败" exit 1 fi ;; *) log "执行自定义命令: $*" exec "$@" ;; esac ================================================ FILE: docs/voice-list.txt ================================================ Name: af-ZA-AdriNeural Gender: Female Name: af-ZA-WillemNeural Gender: Male Name: am-ET-AmehaNeural Gender: Male Name: am-ET-MekdesNeural Gender: Female Name: ar-AE-FatimaNeural Gender: Female Name: ar-AE-HamdanNeural Gender: Male Name: ar-BH-AliNeural Gender: Male Name: ar-BH-LailaNeural Gender: Female Name: ar-DZ-AminaNeural Gender: Female Name: ar-DZ-IsmaelNeural Gender: Male Name: ar-EG-SalmaNeural Gender: Female Name: ar-EG-ShakirNeural Gender: Male Name: ar-IQ-BasselNeural Gender: Male Name: ar-IQ-RanaNeural Gender: Female Name: ar-JO-SanaNeural Gender: Female Name: ar-JO-TaimNeural Gender: Male Name: ar-KW-FahedNeural Gender: Male Name: ar-KW-NouraNeural Gender: Female Name: ar-LB-LaylaNeural Gender: Female Name: ar-LB-RamiNeural Gender: Male Name: ar-LY-ImanNeural Gender: Female Name: ar-LY-OmarNeural Gender: Male Name: ar-MA-JamalNeural Gender: Male Name: ar-MA-MounaNeural Gender: Female Name: ar-OM-AbdullahNeural Gender: Male Name: ar-OM-AyshaNeural Gender: Female Name: ar-QA-AmalNeural Gender: Female Name: ar-QA-MoazNeural Gender: Male Name: ar-SA-HamedNeural Gender: Male Name: ar-SA-ZariyahNeural Gender: Female Name: ar-SY-AmanyNeural Gender: Female Name: ar-SY-LaithNeural Gender: Male Name: ar-TN-HediNeural Gender: Male Name: ar-TN-ReemNeural Gender: Female Name: ar-YE-MaryamNeural Gender: Female Name: ar-YE-SalehNeural Gender: Male Name: az-AZ-BabekNeural Gender: Male Name: az-AZ-BanuNeural Gender: Female Name: bg-BG-BorislavNeural Gender: Male Name: bg-BG-KalinaNeural Gender: Female Name: bn-BD-NabanitaNeural Gender: Female Name: bn-BD-PradeepNeural Gender: Male Name: bn-IN-BashkarNeural Gender: Male Name: bn-IN-TanishaaNeural Gender: Female Name: bs-BA-GoranNeural 
Gender: Male Name: bs-BA-VesnaNeural Gender: Female Name: ca-ES-EnricNeural Gender: Male Name: ca-ES-JoanaNeural Gender: Female Name: cs-CZ-AntoninNeural Gender: Male Name: cs-CZ-VlastaNeural Gender: Female Name: cy-GB-AledNeural Gender: Male Name: cy-GB-NiaNeural Gender: Female Name: da-DK-ChristelNeural Gender: Female Name: da-DK-JeppeNeural Gender: Male Name: de-AT-IngridNeural Gender: Female Name: de-AT-JonasNeural Gender: Male Name: de-CH-JanNeural Gender: Male Name: de-CH-LeniNeural Gender: Female Name: de-DE-AmalaNeural Gender: Female Name: de-DE-ConradNeural Gender: Male Name: de-DE-FlorianMultilingualNeural Gender: Male Name: de-DE-KatjaNeural Gender: Female Name: de-DE-KillianNeural Gender: Male Name: de-DE-SeraphinaMultilingualNeural Gender: Female Name: el-GR-AthinaNeural Gender: Female Name: el-GR-NestorasNeural Gender: Male Name: en-AU-NatashaNeural Gender: Female Name: en-AU-WilliamNeural Gender: Male Name: en-CA-ClaraNeural Gender: Female Name: en-CA-LiamNeural Gender: Male Name: en-GB-LibbyNeural Gender: Female Name: en-GB-MaisieNeural Gender: Female Name: en-GB-RyanNeural Gender: Male Name: en-GB-SoniaNeural Gender: Female Name: en-GB-ThomasNeural Gender: Male Name: en-HK-SamNeural Gender: Male Name: en-HK-YanNeural Gender: Female Name: en-IE-ConnorNeural Gender: Male Name: en-IE-EmilyNeural Gender: Female Name: en-IN-NeerjaExpressiveNeural Gender: Female Name: en-IN-NeerjaNeural Gender: Female Name: en-IN-PrabhatNeural Gender: Male Name: en-KE-AsiliaNeural Gender: Female Name: en-KE-ChilembaNeural Gender: Male Name: en-NG-AbeoNeural Gender: Male Name: en-NG-EzinneNeural Gender: Female Name: en-NZ-MitchellNeural Gender: Male Name: en-NZ-MollyNeural Gender: Female Name: en-PH-JamesNeural Gender: Male Name: en-PH-RosaNeural Gender: Female Name: en-SG-LunaNeural Gender: Female Name: en-SG-WayneNeural Gender: Male Name: en-TZ-ElimuNeural Gender: Male Name: en-TZ-ImaniNeural Gender: Female Name: en-US-AnaNeural Gender: Female Name: en-US-AndrewNeural 
Gender: Male Name: en-US-AriaNeural Gender: Female Name: en-US-AvaNeural Gender: Female Name: en-US-BrianNeural Gender: Male Name: en-US-ChristopherNeural Gender: Male Name: en-US-EmmaNeural Gender: Female Name: en-US-EricNeural Gender: Male Name: en-US-GuyNeural Gender: Male Name: en-US-JennyNeural Gender: Female Name: en-US-MichelleNeural Gender: Female Name: en-US-RogerNeural Gender: Male Name: en-US-SteffanNeural Gender: Male Name: en-ZA-LeahNeural Gender: Female Name: en-ZA-LukeNeural Gender: Male Name: es-AR-ElenaNeural Gender: Female Name: es-AR-TomasNeural Gender: Male Name: es-BO-MarceloNeural Gender: Male Name: es-BO-SofiaNeural Gender: Female Name: es-CL-CatalinaNeural Gender: Female Name: es-CL-LorenzoNeural Gender: Male Name: es-CO-GonzaloNeural Gender: Male Name: es-CO-SalomeNeural Gender: Female Name: es-CR-JuanNeural Gender: Male Name: es-CR-MariaNeural Gender: Female Name: es-CU-BelkysNeural Gender: Female Name: es-CU-ManuelNeural Gender: Male Name: es-DO-EmilioNeural Gender: Male Name: es-DO-RamonaNeural Gender: Female Name: es-EC-AndreaNeural Gender: Female Name: es-EC-LuisNeural Gender: Male Name: es-ES-AlvaroNeural Gender: Male Name: es-ES-ElviraNeural Gender: Female Name: es-ES-XimenaNeural Gender: Female Name: es-GQ-JavierNeural Gender: Male Name: es-GQ-TeresaNeural Gender: Female Name: es-GT-AndresNeural Gender: Male Name: es-GT-MartaNeural Gender: Female Name: es-HN-CarlosNeural Gender: Male Name: es-HN-KarlaNeural Gender: Female Name: es-MX-DaliaNeural Gender: Female Name: es-MX-JorgeNeural Gender: Male Name: es-NI-FedericoNeural Gender: Male Name: es-NI-YolandaNeural Gender: Female Name: es-PA-MargaritaNeural Gender: Female Name: es-PA-RobertoNeural Gender: Male Name: es-PE-AlexNeural Gender: Male Name: es-PE-CamilaNeural Gender: Female Name: es-PR-KarinaNeural Gender: Female Name: es-PR-VictorNeural Gender: Male Name: es-PY-MarioNeural Gender: Male Name: es-PY-TaniaNeural Gender: Female Name: es-SV-LorenaNeural Gender: Female Name: 
es-SV-RodrigoNeural Gender: Male Name: es-US-AlonsoNeural Gender: Male Name: es-US-PalomaNeural Gender: Female Name: es-UY-MateoNeural Gender: Male Name: es-UY-ValentinaNeural Gender: Female Name: es-VE-PaolaNeural Gender: Female Name: es-VE-SebastianNeural Gender: Male Name: et-EE-AnuNeural Gender: Female Name: et-EE-KertNeural Gender: Male Name: fa-IR-DilaraNeural Gender: Female Name: fa-IR-FaridNeural Gender: Male Name: fi-FI-HarriNeural Gender: Male Name: fi-FI-NooraNeural Gender: Female Name: fil-PH-AngeloNeural Gender: Male Name: fil-PH-BlessicaNeural Gender: Female Name: fr-BE-CharlineNeural Gender: Female Name: fr-BE-GerardNeural Gender: Male Name: fr-CA-AntoineNeural Gender: Male Name: fr-CA-JeanNeural Gender: Male Name: fr-CA-SylvieNeural Gender: Female Name: fr-CA-ThierryNeural Gender: Male Name: fr-CH-ArianeNeural Gender: Female Name: fr-CH-FabriceNeural Gender: Male Name: fr-FR-DeniseNeural Gender: Female Name: fr-FR-EloiseNeural Gender: Female Name: fr-FR-HenriNeural Gender: Male Name: fr-FR-RemyMultilingualNeural Gender: Male Name: fr-FR-VivienneMultilingualNeural Gender: Female Name: ga-IE-ColmNeural Gender: Male Name: ga-IE-OrlaNeural Gender: Female Name: gl-ES-RoiNeural Gender: Male Name: gl-ES-SabelaNeural Gender: Female Name: gu-IN-DhwaniNeural Gender: Female Name: gu-IN-NiranjanNeural Gender: Male Name: he-IL-AvriNeural Gender: Male Name: he-IL-HilaNeural Gender: Female Name: hi-IN-MadhurNeural Gender: Male Name: hi-IN-SwaraNeural Gender: Female Name: hr-HR-GabrijelaNeural Gender: Female Name: hr-HR-SreckoNeural Gender: Male Name: hu-HU-NoemiNeural Gender: Female Name: hu-HU-TamasNeural Gender: Male Name: id-ID-ArdiNeural Gender: Male Name: id-ID-GadisNeural Gender: Female Name: is-IS-GudrunNeural Gender: Female Name: is-IS-GunnarNeural Gender: Male Name: it-IT-DiegoNeural Gender: Male Name: it-IT-ElsaNeural Gender: Female Name: it-IT-GiuseppeNeural Gender: Male Name: it-IT-IsabellaNeural Gender: Female Name: ja-JP-KeitaNeural Gender: Male 
Name: ja-JP-NanamiNeural Gender: Female Name: jv-ID-DimasNeural Gender: Male Name: jv-ID-SitiNeural Gender: Female Name: ka-GE-EkaNeural Gender: Female Name: ka-GE-GiorgiNeural Gender: Male Name: kk-KZ-AigulNeural Gender: Female Name: kk-KZ-DauletNeural Gender: Male Name: km-KH-PisethNeural Gender: Male Name: km-KH-SreymomNeural Gender: Female Name: kn-IN-GaganNeural Gender: Male Name: kn-IN-SapnaNeural Gender: Female Name: ko-KR-HyunsuNeural Gender: Male Name: ko-KR-InJoonNeural Gender: Male Name: ko-KR-SunHiNeural Gender: Female Name: lo-LA-ChanthavongNeural Gender: Male Name: lo-LA-KeomanyNeural Gender: Female Name: lt-LT-LeonasNeural Gender: Male Name: lt-LT-OnaNeural Gender: Female Name: lv-LV-EveritaNeural Gender: Female Name: lv-LV-NilsNeural Gender: Male Name: mk-MK-AleksandarNeural Gender: Male Name: mk-MK-MarijaNeural Gender: Female Name: ml-IN-MidhunNeural Gender: Male Name: ml-IN-SobhanaNeural Gender: Female Name: mn-MN-BataaNeural Gender: Male Name: mn-MN-YesuiNeural Gender: Female Name: mr-IN-AarohiNeural Gender: Female Name: mr-IN-ManoharNeural Gender: Male Name: ms-MY-OsmanNeural Gender: Male Name: ms-MY-YasminNeural Gender: Female Name: mt-MT-GraceNeural Gender: Female Name: mt-MT-JosephNeural Gender: Male Name: my-MM-NilarNeural Gender: Female Name: my-MM-ThihaNeural Gender: Male Name: nb-NO-FinnNeural Gender: Male Name: nb-NO-PernilleNeural Gender: Female Name: ne-NP-HemkalaNeural Gender: Female Name: ne-NP-SagarNeural Gender: Male Name: nl-BE-ArnaudNeural Gender: Male Name: nl-BE-DenaNeural Gender: Female Name: nl-NL-ColetteNeural Gender: Female Name: nl-NL-FennaNeural Gender: Female Name: nl-NL-MaartenNeural Gender: Male Name: pl-PL-MarekNeural Gender: Male Name: pl-PL-ZofiaNeural Gender: Female Name: ps-AF-GulNawazNeural Gender: Male Name: ps-AF-LatifaNeural Gender: Female Name: pt-BR-AntonioNeural Gender: Male Name: pt-BR-FranciscaNeural Gender: Female Name: pt-BR-ThalitaNeural Gender: Female Name: pt-PT-DuarteNeural Gender: Male Name: 
pt-PT-RaquelNeural Gender: Female Name: ro-RO-AlinaNeural Gender: Female Name: ro-RO-EmilNeural Gender: Male Name: ru-RU-DmitryNeural Gender: Male Name: ru-RU-SvetlanaNeural Gender: Female Name: si-LK-SameeraNeural Gender: Male Name: si-LK-ThiliniNeural Gender: Female Name: sk-SK-LukasNeural Gender: Male Name: sk-SK-ViktoriaNeural Gender: Female Name: sl-SI-PetraNeural Gender: Female Name: sl-SI-RokNeural Gender: Male Name: so-SO-MuuseNeural Gender: Male Name: so-SO-UbaxNeural Gender: Female Name: sq-AL-AnilaNeural Gender: Female Name: sq-AL-IlirNeural Gender: Male Name: sr-RS-NicholasNeural Gender: Male Name: sr-RS-SophieNeural Gender: Female Name: su-ID-JajangNeural Gender: Male Name: su-ID-TutiNeural Gender: Female Name: sv-SE-MattiasNeural Gender: Male Name: sv-SE-SofieNeural Gender: Female Name: sw-KE-RafikiNeural Gender: Male Name: sw-KE-ZuriNeural Gender: Female Name: sw-TZ-DaudiNeural Gender: Male Name: sw-TZ-RehemaNeural Gender: Female Name: ta-IN-PallaviNeural Gender: Female Name: ta-IN-ValluvarNeural Gender: Male Name: ta-LK-KumarNeural Gender: Male Name: ta-LK-SaranyaNeural Gender: Female Name: ta-MY-KaniNeural Gender: Female Name: ta-MY-SuryaNeural Gender: Male Name: ta-SG-AnbuNeural Gender: Male Name: ta-SG-VenbaNeural Gender: Female Name: te-IN-MohanNeural Gender: Male Name: te-IN-ShrutiNeural Gender: Female Name: th-TH-NiwatNeural Gender: Male Name: th-TH-PremwadeeNeural Gender: Female Name: tr-TR-AhmetNeural Gender: Male Name: tr-TR-EmelNeural Gender: Female Name: uk-UA-OstapNeural Gender: Male Name: uk-UA-PolinaNeural Gender: Female Name: ur-IN-GulNeural Gender: Female Name: ur-IN-SalmanNeural Gender: Male Name: ur-PK-AsadNeural Gender: Male Name: ur-PK-UzmaNeural Gender: Female Name: uz-UZ-MadinaNeural Gender: Female Name: uz-UZ-SardorNeural Gender: Male Name: vi-VN-HoaiMyNeural Gender: Female Name: vi-VN-NamMinhNeural Gender: Male Name: zh-CN-XiaoxiaoNeural Gender: Female Name: zh-CN-XiaoyiNeural Gender: Female Name: zh-CN-YunjianNeural Gender: 
Male Name: zh-CN-YunxiNeural Gender: Male Name: zh-CN-YunxiaNeural Gender: Male Name: zh-CN-YunyangNeural Gender: Male Name: zh-CN-liaoning-XiaobeiNeural Gender: Female Name: zh-CN-shaanxi-XiaoniNeural Gender: Female Name: zh-HK-HiuGaaiNeural Gender: Female Name: zh-HK-HiuMaanNeural Gender: Female Name: zh-HK-WanLungNeural Gender: Male Name: zh-TW-HsiaoChenNeural Gender: Female Name: zh-TW-HsiaoYuNeural Gender: Female Name: zh-TW-YunJheNeural Gender: Male Name: zu-ZA-ThandoNeural Gender: Female Name: zu-ZA-ThembaNeural Gender: Male ================================================ FILE: project_version ================================================ 0.7.6 ================================================ FILE: requirements.txt ================================================ # 核心依赖 requests>=2.32.0 moviepy==2.1.1 edge-tts==7.2.7 streamlit>=1.45.0 watchdog==6.0.0 loguru>=0.7.3 tomli>=2.2.1 tomli-w>=1.0.0 pydub==0.25.1 pysrt==1.1.2 # AI 服务依赖 openai>=1.77.0 litellm>=1.70.0 # 统一的 LLM 接口,支持 100+ providers google-generativeai>=0.8.5 # LiteLLM 会使用此库调用 Gemini azure-cognitiveservices-speech>=1.37.0 tencentcloud-sdk-python>=3.0.1200 dashscope>=1.24.6 # 图像处理依赖 Pillow>=10.3.0 # 进度条和重试机制 tqdm>=4.66.6 tenacity>=9.0.0 # 可选依赖(根据功能需要) # 如果需要本地语音识别,取消注释下面的行 # faster-whisper>=1.0.1 # 如果需要 OpenCV 图像处理,取消注释下面的行 # opencv-python>=4.11.0.86 # 如果需要 CUDA 支持,取消注释下面的行 # torch>=2.0.0 # torchvision>=0.15.0 # torchaudio>=2.0.0 ================================================ FILE: resource/fonts/fonts_in_here.txt ================================================ 此处放字体文件 ================================================ FILE: resource/public/index.html ================================================ NarratoAI

NarratoAI

项目地址:https://github.com/linyqh/NarratoAI
webui 地址:http://127.0.0.1:8501
api swagger 地址:http://127.0.0.1:8080/docs

NarratoAI 是一个自动化影视解说工具,基于LLM实现文案撰写、自动化视频剪辑、配音和字幕生成的一站式流程,助力高效内容创作。

NarratoAI is an automated film and TV narration tool. Built on LLMs, it provides a one-stop workflow covering script writing, automated video editing, voice-over, and subtitle generation, enabling efficient content creation.

================================================ FILE: resource/scripts/script_in_here.txt ================================================ ================================================ FILE: resource/songs/song_in_here.txt ================================================ ================================================ FILE: resource/srt/srt_in_here.txt ================================================ ================================================ FILE: resource/videos/video_in_here.txt ================================================ ================================================ FILE: webui/__init__.py ================================================ """ NarratoAI WebUI Package """ from webui.config.settings import config from webui.components import ( basic_settings, video_settings, audio_settings, subtitle_settings ) from webui.utils import cache, file_utils __all__ = [ 'config', 'basic_settings', 'video_settings', 'audio_settings', 'subtitle_settings', 'cache', 'file_utils' ] ================================================ FILE: webui/components/__init__.py ================================================ from .basic_settings import render_basic_settings from .script_settings import render_script_panel from .video_settings import render_video_panel from .audio_settings import render_audio_panel from .subtitle_settings import render_subtitle_panel __all__ = [ 'render_basic_settings', 'render_script_panel', 'render_video_panel', 'render_audio_panel', 'render_subtitle_panel' ] ================================================ FILE: webui/components/audio_settings.py ================================================ import streamlit as st import os from uuid import uuid4 from app.config import config from app.services import voice from app.models.schema import AudioVolumeDefaults from app.utils import utils from webui.utils.cache import get_songs_cache def get_soulvoice_voices(): """获取 SoulVoice 语音列表""" # 检查是否配置了 SoulVoice API key api_key = 
def is_valid_azure_voice_name(voice_name: str) -> bool:
    """Return True if ``voice_name`` has the shape of an Azure neural voice name.

    Accepted shape: ``<lang>[-<Script>]-<REGION>[-<dialect>]-<Name>Neural``, e.g.
    ``zh-CN-YunzeNeural``, ``en-US-AvaMultilingualNeural``, ``fil-PH-AngeloNeural``
    or ``zh-CN-liaoning-XiaobeiNeural``. Surrounding whitespace is ignored.
    Only the format is checked, not whether the voice actually exists.

    Args:
        voice_name: Candidate voice name; any non-string or empty value is invalid.

    Returns:
        bool: True when the name matches the expected format.
    """
    if not isinstance(voice_name, str) or not voice_name:
        return False

    import re

    pattern = (
        r'[a-z]{2,3}'            # language: 2 or 3 lowercase letters (zh, fil)
        r'(?:-[A-Z][a-z]{3})?'   # optional script subtag (Latn)
        r'-[A-Z]{2}'             # region (CN, US)
        r'(?:-[a-z]+)?'          # optional dialect segment (liaoning, shaanxi)
        r'-\w+Neural'            # voice name ending in "Neural"
    )
    return re.fullmatch(pattern, voice_name.strip()) is not None
TTS引擎选择器 # st.subheader("🎤 TTS引擎选择") engine_options = get_tts_engine_options() engine_descriptions = get_tts_engine_descriptions() # 获取保存的TTS引擎设置 saved_tts_engine = config.ui.get("tts_engine", "edge_tts") # 确保保存的引擎在可用选项中 if saved_tts_engine not in engine_options: saved_tts_engine = "edge_tts" # TTS引擎选择下拉框 selected_engine = st.selectbox( "选择TTS引擎", options=list(engine_options.keys()), format_func=lambda x: engine_options[x], index=list(engine_options.keys()).index(saved_tts_engine), help="选择您要使用的文本转语音引擎" ) # 保存TTS引擎选择 config.ui["tts_engine"] = selected_engine st.session_state['tts_engine'] = selected_engine # 2. 显示引擎详细说明 if selected_engine in engine_descriptions: desc = engine_descriptions[selected_engine] with st.expander(f"📋 {desc['title']} 详细说明", expanded=True): st.markdown(f"**特点:** {desc['features']}") st.markdown(f"**适用场景:** {desc['use_case']}") if desc['registration']: st.markdown(f"**注册地址:** [{desc['registration']}]({desc['registration']})") # 3. 根据选择的引擎渲染对应的配置界面 # st.subheader("⚙️ 引擎配置") if selected_engine == "edge_tts": render_edge_tts_settings(tr) elif selected_engine == "azure_speech": render_azure_speech_settings(tr) elif selected_engine == "soulvoice": render_soulvoice_engine_settings(tr) elif selected_engine == "tencent_tts": render_tencent_tts_settings(tr) elif selected_engine == "qwen3_tts": render_qwen3_tts_settings(tr) elif selected_engine == "indextts2": render_indextts2_tts_settings(tr) # 4. 
试听功能 render_voice_preview_new(tr, selected_engine) def render_edge_tts_settings(tr): """渲染 Edge TTS 引擎设置""" # 获取支持的语音列表 support_locales = ["zh-CN", "en-US"] all_voices = voice.get_all_azure_voices(filter_locals=support_locales) # 只保留标准版本的语音(Edge TTS专用,不包含V2) edge_voices = [v for v in all_voices if "-V2" not in v] # 创建友好的显示名称 friendly_names = {} for v in edge_voices: friendly_names[v] = v.replace("Female", tr("Female")).replace("Male", tr("Male")).replace("Neural", "") # 获取保存的语音设置 saved_voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoxiaoNeural-Female") # 确保保存的音色在可用列表中 if saved_voice_name not in friendly_names: # 选择与UI语言匹配的第一个语音 for v in edge_voices: if v.lower().startswith(st.session_state.get("ui_language", "zh-CN").lower()): saved_voice_name = v break else: # 如果没找到匹配的,使用第一个 saved_voice_name = edge_voices[0] if edge_voices else "" # 音色选择下拉框(Edge TTS音色相对较少,保留下拉框) selected_friendly_name = st.selectbox( "音色选择", options=list(friendly_names.values()), index=list(friendly_names.keys()).index(saved_voice_name) if saved_voice_name in friendly_names else 0, help="选择Edge TTS音色" ) # 获取实际的语音名称 voice_name = list(friendly_names.keys())[ list(friendly_names.values()).index(selected_friendly_name) ] # 显示音色信息 with st.expander("💡 Edge TTS 音色说明", expanded=False): st.write("**中文音色:**") zh_voices = [v for v in edge_voices if v.startswith("zh-CN")] for v in zh_voices: gender = "女声" if "Female" in v else "男声" name = v.replace("-Female", "").replace("-Male", "").replace("zh-CN-", "").replace("Neural", "") st.write(f"• {name} ({gender})") st.write("") st.write("**英文音色:**") en_voices = [v for v in edge_voices if v.startswith("en-US")][:5] # 只显示前5个 for v in en_voices: gender = "女声" if "Female" in v else "男声" name = v.replace("-Female", "").replace("-Male", "").replace("en-US-", "").replace("Neural", "") st.write(f"• {name} ({gender})") if len([v for v in edge_voices if v.startswith("en-US")]) > 5: st.write("• ... 
def render_azure_speech_settings(tr):
    """Render the Azure Speech Services engine settings.

    Collects region, API key, voice name, volume, rate and pitch from
    Streamlit widgets and writes them to ``config.azure`` / ``config.ui``
    and ``st.session_state``.

    Args:
        tr: Translation callable (unused here; kept for a uniform signature
            across the engine renderers).
    """
    # Service region
    azure_speech_region = st.text_input(
        "服务区域",
        value=config.azure.get("speech_region", ""),
        placeholder="例如:eastus",
        help="Azure Speech Services 服务区域,如:eastus, westus2, eastasia 等"
    )

    # API key
    azure_speech_key = st.text_input(
        "API Key",
        value=config.azure.get("speech_key", ""),
        type="password",
        help="Azure Speech Services API 密钥"
    )

    # Persist the Azure credentials
    config.azure["speech_region"] = azure_speech_region
    config.azure["speech_key"] = azure_speech_key

    # Voice name (free text, pre-filled with the saved value)
    saved_voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
    voice_name = st.text_input(
        "音色名称",
        value=saved_voice_name,
        help="输入Azure Speech Services音色名称,直接使用官方音色名称即可。例如:zh-CN-YunzeNeural",
        placeholder="zh-CN-YunzeNeural"
    )

    # Reference list of commonly used voices
    with st.expander("💡 常用音色参考", expanded=False):
        st.write("**中文音色:**")
        st.write("• zh-CN-XiaoxiaoMultilingualNeural (女声,多语言)")
        st.write("• zh-CN-YunzeNeural (男声)")
        st.write("• zh-CN-YunxiNeural (男声)")
        st.write("• zh-CN-XiaochenNeural (女声)")
        st.write("")
        st.write("**英文音色:**")
        st.write("• en-US-AndrewMultilingualNeural (男声,多语言)")
        st.write("• en-US-AvaMultilingualNeural (女声,多语言)")
        st.write("• en-US-BrianMultilingualNeural (男声,多语言)")
        st.write("• en-US-EmmaMultilingualNeural (女声,多语言)")
        st.write("")
        st.info("💡 更多音色请参考 [Azure Speech Services 官方文档](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support)")

    # Quick-select buttons
    st.write("**快速选择:**")
    quick_presets = (
        ("中文女声", "zh-CN-XiaoxiaoMultilingualNeural"),
        ("中文男声", "zh-CN-YunzeNeural"),
        ("英文女声", "en-US-AvaMultilingualNeural"),
    )
    for column, (label, preset) in zip(st.columns(3), quick_presets):
        with column:
            if st.button(label, help=preset):
                # st.rerun() aborts this run immediately, so the choice must
                # be persisted *before* rerunning; otherwise it is lost and
                # the button has no effect.
                config.ui["azure_voice_name"] = preset
                config.ui["voice_name"] = preset
                st.rerun()

    # Validate the voice name and show the result
    if voice_name.strip():
        if is_valid_azure_voice_name(voice_name):
            st.success(f"✅ 音色名称有效: {voice_name}")
        else:
            st.warning(f"⚠️ 音色名称格式可能不正确: {voice_name}")
            st.info("💡 Azure音色名称通常格式为: [语言]-[地区]-[名称]Neural")

    # Persist the voice name
    config.ui["azure_voice_name"] = voice_name
    config.ui["voice_name"] = voice_name  # backward compatibility

    # Volume (stored as 0-100 in config, 0.0-1.0 in session state)
    voice_volume = st.slider(
        "音量调节",
        min_value=0,
        max_value=100,
        value=int(config.ui.get("azure_volume", 80)),
        step=1,
        help="调节语音音量 (0-100)"
    )
    config.ui["azure_volume"] = voice_volume
    st.session_state['voice_volume'] = voice_volume / 100.0

    # Rate. float() guards against an int stored in the config file, which
    # Streamlit would reject next to the float min/max/step values.
    voice_rate = st.slider(
        "语速调节",
        min_value=0.5,
        max_value=2.0,
        value=float(config.ui.get("azure_rate", 1.0)),
        step=0.1,
        help="调节语音速度 (0.5-2.0倍速)"
    )
    config.ui["azure_rate"] = voice_rate
    st.session_state['voice_rate'] = voice_rate

    # Pitch (percent offset in config, ratio in session state)
    voice_pitch = st.slider(
        "语调调节",
        min_value=-50,
        max_value=50,
        value=int(config.ui.get("azure_pitch", 0)),
        step=5,
        help="调节语音音调 (-50%到+50%)"
    )
    config.ui["azure_pitch"] = voice_pitch
    st.session_state['voice_pitch'] = 1.0 + (voice_pitch / 100.0)

    # Configuration status
    if azure_speech_region and azure_speech_key:
        st.success("✅ Azure Speech Services 配置已设置")
    elif not azure_speech_region:
        st.warning("⚠️ 请配置服务区域")
    elif not azure_speech_key:
        st.warning("⚠️ 请配置 API Key")
"""渲染腾讯云 TTS 引擎设置""" # Secret ID 输入 secret_id = st.text_input( "Secret ID", value=config.tencent.get("secret_id", ""), help="请输入您的腾讯云 Secret ID" ) # Secret Key 输入 secret_key = st.text_input( "Secret Key", value=config.tencent.get("secret_key", ""), type="password", help="请输入您的腾讯云 Secret Key" ) # 地域选择 region_options = [ "ap-beijing", "ap-shanghai", "ap-guangzhou", "ap-chengdu", "ap-nanjing", "ap-singapore", "ap-hongkong" ] saved_region = config.tencent.get("region", "ap-beijing") if saved_region not in region_options: region_options.append(saved_region) region = st.selectbox( "服务地域", options=region_options, index=region_options.index(saved_region), help="选择腾讯云 TTS 服务地域" ) # 音色选择 voice_type_options = { "101001": "智瑜 - 女声(推荐)", "101002": "智聆 - 女声", "101003": "智美 - 女声", "101004": "智云 - 男声", "101005": "智莉 - 女声", "101006": "智言 - 男声", "101007": "智娜 - 女声", "101008": "智琪 - 女声", "101009": "智芸 - 女声", "101010": "智华 - 男声", "101011": "智燕 - 女声", "101012": "智丹 - 女声", "101013": "智辉 - 男声", "101014": "智宁 - 女声", "101015": "智萌 - 女声", "101016": "智甜 - 女声", "101017": "智蓉 - 女声", "101018": "智靖 - 男声" } saved_voice_type = config.ui.get("tencent_voice_type", "101001") if saved_voice_type not in voice_type_options: voice_type_options[saved_voice_type] = f"自定义音色 ({saved_voice_type})" selected_voice_display = st.selectbox( "音色选择", options=list(voice_type_options.values()), index=list(voice_type_options.keys()).index(saved_voice_type), help="选择腾讯云 TTS 音色" ) # 获取实际的音色ID voice_type = list(voice_type_options.keys())[ list(voice_type_options.values()).index(selected_voice_display) ] # 语速调节 voice_rate = st.slider( "语速调节", min_value=0.5, max_value=2.0, value=config.ui.get("tencent_rate", 1.0), step=0.1, help="调节语音速度 (0.5-2.0)" ) config.ui["voice_name"] = saved_voice_type # 兼容性 # 显示音色说明 with st.expander("💡 腾讯云 TTS 音色说明", expanded=False): st.write("**女声音色:**") female_voices = [(k, v) for k, v in voice_type_options.items() if "女声" in v] for voice_id, voice_desc in female_voices[:6]: # 显示前6个 st.write(f"• 
def render_qwen3_tts_settings(tr):
    """Render the Qwen3 TTS settings.

    Collects API key, model name, voice and speech rate from Streamlit
    widgets and writes them to ``config.tts_qwen`` / ``config.ui``.

    Args:
        tr: Translation callable (unused here; kept for a uniform signature
            across the engine renderers).
    """
    api_key = st.text_input(
        "API Key",
        value=config.tts_qwen.get("api_key", ""),
        type="password",
        help="通义千问 DashScope API Key"
    )
    model_name = st.text_input(
        "模型名称",
        value=config.tts_qwen.get("model_name", "qwen3-tts-flash"),
        help="Qwen TTS 模型名,例如 qwen3-tts-flash"
    )

    # Qwen3 TTS voices: display name -> API parameter
    voice_options = {
        "芊悦": "Cherry",
        "晨煦": "Ethan",
        "不吃鱼": "Nofish",
        "詹妮弗": "Jennifer",
        "甜茶": "Ryan",
        "卡捷琳娜": "Katerina",
        "墨讲师": "Elias",
        "上海-阿珍": "Jada",
        "北京-晓东": "Dylan",
        "四川-晴儿": "Sunny",
        "南京-老李": "Li",
        "陕西-秦川": "Marcus",
        "闽南-阿杰": "Roy",
        "天津-李彼得": "Peter",
        "粤语-阿强": "Rocky",
        "粤语-阿清": "Kiki",
        "四川-程川": "Eric"
    }

    saved_voice_param = config.ui.get("qwen_voice_type", "Cherry") or "Cherry"

    # Map the saved API parameter back to its display name.
    saved_display_name = next(
        (name for name, param in voice_options.items() if param == saved_voice_param),
        None,
    )
    if saved_display_name is None:
        # The saved voice is not in the built-in table: expose it as a custom
        # entry instead of silently resetting the user's choice to the default.
        saved_display_name = f"自定义音色 ({saved_voice_param})"
        voice_options[saved_display_name] = saved_voice_param

    display_names = list(voice_options.keys())
    selected_display_name = st.selectbox(
        "音色选择",
        options=display_names,
        index=display_names.index(saved_display_name),
        help="选择Qwen3 TTS音色"
    )

    # Resolve the API parameter for the selected display name
    voice_type = voice_options.get(selected_display_name, "Cherry")

    # Restore the saved rate (clamped to the slider range) instead of always
    # starting at 1.0, so the value persisted below survives a reload.
    saved_rate = min(2.0, max(0.5, float(config.ui.get("qwen3_rate", 1.0))))
    voice_rate = st.slider(
        "语速调节",
        min_value=0.5,
        max_value=2.0,
        value=saved_rate,
        step=0.1,
        help="调节语音速度 (0.5-2.0)"
    )

    # Persist configuration
    config.tts_qwen["api_key"] = api_key
    config.tts_qwen["model_name"] = model_name
    config.ui["qwen_voice_type"] = voice_type
    config.ui["qwen3_rate"] = voice_rate
    config.ui["voice_name"] = voice_type  # backward compatibility
def render_voice_preview_new(tr, selected_engine):
    """Render the voice preview button for the selected TTS engine.

    When the button is pressed, the voice name, rate and pitch are read from
    the configuration of ``selected_engine``, a fixed sample sentence is
    synthesised into a temporary MP3 and played back. The temporary file is
    removed afterwards whether or not synthesis succeeded.

    Args:
        tr: Translation callable (unused here).
        selected_engine: Engine key such as ``"edge_tts"`` or ``"qwen3_tts"``.
    """
    if not st.button("🎵 试听语音合成", use_container_width=True):
        return

    play_content = "感谢关注 NarratoAI,有任何问题或建议,可以关注微信公众号,求助或讨论"

    # Resolve the voice parameters for the selected engine. Engines that do
    # not set a rate or pitch below keep the neutral value 1.0.
    voice_name = ""
    voice_rate = 1.0
    voice_pitch = 1.0

    if selected_engine == "edge_tts":
        voice_name = config.ui.get("edge_voice_name", "zh-CN-XiaoyiNeural-Female")
        voice_rate = config.ui.get("edge_rate", 1.0)
        voice_pitch = 1.0 + (config.ui.get("edge_pitch", 0) / 100.0)
    elif selected_engine == "azure_speech":
        voice_name = config.ui.get("azure_voice_name", "zh-CN-XiaoxiaoMultilingualNeural")
        voice_rate = config.ui.get("azure_rate", 1.0)
        voice_pitch = 1.0 + (config.ui.get("azure_pitch", 0) / 100.0)
    elif selected_engine == "soulvoice":
        voice_uri = config.soulvoice.get("voice_uri", "")
        if voice_uri:
            # Anything not already prefixed with "soulvoice:" gets the prefix
            # (this includes "speech:..." URIs, as before).
            voice_name = voice_uri if voice_uri.startswith("soulvoice:") else f"soulvoice:{voice_uri}"
    elif selected_engine == "tencent_tts":
        voice_type = config.ui.get("tencent_voice_type", "101001")
        voice_name = f"tencent:{voice_type}"
        voice_rate = config.ui.get("tencent_rate", 1.0)
    elif selected_engine == "qwen3_tts":
        vt = config.ui.get("qwen_voice_type", "Cherry")
        voice_name = f"qwen3:{vt}"
        voice_rate = config.ui.get("qwen3_rate", 1.0)
    elif selected_engine == "indextts2":
        reference_audio = config.indextts2.get("reference_audio", "")
        if reference_audio:
            voice_name = f"indextts2:{reference_audio}"

    if not voice_name:
        st.error("请先配置语音设置")
        return

    temp_dir = utils.storage_dir("temp", create=True)
    audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3")
    try:
        with st.spinner("正在合成语音..."):
            sub_maker = voice.tts(
                text=play_content,
                voice_name=voice_name,
                voice_rate=voice_rate,
                voice_pitch=voice_pitch,
                voice_file=audio_file,
                tts_engine=st.session_state.get('tts_engine')
            )

        if sub_maker and os.path.exists(audio_file):
            st.success("✅ 语音合成成功!")
            # Read the bytes up front so the file can be deleted right away.
            with open(audio_file, 'rb') as audio_file_obj:
                audio_bytes = audio_file_obj.read()
            st.audio(audio_bytes, format='audio/mp3')
        else:
            st.error("❌ 语音合成失败,请检查配置")
    finally:
        # Best-effort cleanup on success and failure alike. Only filesystem
        # errors are ignored; a bare ``except`` would also swallow
        # BaseException subclasses such as KeyboardInterrupt.
        try:
            os.remove(audio_file)
        except OSError:
            pass
# 音量 - 使用统一的默认值 voice_volume = st.slider( tr("Speech Volume"), min_value=AudioVolumeDefaults.MIN_VOLUME, max_value=AudioVolumeDefaults.MAX_VOLUME, value=AudioVolumeDefaults.VOICE_VOLUME, step=0.01, help=tr("Adjust the volume of the original audio") ) st.session_state['voice_volume'] = voice_volume # 检查是否为 SoulVoice 引擎 is_soulvoice = voice.is_soulvoice_voice(voice_name) # 语速 if is_soulvoice: # SoulVoice 支持更精细的语速控制 voice_rate = st.slider( tr("Speech Rate"), min_value=0.5, max_value=2.0, value=1.0, step=0.1, help="SoulVoice 语音速度控制" ) else: # Azure TTS 使用预设选项 voice_rate = st.selectbox( tr("Speech Rate"), options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0], index=2, ) st.session_state['voice_rate'] = voice_rate # 音调 - SoulVoice 不支持音调调节 if not is_soulvoice: voice_pitch = st.selectbox( tr("Speech Pitch"), options=[0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0], index=2, ) st.session_state['voice_pitch'] = voice_pitch else: # SoulVoice 不支持音调调节,设置默认值 st.session_state['voice_pitch'] = 1.0 st.info("ℹ️ SoulVoice 引擎不支持音调调节") def render_voice_preview(tr, voice_name): """渲染语音试听功能""" if st.button(tr("Play Voice")): play_content = "感谢关注 NarratoAI,有任何问题或建议,可以关注微信公众号,求助或讨论" if not play_content: play_content = st.session_state.get('video_script', '') if not play_content: play_content = tr("Voice Example") with st.spinner(tr("Synthesizing Voice")): temp_dir = utils.storage_dir("temp", create=True) audio_file = os.path.join(temp_dir, f"tmp-voice-{str(uuid4())}.mp3") sub_maker = voice.tts( text=play_content, voice_name=voice_name, voice_rate=st.session_state.get('voice_rate', 1.0), voice_pitch=st.session_state.get('voice_pitch', 1.0), voice_file=audio_file, ) # 如果语音文件生成失败,使用默认内容重试 if not sub_maker: play_content = "This is a example voice. if you hear this, the voice synthesis failed with the original content." 
def get_audio_params():
    """Collect the current audio parameters into a single dict.

    ``voice_name`` is read from ``config.ui``; every other entry is read
    from ``st.session_state`` with the fallback listed below.
    """
    session_fallbacks = (
        ('voice_volume', AudioVolumeDefaults.VOICE_VOLUME),
        ('voice_rate', 1.0),
        ('voice_pitch', 1.0),
        ('bgm_type', 'random'),
        ('bgm_file', ''),
        ('bgm_volume', AudioVolumeDefaults.BGM_VOLUME),
        ('tts_engine', "edge_tts"),
    )

    params = {'voice_name': config.ui.get("voice_name", "")}
    state = st.session_state
    for key, fallback in session_fallbacks:
        params[key] = state.get(key, fallback)
    return params
def validate_base_url(base_url: str, provider: str) -> tuple[bool, str]:
    """Validate the format of an optional API base URL.

    An empty or whitespace-only value is valid because the base URL is
    optional. Otherwise the URL must use the ``http`` or ``https`` scheme
    (matched case-insensitively) and must contain a host.

    Args:
        base_url: URL entered by the user; may be empty or ``None``.
        provider: Provider label used as a prefix in the error message.

    Returns:
        ``(True, "")`` when valid, otherwise ``(False, error_message)``.
    """
    if not base_url or not base_url.strip():
        return True, ""  # the base URL is optional

    # Local import: this module does not import urllib at file level.
    from urllib.parse import urlsplit

    candidate = base_url.strip()

    # URL schemes are case-insensitive, so compare on a lowered copy.
    if not candidate.lower().startswith(("http://", "https://")):
        return False, f"{provider} Base URL必须以http://或https://开头"

    try:
        host = urlsplit(candidate).hostname
    except ValueError:
        # urlsplit raises on malformed netlocs such as an unclosed "[::1".
        host = None
    if not host:
        # A bare "http://" or "https:///v1" passes the prefix test but cannot
        # be used as an endpoint.
        return False, f"{provider} Base URL缺少主机地址"

    return True, ""
def show_config_validation_errors(errors: list):
    """Show each configuration validation error via ``st.error``.

    Does nothing when *errors* is empty or otherwise falsy.
    """
    for message in errors or ():
        st.error(message)
def test_vision_model_connection(api_key, base_url, model_name, provider, tr):
    """Test connectivity to a vision model.

    Three paths are supported, selected by ``provider`` (case-insensitive):
    ``gemini`` posts to the native ``generateContent`` endpoint,
    ``gemini(openai)`` posts to an OpenAI-compatible ``/chat/completions``
    endpoint, and anything else goes through the ``openai`` client with an
    image-plus-text prompt.

    Args:
        api_key: API key for the provider.
        base_url: Base URL of the API endpoint.
        model_name: Model identifier.
        provider: Provider name.
        tr: Translation callable used for the result messages.

    Returns:
        tuple[bool, str]: ``(success, message)``.
    """
    import requests

    # Never write the full secret to the log; a short prefix is enough to
    # tell keys apart while debugging.
    masked_key = f"{api_key[:4]}***" if api_key else "<empty>"
    logger.debug(f"大模型连通性测试: {base_url} 模型: {model_name} apikey: {masked_key}")

    provider_key = (provider or "").lower()

    if provider_key == 'gemini':
        # Native Gemini API
        try:
            request_data = {
                "contents": [{
                    "parts": [{"text": "直接回复我文本'当前网络可用'"}]
                }]
            }
            # Drop a trailing slash so the joined URL has no "//".
            api_base_url = base_url.rstrip('/')
            url = f"{api_base_url}/models/{model_name}:generateContent"
            response = requests.post(
                url,
                json=request_data,
                headers={
                    "x-goog-api-key": api_key,
                    "Content-Type": "application/json"
                },
                timeout=10
            )
            if response.status_code == 200:
                return True, tr("原生Gemini模型连接成功")
            return False, f"{tr('原生Gemini模型连接失败')}: HTTP {response.status_code}"
        except Exception as e:
            return False, f"{tr('原生Gemini模型连接失败')}: {str(e)}"

    if provider_key == 'gemini(openai)':
        # OpenAI-compatible Gemini proxy
        try:
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            test_url = f"{base_url.rstrip('/')}/chat/completions"
            test_data = {
                "model": model_name,
                "messages": [
                    {"role": "user", "content": "直接回复我文本'当前网络可用'"}
                ],
                "stream": False
            }
            response = requests.post(test_url, headers=headers, json=test_data, timeout=10)
            if response.status_code == 200:
                return True, tr("OpenAI兼容Gemini代理连接成功")
            return False, f"{tr('OpenAI兼容Gemini代理连接失败')}: HTTP {response.status_code}"
        except Exception as e:
            return False, f"{tr('OpenAI兼容Gemini代理连接失败')}: {str(e)}"

    # Any other provider: OpenAI client with a multimodal prompt
    from openai import OpenAI
    try:
        client = OpenAI(
            api_key=api_key,
            base_url=base_url,
        )
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system",
                    "content": [{"type": "text", "text": "You are a helpful assistant."}],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241022/emyrja/dog_and_girl.jpeg"
                            },
                        },
                        {"type": "text", "text": "回复我网络可用即可"},
                    ],
                },
            ],
        )
        if response and response.choices:
            return True, tr("QwenVL model is available")
        return False, tr("QwenVL model returned invalid response")
    except Exception as e:
        return False, f"{tr('QwenVL model is not available')}: {str(e)}"
API key 到环境变量 env_key_mapping = { "gemini": "GEMINI_API_KEY", "google": "GEMINI_API_KEY", "openai": "OPENAI_API_KEY", "qwen": "QWEN_API_KEY", "dashscope": "DASHSCOPE_API_KEY", "siliconflow": "SILICONFLOW_API_KEY", } env_var = env_key_mapping.get(provider.lower(), f"{provider.upper()}_API_KEY") old_key = os.environ.get(env_var) os.environ[env_var] = api_key # SiliconFlow 特殊处理:使用 OpenAI 兼容模式 test_model_name = model_name if provider.lower() == "siliconflow": # 替换 provider 为 openai if "/" in model_name: test_model_name = f"openai/{model_name.split('/', 1)[1]}" else: test_model_name = f"openai/{model_name}" # 确保设置了 base_url if not base_url: base_url = "https://api.siliconflow.cn/v1" # 设置 OPENAI_API_KEY (SiliconFlow 使用 OpenAI 协议) os.environ["OPENAI_API_KEY"] = api_key os.environ["OPENAI_API_BASE"] = base_url try: # 创建测试图片(64x64 白色像素,避免某些模型对极小图片的限制) test_image = Image.new('RGB', (64, 64), color='white') img_buffer = io.BytesIO() test_image.save(img_buffer, format='JPEG') img_bytes = img_buffer.getvalue() base64_image = base64.b64encode(img_bytes).decode('utf-8') # 构建测试请求 messages = [{ "role": "user", "content": [ {"type": "text", "text": "请直接回复'连接成功'"}, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base64_image}" } } ] }] # 准备参数 completion_kwargs = { "model": test_model_name, "messages": messages, "temperature": 0.1, "max_tokens": 50 } if base_url: completion_kwargs["api_base"] = base_url # 调用 LiteLLM(同步调用用于测试) response = litellm.completion(**completion_kwargs) if response and response.choices and len(response.choices) > 0: return True, f"LiteLLM 视觉模型连接成功 ({model_name})" else: return False, f"LiteLLM 视觉模型返回空响应" finally: # 恢复原始环境变量 if old_key: os.environ[env_var] = old_key else: os.environ.pop(env_var, None) # 清理临时设置的 OpenAI 环境变量 if provider.lower() == "siliconflow": os.environ.pop("OPENAI_API_KEY", None) os.environ.pop("OPENAI_API_BASE", None) except Exception as e: error_msg = str(e) logger.error(f"LiteLLM 视觉模型测试失败: {error_msg}") # 提供更友好的错误信息 if 
def test_litellm_text_model(api_key: str, base_url: str, model_name: str, tr) -> tuple[bool, str]:
    """Check connectivity of a LiteLLM text model with a single short completion.

    The API key is handed to LiteLLM through provider-specific environment
    variables for the duration of the call. Every variable touched here is
    restored to its previous value (or removed if it did not exist) afterwards,
    so a pre-existing ``OPENAI_API_KEY`` / ``OPENAI_API_BASE`` survives a
    SiliconFlow test.

    Args:
        api_key: API key for the provider.
        base_url: Optional base URL. SiliconFlow falls back to its public endpoint.
        model_name: Model name in LiteLLM format (``provider/model``).
        tr: Translation function (kept for interface parity; not used here).

    Returns:
        A ``(success, message)`` tuple.
    """
    try:
        import litellm
        import os

        # Never write a usable portion of the secret to the log.
        masked_key = f"{str(api_key)[:4]}***" if api_key else "<empty>"
        logger.debug(f"LiteLLM 文本模型连通性测试: model={model_name}, api_key={masked_key}, base_url={base_url}")

        # The provider is the prefix before the first "/".
        provider = model_name.split("/")[0] if "/" in model_name else "unknown"
        is_siliconflow = provider.lower() == "siliconflow"

        env_key_mapping = {
            "gemini": "GEMINI_API_KEY",
            "google": "GEMINI_API_KEY",
            "openai": "OPENAI_API_KEY",
            "qwen": "QWEN_API_KEY",
            "dashscope": "DASHSCOPE_API_KEY",
            "siliconflow": "SILICONFLOW_API_KEY",
            "deepseek": "DEEPSEEK_API_KEY",
            "moonshot": "MOONSHOT_API_KEY",
        }
        env_var = env_key_mapping.get(provider.lower(), f"{provider.upper()}_API_KEY")

        # Snapshot every variable we are about to overwrite so that the
        # ``finally`` clause can put the process environment back exactly.
        touched_vars = [env_var]
        if is_siliconflow:
            touched_vars += ["OPENAI_API_KEY", "OPENAI_API_BASE"]
        saved_env = {name: os.environ.get(name) for name in touched_vars}

        try:
            os.environ[env_var] = api_key

            test_model_name = model_name
            if is_siliconflow:
                # SiliconFlow speaks the OpenAI protocol: swap the provider
                # prefix. (provider == "siliconflow" implies "/" is present.)
                test_model_name = f"openai/{model_name.split('/', 1)[1]}"
                if not base_url:
                    base_url = "https://api.siliconflow.cn/v1"
                os.environ["OPENAI_API_KEY"] = api_key
                os.environ["OPENAI_API_BASE"] = base_url

            completion_kwargs = {
                "model": test_model_name,
                "messages": [{"role": "user", "content": "请直接回复'连接成功'"}],
                "temperature": 0.1,
                "max_tokens": 20,
            }
            if base_url:
                completion_kwargs["api_base"] = base_url

            # Synchronous call: this is only a connectivity probe.
            response = litellm.completion(**completion_kwargs)

            if response and response.choices:
                return True, f"LiteLLM 文本模型连接成功 ({model_name})"
            return False, "LiteLLM 文本模型返回空响应"
        finally:
            for name, previous in saved_env.items():
                if previous is None:
                    os.environ.pop(name, None)
                else:
                    os.environ[name] = previous

    except Exception as e:
        error_msg = str(e)
        logger.error(f"LiteLLM 文本模型测试失败: {error_msg}")

        # Map common failure signatures to friendlier messages.
        lowered = error_msg.lower()
        if "authentication" in lowered or "api_key" in lowered:
            return False, "认证失败,请检查 API Key 是否正确"
        elif "not found" in lowered or "404" in error_msg:
            return False, "模型不存在,请检查模型名称是否正确"
        elif "rate limit" in lowered:
            return False, "超出速率限制,请稍后重试"
        else:
            return False, f"连接失败: {error_msg}"
col2 = st.columns([1, 2]) with col1: selected_provider = st.selectbox( tr("Vision Model Provider"), options=LITELLM_PROVIDERS, index=LITELLM_PROVIDERS.index(current_provider) if current_provider in LITELLM_PROVIDERS else 0, key="vision_provider_select" ) with col2: model_name_input = st.text_input( tr("Vision Model Name"), value=current_model, help="输入模型名称(不包含 provider 前缀)\n\n" "常用示例:\n" "• gemini-2.0-flash-lite\n" "• gpt-4o\n" "• qwen-vl-max\n" "• Qwen/Qwen2.5-VL-32B-Instruct (SiliconFlow)\n\n" "支持 100+ providers,详见: https://docs.litellm.ai/docs/providers", key="vision_model_input" ) # 组合完整的模型名称 st_vision_model_name = f"{selected_provider}/{model_name_input}" if selected_provider and model_name_input else "" st_vision_api_key = st.text_input( tr("Vision API Key"), value=vision_api_key, type="password", help="对应 provider 的 API 密钥\n\n" "获取地址:\n" "• Gemini: https://makersuite.google.com/app/apikey\n" "• OpenAI: https://platform.openai.com/api-keys\n" "• Qwen: https://bailian.console.aliyun.com/\n" "• SiliconFlow: https://cloud.siliconflow.cn/account/ak" ) vision_base_help, vision_base_required, vision_placeholder = build_base_url_help( selected_provider, "视频分析模型" ) st_vision_base_url = st.text_input( tr("Vision Base URL"), value=vision_base_url, help=vision_base_help, placeholder=vision_placeholder or None ) if vision_base_required and not st_vision_base_url: info_example = vision_placeholder or "https://your-openai-compatible-endpoint/v1" st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}") # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_vision_connection"): test_errors = [] if not st_vision_api_key: test_errors.append("请先输入 API 密钥") if not model_name_input: test_errors.append("请先输入模型名称") if test_errors: for error in test_errors: st.error(error) else: with st.spinner(tr("Testing connection...")): try: success, message = test_litellm_vision_model( api_key=st_vision_api_key, base_url=st_vision_base_url, model_name=st_vision_model_name, tr=tr ) if success: 
def test_text_model_connection(api_key, base_url, model_name, provider, tr):
    """Probe a text model endpoint with a minimal request.

    Three request shapes are supported, selected by ``provider``
    (case-insensitive):

    * ``gemini`` - native Gemini ``generateContent`` call authenticated with
      the ``x-goog-api-key`` header.
    * ``gemini(openai)`` - OpenAI-compatible ``/chat/completions`` proxy.
    * anything else - generic OpenAI-compatible ``/chat/completions``.

    Every request is sent with a timeout so an unreachable host cannot block
    the UI indefinitely, and the API key is masked before it is logged.

    Args:
        api_key: API key.
        base_url: Base URL of the endpoint.
        model_name: Model name.
        provider: Provider name.
        tr: Translation function used for the returned messages.

    Returns:
        A ``(success, message)`` tuple.
    """
    import requests

    request_timeout = 10  # seconds, applied to every branch

    # Only a short prefix of the secret is ever logged.
    masked_key = f"{str(api_key)[:4]}***" if api_key else "<empty>"
    logger.debug(f"大模型连通性测试: {base_url} 模型: {model_name} apikey: {masked_key}")

    try:
        provider_key = provider.lower()

        if provider_key == 'gemini':
            # Native Gemini API.
            try:
                url = f"{base_url.rstrip('/')}/models/{model_name}:generateContent"
                response = requests.post(
                    url,
                    json={
                        "contents": [{
                            "parts": [{"text": "直接回复我文本'当前网络可用'"}]
                        }]
                    },
                    headers={
                        "x-goog-api-key": api_key,
                        "Content-Type": "application/json"
                    },
                    timeout=request_timeout
                )
                if response.status_code == 200:
                    return True, tr("原生Gemini模型连接成功")
                return False, f"{tr('原生Gemini模型连接失败')}: HTTP {response.status_code}"
            except Exception as e:
                return False, f"{tr('原生Gemini模型连接失败')}: {str(e)}"

        # Both remaining cases use the same OpenAI-style request; only the
        # user-facing messages differ.
        if provider_key == 'gemini(openai)':
            ok_key, fail_key = "OpenAI兼容Gemini代理连接成功", "OpenAI兼容Gemini代理连接失败"
        else:
            ok_key, fail_key = "Text model is available", "Text model is not available"

        response = requests.post(
            f"{base_url.rstrip('/')}/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": model_name,
                "messages": [
                    {"role": "user", "content": "直接回复我文本'当前网络可用'"}
                ],
                "stream": False
            },
            timeout=request_timeout
        )

        if response.status_code == 200:
            return True, tr(ok_key)
        return False, f"{tr(fail_key)}: HTTP {response.status_code}"

    except Exception as e:
        logger.error(traceback.format_exc())
        return False, f"{tr('Connection failed')}: {str(e)}"
text_base_url = config.app.get("text_litellm_base_url", "") # 解析 provider 和 model default_provider = "deepseek" default_model = "deepseek-chat" if "/" in full_text_model_name: parts = full_text_model_name.split("/", 1) current_provider = parts[0] current_model = parts[1] else: current_provider = default_provider current_model = full_text_model_name # 定义支持的 provider 列表 LITELLM_PROVIDERS = [ "openai", "gemini", "deepseek", "qwen", "siliconflow", "moonshot", "anthropic", "azure", "ollama", "vertex_ai", "mistral", "codestral", "volcengine", "groq", "cohere", "together_ai", "fireworks_ai", "openrouter", "replicate", "huggingface", "xai", "deepgram", "vllm", "bedrock", "cloudflare" ] # 如果当前 provider 不在列表中,添加到列表头部 if current_provider not in LITELLM_PROVIDERS: LITELLM_PROVIDERS.insert(0, current_provider) # 渲染配置输入框 col1, col2 = st.columns([1, 2]) with col1: selected_provider = st.selectbox( tr("Text Model Provider"), options=LITELLM_PROVIDERS, index=LITELLM_PROVIDERS.index(current_provider) if current_provider in LITELLM_PROVIDERS else 0, key="text_provider_select" ) with col2: model_name_input = st.text_input( tr("Text Model Name"), value=current_model, help="输入模型名称(不包含 provider 前缀)\n\n" "常用示例:\n" "• deepseek-chat\n" "• gpt-4o\n" "• gemini-2.0-flash\n" "• deepseek-ai/DeepSeek-R1 (SiliconFlow)\n\n" "支持 100+ providers,详见: https://docs.litellm.ai/docs/providers", key="text_model_input" ) # 组合完整的模型名称 st_text_model_name = f"{selected_provider}/{model_name_input}" if selected_provider and model_name_input else "" st_text_api_key = st.text_input( tr("Text API Key"), value=text_api_key, type="password", help="对应 provider 的 API 密钥\n\n" "获取地址:\n" "• DeepSeek: https://platform.deepseek.com/api_keys\n" "• Gemini: https://makersuite.google.com/app/apikey\n" "• OpenAI: https://platform.openai.com/api-keys\n" "• Qwen: https://bailian.console.aliyun.com/\n" "• SiliconFlow: https://cloud.siliconflow.cn/account/ak\n" "• Moonshot: https://platform.moonshot.cn/console/api-keys" ) 
text_base_help, text_base_required, text_placeholder = build_base_url_help( selected_provider, "文案生成模型" ) st_text_base_url = st.text_input( tr("Text Base URL"), value=text_base_url, help=text_base_help, placeholder=text_placeholder or None ) if text_base_required and not st_text_base_url: info_example = text_placeholder or "https://your-openai-compatible-endpoint/v1" st.info(f"请在上方填写 OpenAI 兼容网关地址,例如:{info_example}") # 添加测试连接按钮 if st.button(tr("Test Connection"), key="test_text_connection"): test_errors = [] if not st_text_api_key: test_errors.append("请先输入 API 密钥") if not model_name_input: test_errors.append("请先输入模型名称") if test_errors: for error in test_errors: st.error(error) else: with st.spinner(tr("Testing connection...")): try: success, message = test_litellm_text_model( api_key=st_text_api_key, base_url=st_text_base_url, model_name=st_text_model_name, tr=tr ) if success: st.success(message) else: st.error(message) except Exception as e: st.error(f"测试连接时发生错误: {str(e)}") logger.error(f"LiteLLM 文案生成模型连接测试失败: {str(e)}") # 验证和保存配置 text_validation_errors = [] text_config_changed = False # 验证模型名称 if st_text_model_name: is_valid, error_msg = validate_litellm_model_name(st_text_model_name, "文案生成") if is_valid: config.app["text_litellm_model_name"] = st_text_model_name st.session_state["text_litellm_model_name"] = st_text_model_name text_config_changed = True else: text_validation_errors.append(error_msg) # 验证 API 密钥 if st_text_api_key: is_valid, error_msg = validate_api_key(st_text_api_key, "文案生成") if is_valid: config.app["text_litellm_api_key"] = st_text_api_key st.session_state["text_litellm_api_key"] = st_text_api_key text_config_changed = True else: text_validation_errors.append(error_msg) # 验证 Base URL(可选) if st_text_base_url: is_valid, error_msg = validate_base_url(st_text_base_url, "文案生成") if is_valid: config.app["text_litellm_base_url"] = st_text_base_url st.session_state["text_litellm_base_url"] = st_text_base_url text_config_changed = True else: 
def show_ffmpeg_diagnostics():
    """Render the FFmpeg diagnostics view.

    Shows, in order: system information, whether FFmpeg is installed, the
    hardware-acceleration probe result, the recommended encoding profile and
    an on-demand compatibility report. Rendering stops early when the helper
    modules could not be imported or FFmpeg is not installed.
    """
    st.subheader("🔧 FFmpeg 兼容性诊断")

    modules_missing = ffmpeg_utils is None or FFmpegConfigManager is None
    if modules_missing:
        st.error("❌ 无法加载 FFmpeg 工具模块")
        return

    # --- Basic information -------------------------------------------------
    system_col, status_col = st.columns(2)

    with system_col:
        st.write("**系统信息**")
        for info_line in (
            f"- 操作系统: {platform.system()} {platform.release()}",
            f"- 架构: {platform.machine()}",
            f"- Python: {platform.python_version()}",
        ):
            st.write(info_line)

    with status_col:
        st.write("**FFmpeg 状态**")
        if not ffmpeg_utils.check_ffmpeg_installation():
            # Nothing below makes sense without an FFmpeg binary.
            st.error("❌ FFmpeg 未安装或不在 PATH 中")
            st.info("请安装 FFmpeg 并确保其在系统 PATH 中")
            return
        st.success("✅ FFmpeg 已安装")

    # --- Hardware acceleration ---------------------------------------------
    st.write("**硬件加速检测**")
    try:
        accel = ffmpeg_utils.get_ffmpeg_hwaccel_info()

        if not accel.get("available", False):
            st.warning(f"⚠️ {accel.get('message', '硬件加速不可用')}")
        else:
            st.success(f"✅ {accel.get('message', '硬件加速可用')}")

            with st.expander("硬件加速详情"):
                for label, field in (
                    ("加速类型", "type"),
                    ("编码器", "encoder"),
                    ("GPU 厂商", "gpu_vendor"),
                ):
                    st.write(f"- {label}: {accel.get(field, '未知')}")

                dedicated = '是' if accel.get('is_dedicated_gpu', False) else '否'
                st.write(f"- 独立显卡: {dedicated}")

                if accel.get("tested_methods"):
                    tested = ', '.join(accel['tested_methods'])
                    st.write(f"- 测试的方法: {tested}")
    except Exception as exc:
        st.error(f"❌ 硬件加速检测失败: {exc!s}")

    # --- Recommended profile -----------------------------------------------
    st.write("**推荐配置**")
    try:
        profile_key = FFmpegConfigManager.get_recommended_profile()
        profile = FFmpegConfigManager.get_profile(profile_key)

        st.info(f"🎯 推荐配置: **{profile.description}**")

        with st.expander("配置详情"):
            st.write(f"- 配置名称: {profile.name}")
            hwaccel_state = '启用' if profile.hwaccel_enabled else '禁用'
            st.write(f"- 硬件加速: {hwaccel_state}")
            st.write(f"- 编码器: {profile.encoder}")
            st.write(f"- 质量预设: {profile.quality_preset}")
            st.write(f"- 兼容性等级: {profile.compatibility_level}/5")
    except Exception as exc:
        st.error(f"❌ 配置推荐失败: {exc!s}")

    # --- Compatibility report (on demand) ----------------------------------
    if not st.button("🔍 生成详细兼容性报告"):
        return

    try:
        report = FFmpegConfigManager.get_compatibility_report()

        st.write("**详细兼容性报告**")
        st.json(report)

        if report.get("suggestions"):
            st.write("**优化建议**")
            for tip in report["suggestions"]:
                st.write(f"- {tip}")
    except Exception as exc:
        st.error(f"❌ 生成报告失败: {exc!s}")
help="不同的配置文件针对不同的硬件和兼容性需求进行了优化" ) # 显示选中配置的详情 if selected_profile: profile = FFmpegConfigManager.get_profile(selected_profile) st.write("**配置详情**") col1, col2 = st.columns(2) with col1: st.write(f"- 硬件加速: {'✅ 启用' if profile.hwaccel_enabled else '❌ 禁用'}") st.write(f"- 编码器: {profile.encoder}") st.write(f"- 质量预设: {profile.quality_preset}") with col2: st.write(f"- 像素格式: {profile.pixel_format}") st.write(f"- 兼容性等级: {profile.compatibility_level}/5") if profile.additional_args: st.write(f"- 额外参数: {' '.join(profile.additional_args)}") # 高级设置 with st.expander("🔧 高级设置"): st.write("**强制设置选项**") col1, col2 = st.columns(2) with col1: if st.button("🚫 强制禁用硬件加速"): try: ffmpeg_utils.force_software_encoding() st.success("✅ 已强制禁用硬件加速") st.info("这将使用纯软件编码,兼容性最高但性能较低") except Exception as e: st.error(f"❌ 操作失败: {str(e)}") with col2: if st.button("🔄 重置硬件加速检测"): try: ffmpeg_utils.reset_hwaccel_detection() st.success("✅ 已重置硬件加速检测") st.info("下次使用时将重新检测硬件加速能力") except Exception as e: st.error(f"❌ 操作失败: {str(e)}") # 测试按钮 st.write("**测试功能**") if st.button("🧪 测试 FFmpeg 兼容性"): with st.spinner("正在测试 FFmpeg 兼容性..."): try: # 这里可以调用测试脚本 st.info("请在终端运行 `python test_video_extraction.py ` 进行完整测试") # 简单的兼容性测试 if ffmpeg_utils and ffmpeg_utils.check_ffmpeg_installation(): hwaccel_info = ffmpeg_utils.get_ffmpeg_hwaccel_info() if hwaccel_info.get("available"): st.success("✅ 基础兼容性测试通过") else: st.warning("⚠️ 硬件加速不可用,但软件编码应该可以工作") else: st.error("❌ FFmpeg 不可用") except Exception as e: st.error(f"❌ 测试失败: {str(e)}") def show_troubleshooting_guide(): """显示故障排除指南""" st.subheader("🆘 故障排除指南") # 常见问题 st.write("**常见问题及解决方案**") with st.expander("❌ 关键帧提取失败 - 滤镜链错误"): st.write(""" **问题描述**: 出现 "Impossible to convert between the formats" 错误 **解决方案**: 1. 在设置中选择 "兼容性配置" 或 "Windows NVIDIA 优化配置" 2. 点击 "强制禁用硬件加速" 按钮 3. 重新尝试关键帧提取 4. 如果仍然失败,请更新显卡驱动程序 """) with st.expander("⚠️ 硬件加速不可用"): st.write(""" **可能原因**: - 显卡驱动程序过旧 - FFmpeg 版本不支持当前硬件 - 系统缺少必要的运行库 **解决方案**: 1. 更新显卡驱动程序到最新版本 2. 对于 NVIDIA 用户,安装 CUDA 工具包 3. 
def render_script_panel(tr):
    """Render the video-script configuration panel.

    Draws the script picker and the video picker, then the mode-specific
    options for the currently selected script type, and finally the script
    action buttons.
    """
    # Mode-specific option renderers, keyed by the value stored in
    # ``video_clip_json_path``. Any other value renders no extra options.
    mode_renderers = {
        "auto": render_video_details,              # frame-based narration
        "short": render_short_generate_options,    # short-drama mash-up
        "summary": short_drama_summary,            # short-drama narration
    }

    with st.container(border=True):
        st.write(tr("Video Script Configuration"))
        clip_params = VideoClipParams()

        render_script_file(tr, clip_params)
        render_video_file(tr, clip_params)

        # Read the mode only after the pickers above have updated it.
        current_mode = st.session_state.get('video_clip_json_path', '')
        render_mode_options = mode_renderers.get(current_mode)
        if render_mode_options is not None:
            render_mode_options(tr)

        render_script_buttons(tr, clip_params)
根据选择的模式处理逻辑 if selected_mode == MODE_FILE: # --- 文件选择模式 --- script_list = [ (tr("None"), ""), (tr("Upload Script"), "upload_script") ] # 获取已有脚本文件 suffix = "*.json" script_dir = utils.script_dir() files = glob.glob(os.path.join(script_dir, suffix)) file_list = [] for file in files: file_list.append({ "name": os.path.basename(file), "file": file, "ctime": os.path.getctime(file) }) file_list.sort(key=lambda x: x["ctime"], reverse=True) for file in file_list: display_name = file['file'].replace(config.root_dir, "") script_list.append((display_name, file['file'])) # 找到保存的脚本文件在列表中的索引 # 如果当前path是特殊值(auto/short/summary),则重置为空 saved_script_path = current_path if current_path not in [MODE_AUTO, MODE_SHORT, MODE_SUMMARY] else "" selected_index = 0 for i, (_, path) in enumerate(script_list): if path == saved_script_path: selected_index = i break # 如果找到了保存的脚本,同步更新 selectbox 的 key 状态 if saved_script_path and selected_index > 0: st.session_state['script_file_selection'] = selected_index selected_script_index = st.selectbox( tr("Script Files"), index=selected_index, options=range(len(script_list)), format_func=lambda x: script_list[x][0], key="script_file_selection" ) script_path = script_list[selected_script_index][1] # 只有当用户实际选择了脚本时才更新路径,避免覆盖已保存的路径 if script_path: st.session_state['video_clip_json_path'] = script_path params.video_clip_json_path = script_path elif saved_script_path: # 如果用户选择了 "None" 但之前有保存的脚本,保持原有路径 st.session_state['video_clip_json_path'] = saved_script_path params.video_clip_json_path = saved_script_path # 处理脚本上传 if script_path == "upload_script": uploaded_file = st.file_uploader( tr("Upload Script File"), type=["json"], accept_multiple_files=False, ) if uploaded_file is not None: try: # 读取上传的JSON内容并验证格式 script_content = uploaded_file.read().decode('utf-8') json_data = json.loads(script_content) # 保存到脚本目录 safe_filename = os.path.basename(uploaded_file.name) script_file_path = os.path.join(script_dir, safe_filename) file_name, file_extension = 
def render_video_file(tr, params):
    """Render the source-video picker and handle local uploads.

    The list offers "None", "Upload Local Files" and every video already in
    the video directory. The extensions that are listed and the extensions
    that may be uploaded come from one shared tuple, so an uploaded file is
    always selectable afterwards. The entry matching the path stored in
    ``st.session_state['video_origin_path']`` is preselected, so a freshly
    uploaded video stays selected after the rerun.

    Args:
        tr: Translation function.
        params: Parameter object whose ``video_origin_path`` is updated.
    """
    # Single source of truth for listing and uploading.
    video_extensions = ("mp4", "mov", "avi", "mkv", "flv")
    video_dir = utils.video_dir()

    video_list = [(tr("None"), ""), (tr("Upload Local Files"), "upload_local")]

    # Collect the videos that already exist on disk.
    for extension in video_extensions:
        for file in glob.glob(os.path.join(video_dir, f"*.{extension}")):
            display_name = file.replace(config.root_dir, "")
            video_list.append((display_name, file))

    # Preselect the previously chosen / uploaded video; fall back to "None".
    saved_video_path = st.session_state.get('video_origin_path', '')
    default_index = next(
        (i for i, (_, path) in enumerate(video_list) if path == saved_video_path),
        0,
    )

    selected_video_index = st.selectbox(
        tr("Video File"),
        index=default_index,
        options=range(len(video_list)),
        format_func=lambda x: video_list[x][0]
    )

    video_path = video_list[selected_video_index][1]
    st.session_state['video_origin_path'] = video_path
    params.video_origin_path = video_path

    if video_path != "upload_local":
        return

    uploaded_file = st.file_uploader(
        tr("Upload Local Files"),
        type=list(video_extensions),
        accept_multiple_files=False,
    )
    if uploaded_file is None:
        return

    # basename() strips any directory component from the client-supplied name.
    safe_filename = os.path.basename(uploaded_file.name)
    video_file_path = os.path.join(video_dir, safe_filename)
    file_name, file_extension = os.path.splitext(safe_filename)

    # Never overwrite an existing video: disambiguate with a timestamp.
    if os.path.exists(video_file_path):
        timestamp = time.strftime("%Y%m%d%H%M%S")
        file_name_with_timestamp = f"{file_name}_{timestamp}"
        video_file_path = os.path.join(video_dir, file_name_with_timestamp + file_extension)

    with open(video_file_path, "wb") as f:
        f.write(uploaded_file.read())

    st.success(tr("File Uploaded Successfully"))
    st.session_state['video_origin_path'] = video_file_path
    params.video_origin_path = video_file_path
    time.sleep(1)  # let the user see the success message before the rerun
    st.rerun()
def render_script_buttons(tr, params):
    """Render the generate/load button, the script editor and the save button.

    The stored ``video_clip_json_path`` is either a generation mode
    (``"auto"``, ``"short"``, ``"summary"``), the path of a JSON script file,
    or a placeholder such as ``""`` / ``"upload_script"``. The action button
    is enabled only for the first two kinds, so clicking it can never try to
    load a non-existent script path.

    Args:
        tr: Translation function.
        params: Parameter object forwarded to the script generators.
    """
    # ``or ''`` guards against an explicit None stored in session state.
    script_path = st.session_state.get('video_clip_json_path', '') or ''
    is_script_file = script_path.lower().endswith("json")

    # Pick the button label for the current mode.
    if script_path == "auto":
        button_name = tr("Generate Video Script")
    elif script_path == "short":
        button_name = tr("Generate Short Video Script")
    elif script_path == "summary":
        button_name = tr("生成短剧解说脚本")
    elif is_script_file:
        button_name = tr("Load Video Script")
    else:
        button_name = tr("Please Select Script File")

    can_run = script_path in ("auto", "short", "summary") or is_script_file

    if st.button(button_name, key="script_action", disabled=not can_run):
        if script_path == "auto":
            # Documentary-style script (video without subtitles or dubbing).
            generate_script_docu(params)
        elif script_path == "short":
            # Short-drama mash-up script.
            custom_clips = st.session_state.get('custom_clips')
            generate_script_short(tr, params, custom_clips)
        elif script_path == "summary":
            # Short-drama narration script.
            subtitle_path = st.session_state.get('subtitle_path')
            video_theme = st.session_state.get('video_theme')
            temperature = st.session_state.get('temperature')
            generate_script_short_sunmmary(params, subtitle_path, video_theme, temperature)
        else:
            load_script(tr, script_path)

    # Script editor.
    video_clip_json_details = st.text_area(
        tr("Video Script"),
        value=json.dumps(st.session_state.get('video_clip_json', []), indent=2, ensure_ascii=False),
        height=500
    )

    # Saving also runs the format check.
    if st.button(tr("Save Script"), key="save_script", use_container_width=True):
        save_script_with_validation(tr, video_clip_json_details)
def get_script_params():
    """Return the script-related session values as a plain dict.

    Each key is read from ``st.session_state`` and defaults to an empty
    string when it has not been set.
    """
    param_keys = (
        'video_language',
        'video_clip_json_path',
        'video_origin_path',
        'video_name',
        'video_plot',
    )
    return {key: st.session_state.get(key, '') for key in param_keys}
================================================ from loguru import logger import streamlit as st from app.config import config from webui.utils.cache import get_fonts_cache import os def render_subtitle_panel(tr): """渲染字幕设置面板""" with st.container(border=True): st.write(tr("Subtitle Settings")) st.info("💡 提示:目前仅 **edge-tts** 引擎支持自动生成字幕,其他 TTS 引擎暂不支持。") # 检查是否选择了 SoulVoice qwen3_tts引擎 from app.services import voice # current_voice = st.session_state.get('voice_name', '') tts_engine = config.ui.get('tts_engine', '') is_disabled_subtitle = is_disabled_subtitle_settings(tts_engine) if is_disabled_subtitle: # SoulVoice 引擎时显示禁用提示 st.warning(f"⚠️ {tts_engine}不支持精确字幕生成") st.info("💡 建议使用专业剪辑工具(如剪映、PR等)手动添加字幕") # 强制禁用字幕 st.session_state['subtitle_enabled'] = False # 显示禁用状态的复选框 st.checkbox( tr("Enable Subtitles"), value=False, disabled=True, help="SoulVoice 引擎不支持字幕生成,请使用其他 TTS 引擎" ) else: # 其他引擎正常显示字幕选项 enable_subtitles = st.checkbox(tr("Enable Subtitles"), value=True) st.session_state['subtitle_enabled'] = enable_subtitles if enable_subtitles: render_font_settings(tr) render_position_settings(tr) render_style_settings(tr) def render_font_settings(tr): """渲染字体设置""" # 获取字体列表 font_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "resource", "fonts") font_names = get_fonts_cache(font_dir) # 获取保存的字体设置 saved_font_name = config.ui.get("font_name", "") saved_font_name_index = 0 if saved_font_name in font_names: saved_font_name_index = font_names.index(saved_font_name) # 字体选择 font_name = st.selectbox( tr("Font"), options=font_names, index=saved_font_name_index ) config.ui["font_name"] = font_name st.session_state['font_name'] = font_name # 字体大小 和 字幕大小 font_cols = st.columns([0.3, 0.7]) with font_cols[0]: saved_text_fore_color = config.ui.get("text_fore_color", "#FFFFFF") text_fore_color = st.color_picker( tr("Font Color"), saved_text_fore_color ) config.ui["text_fore_color"] = text_fore_color st.session_state['text_fore_color'] = text_fore_color with 
font_cols[1]: saved_font_size = config.ui.get("font_size", 60) font_size = st.slider( tr("Font Size"), min_value=20, max_value=100, value=saved_font_size ) config.ui["font_size"] = font_size st.session_state['font_size'] = font_size def is_disabled_subtitle_settings(tts_engine:str)->bool: """是否禁用字幕设置""" return tts_engine=="soulvoice" or tts_engine=="qwen3_tts" def render_position_settings(tr): """渲染位置设置""" subtitle_positions = [ (tr("Top"), "top"), (tr("Center"), "center"), (tr("Bottom"), "bottom"), (tr("Custom"), "custom"), ] selected_index = st.selectbox( tr("Position"), index=2, options=range(len(subtitle_positions)), format_func=lambda x: subtitle_positions[x][0], ) subtitle_position = subtitle_positions[selected_index][1] st.session_state['subtitle_position'] = subtitle_position # 自定义位置处理 if subtitle_position == "custom": custom_position = st.text_input( tr("Custom Position (% from top)"), value="70.0" ) try: custom_position_value = float(custom_position) if custom_position_value < 0 or custom_position_value > 100: st.error(tr("Please enter a value between 0 and 100")) else: st.session_state['custom_position'] = custom_position_value except ValueError: st.error(tr("Please enter a valid number")) def render_style_settings(tr): """渲染样式设置""" stroke_cols = st.columns([0.3, 0.7]) with stroke_cols[0]: stroke_color = st.color_picker( tr("Stroke Color"), value="#000000" ) st.session_state['stroke_color'] = stroke_color with stroke_cols[1]: stroke_width = st.slider( tr("Stroke Width"), min_value=0.0, max_value=10.0, value=1.0, step=0.01 ) st.session_state['stroke_width'] = stroke_width def get_subtitle_params(): """获取字幕参数""" font_name = st.session_state.get('font_name') or "SimHei" return { 'subtitle_enabled': st.session_state.get('subtitle_enabled', True), 'font_name': font_name, 'font_size': st.session_state.get('font_size', 60), 'text_fore_color': st.session_state.get('text_fore_color', '#FFFFFF'), 'subtitle_position': st.session_state.get('subtitle_position', 
'bottom'), 'custom_position': st.session_state.get('custom_position', 70.0), 'stroke_color': st.session_state.get('stroke_color', '#000000'), 'stroke_width': st.session_state.get('stroke_width', 1.5), } ================================================ FILE: webui/components/system_settings.py ================================================ import streamlit as st import os import shutil from loguru import logger from app.utils.utils import storage_dir def clear_directory(dir_path, tr): """清理指定目录""" if os.path.exists(dir_path): try: for item in os.listdir(dir_path): item_path = os.path.join(dir_path, item) try: if os.path.isfile(item_path): os.unlink(item_path) elif os.path.isdir(item_path): shutil.rmtree(item_path) except Exception as e: logger.error(f"Failed to delete {item_path}: {e}") st.success(tr("Directory cleared")) logger.info(f"Cleared directory: {dir_path}") except Exception as e: st.error(f"{tr('Failed to clear directory')}: {str(e)}") logger.error(f"Failed to clear directory {dir_path}: {e}") else: st.warning(tr("Directory does not exist")) def render_system_panel(tr): """渲染系统设置面板""" with st.expander(tr("System settings"), expanded=False): col1, col2, col3 = st.columns(3) with col1: if st.button(tr("Clear frames"), use_container_width=True): clear_directory(os.path.join(storage_dir(), "temp/keyframes"), tr) with col2: if st.button(tr("Clear clip videos"), use_container_width=True): clear_directory(os.path.join(storage_dir(), "temp/clip_video"), tr) with col3: if st.button(tr("Clear tasks"), use_container_width=True): clear_directory(os.path.join(storage_dir(), "tasks"), tr) ================================================ FILE: webui/components/video_settings.py ================================================ import streamlit as st from app.models.schema import VideoClipParams, VideoAspect, AudioVolumeDefaults def render_video_panel(tr): """渲染视频配置面板""" with st.container(border=True): st.write(tr("Video Settings")) params = VideoClipParams() 
render_video_config(tr, params) def render_video_config(tr, params): """渲染视频配置""" # 视频比例 video_aspect_ratios = [ (tr("Portrait"), VideoAspect.portrait.value), (tr("Landscape"), VideoAspect.landscape.value), ] selected_index = st.selectbox( tr("Video Ratio"), options=range(len(video_aspect_ratios)), format_func=lambda x: video_aspect_ratios[x][0], ) params.video_aspect = VideoAspect(video_aspect_ratios[selected_index][1]) st.session_state['video_aspect'] = params.video_aspect.value # 视频画质 video_qualities = [ ("4K (2160p)", "2160p"), ("2K (1440p)", "1440p"), ("Full HD (1080p)", "1080p"), ("HD (720p)", "720p"), ("SD (480p)", "480p"), ] quality_index = st.selectbox( tr("Video Quality"), options=range(len(video_qualities)), format_func=lambda x: video_qualities[x][0], index=2 # 默认选择 1080p ) st.session_state['video_quality'] = video_qualities[quality_index][1] # 原声音量 - 使用统一的默认值 params.original_volume = st.slider( tr("Original Volume"), min_value=AudioVolumeDefaults.MIN_VOLUME, max_value=AudioVolumeDefaults.MAX_VOLUME, value=AudioVolumeDefaults.ORIGINAL_VOLUME, step=0.01, help=tr("Adjust the volume of the original audio") ) st.session_state['original_volume'] = params.original_volume def get_video_params(): """获取视频参数""" return { 'video_aspect': st.session_state.get('video_aspect', VideoAspect.portrait.value), 'video_quality': st.session_state.get('video_quality', '1080p'), 'original_volume': st.session_state.get('original_volume', AudioVolumeDefaults.ORIGINAL_VOLUME) } ================================================ FILE: webui/config/settings.py ================================================ import os import tomli from loguru import logger from typing import Dict, Any, Optional from dataclasses import dataclass def get_version_from_file(): """从project_version文件中读取版本号""" try: version_file = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "project_version" ) if os.path.isfile(version_file): with open(version_file, "r", encoding="utf-8") as f: 
return f.read().strip() return "0.1.0" # 默认版本号 except Exception as e: logger.error(f"读取版本号文件失败: {str(e)}") return "0.1.0" # 默认版本号 @dataclass class WebUIConfig: """WebUI配置类""" # UI配置 ui: Dict[str, Any] = None # 代理配置 proxy: Dict[str, str] = None # 应用配置 app: Dict[str, Any] = None # Azure配置 azure: Dict[str, str] = None # 项目版本 project_version: str = get_version_from_file() # 项目根目录 root_dir: str = None # Gemini API Key gemini_api_key: str = "" # 每批处理的图片数量 vision_batch_size: int = 5 # 提示词 vision_prompt: str = """...""" def __post_init__(self): """初始化默认值""" self.ui = self.ui or {} self.proxy = self.proxy or {} self.app = self.app or {} self.azure = self.azure or {} self.root_dir = self.root_dir or os.path.dirname(os.path.dirname(os.path.dirname(__file__))) def load_config(config_path: Optional[str] = None) -> WebUIConfig: """加载配置文件 Args: config_path: 配置文件路径,如果为None则使用默认路径 Returns: WebUIConfig: 配置对象 """ try: if config_path is None: config_path = os.path.join( os.path.dirname(os.path.dirname(__file__)), ".streamlit", "webui.toml" ) # 如果配置文件不存在,使用示例配置 if not os.path.exists(config_path): example_config = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "config.example.toml" ) if os.path.exists(example_config): config_path = example_config else: logger.warning(f"配置文件不存在: {config_path}") return WebUIConfig() # 读取配置文件 with open(config_path, "rb") as f: config_dict = tomli.load(f) # 创建配置对象,使用从文件读取的版本号 config = WebUIConfig( ui=config_dict.get("ui", {}), proxy=config_dict.get("proxy", {}), app=config_dict.get("app", {}), azure=config_dict.get("azure", {}), # 不再从配置文件中获取project_version ) return config except Exception as e: logger.error(f"加载配置文件失败: {e}") return WebUIConfig() def save_config(config: WebUIConfig, config_path: Optional[str] = None) -> bool: """保存配置到文件 Args: config: 配置对象 config_path: 配置文件路径,如果为None则使用默认路径 Returns: bool: 是否保存成功 """ try: if config_path is None: config_path = os.path.join( os.path.dirname(os.path.dirname(__file__)), ".streamlit", 
"webui.toml" ) # 确保目录存在 os.makedirs(os.path.dirname(config_path), exist_ok=True) # 转换为字典,不再保存版本号到配置文件 config_dict = { "ui": config.ui, "proxy": config.proxy, "app": config.app, "azure": config.azure # 不再保存project_version到配置文件 } # 保存配置 with open(config_path, "w", encoding="utf-8") as f: import tomli_w tomli_w.dump(config_dict, f) return True except Exception as e: logger.error(f"保存配置文件失败: {e}") return False def get_config() -> WebUIConfig: """获取全局配置对象 Returns: WebUIConfig: 配置对象 """ if not hasattr(get_config, "_config"): get_config._config = load_config() return get_config._config def update_config(config_dict: Dict[str, Any]) -> bool: """更新配置 Args: config_dict: 配置字典 Returns: bool: 是否更新成功 """ try: config = get_config() # 更新配置 if "ui" in config_dict: config.ui.update(config_dict["ui"]) if "proxy" in config_dict: config.proxy.update(config_dict["proxy"]) if "app" in config_dict: config.app.update(config_dict["app"]) if "azure" in config_dict: config.azure.update(config_dict["azure"]) # 不再从配置字典更新project_version # 保存配置 return save_config(config) except Exception as e: logger.error(f"更新配置失败: {e}") return False # 导出全局配置对象 config = get_config() ================================================ FILE: webui/i18n/__init__.py ================================================ # 空文件,用于标记包 ================================================ FILE: webui/i18n/en.json ================================================ { "Language": "English", "Translation": { "Video Script Configuration": "**Video Script Configuration**", "Video Script Generate": "Generate Video Script", "Video Subject": "Video Subject (Given a keyword, :red[AI auto-generates] video script)", "Script Language": "Language of the generated video script (Usually, AI automatically outputs according to the language of the input subject)", "Script Files": "Script Files", "Generate Video Script and Keywords": "Click to use AI to generate **Video Script** and **Video Keywords** based on the **subject**", "Auto Detect": "Auto 
Detect", "Auto Generate": "Auto Generate", "Video Script": "Video Script (:blue[①Optional, use AI to generate ②Proper punctuation helps in generating subtitles])", "Save Script": "Save Script", "Crop Video": "Crop Video", "Video File": "Video File (:blue[1️⃣Supports uploading video files (limit 2G) 2️⃣For large files, it is recommended to directly import them into the ./resource/videos directory])", "Plot Description": "Plot Description (:blue[Can be obtained from https://www.tvmao.com/])", "Generate Video Keywords": "Click to use AI to generate **Video Keywords** based on the **script**", "Please Enter the Video Subject": "Please enter the video script first", "Generating Video Script and Keywords": "AI is generating the video script and keywords...", "Generating Video Keywords": "AI is generating the video keywords...", "Video Keywords": "Video Keywords (:blue[Long videos work better in conjunction with plot descriptions.])", "Video Settings": "**Video Settings**", "Video Concat Mode": "Video Concatenation Mode", "Random": "Random Concatenation (Recommended)", "Sequential": "Sequential Concatenation", "Video Ratio": "Video Ratio", "Portrait": "Portrait 9:16 (TikTok Video)", "Landscape": "Landscape 16:9 (Xigua Video)", "Clip Duration": "Maximum Clip Duration (Seconds) (**Not the total length of the video**, refers to the length of each **composite segment**)", "Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously", "Audio Settings": "**Audio Settings**", "Speech Synthesis": "Speech Synthesis Voice (:red[**Keep consistent with the script language**. 
Note: V2 version performs better, but requires an API KEY; SoulVoice provides high-quality Chinese voices])", "Speech Region": "Service Region (:red[Required, [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "API Key (:red[Required, either Key 1 or Key 2 is acceptable [Click to Get](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "Speech Volume (1.0 represents 100%)", "Speech Rate": "Speech Rate (1.0 represents 1x speed)", "Male": "Male", "Female": "Female", "Background Music": "Background Music", "No Background Music": "No Background Music", "Random Background Music": "Random Background Music", "Custom Background Music": "Custom Background Music", "Custom Background Music File": "Please enter the file path of the custom background music", "Background Music Volume": "Background Music Volume (0.2 represents 20%, background sound should not be too loud)", "Subtitle Settings": "**Subtitle Settings**", "Enable Subtitles": "Enable Subtitles (If unchecked, the following settings will not take effect)", "Font": "Subtitle Font", "Position": "Subtitle Position", "Top": "Top", "Center": "Center", "Bottom": "Bottom (Recommended)", "Custom": "Custom Position (70, represents 70% from the top)", "Font Size": "Subtitle Size", "Font Color": "Subtitle Color", "Stroke Color": "Stroke Color", "Stroke Width": "Stroke Width", "Generate Video": "Generate Video", "Video Script and Subject Cannot Both Be Empty": "Video Subject and Video Script cannot both be empty", "Generating Video": "Generating video, please wait...", "Start Generating Video": "Start Generating Video", "Video Generation Completed": "Video Generation Completed", "Video Generation Failed": "Video Generation Failed", "You can download the generated video from the following links": "You can download the generated video from the following links", "Basic Settings": 
"**Basic Settings** (:blue[Click to expand])", "Language": "Interface Language", "Pexels API Key": "Pexels API Key ([Click to Get](https://www.pexels.com/api/)) :red[Recommended]", "Pixabay API Key": "Pixabay API Key ([Click to Get](https://pixabay.com/api/docs/#api_search_videos)) :red[Optional, if Pexels is unavailable, then choose Pixabay]", "LLM Provider": "LLM Provider", "API Key": "API Key (:red[Required, must be applied from the LLM provider's backend])", "Base Url": "Base Url (Optional)", "Account ID": "Account ID (Obtained from the URL of the Cloudflare dashboard)", "Model Name": "Model Name (:blue[Confirm the authorized model name from the LLM provider's backend])", "Please Enter the LLM API Key": "Please enter the **LLM API Key**", "Please Enter the Pexels API Key": "Please enter the **Pexels API Key**", "Please Enter the Pixabay API Key": "Please enter the **Pixabay API Key**", "Get Help": "One-stop AI video commentary + automated editing tool\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\nFor any questions or suggestions, you can join the **community channel** for help or discussion: https://github.com/linyqh/NarratoAI/wiki", "Video Source": "Video Source", "TikTok": "TikTok (Support is coming soon)", "Bilibili": "Bilibili (Support is coming soon)", "Xiaohongshu": "Xiaohongshu (Support is coming soon)", "Local file": "Local file", "Play Voice": "Play Synthesized Voice", "Voice Example": "This is a sample text for testing voice synthesis", "Synthesizing Voice": "Synthesizing voice, please wait...", "TTS Provider": "TTS Provider", "Hide Log": "Hide Log", "Upload Local Files": "Upload Local Files", "File Uploaded Successfully": "File Uploaded Successfully", "Frame Interval (seconds)": "Frame Interval (seconds) (More keyframes consume more tokens)" } } ================================================ FILE: webui/i18n/zh.json ================================================ { "Language": "简体中文", "Translation": { "Video Script Configuration": "**视频脚本配置**", "Generate 
Video Script": "AI生成画面解说脚本", "Video Subject": "视频主题(给定一个关键词,:red[AI自动生成]视频文案)", "Script Language": "生成视频脚本的语言(一般情况AI会自动根据你输入的主题语言输出)", "Script Files": "脚本文件", "Generate Video Script and Keywords": "点击使用AI根据**主题**生成 【视频文案】 和 【视频关键词】", "Auto Detect": "自动检测", "Video Theme": "视频主题", "Generation Prompt": "自定义提示词", "Save Script": "保存脚本", "Video File": "视频文件(:blue[1️⃣支持上传视频文件(限制2G) 2️⃣大文件建议直接导入 ./resource/videos 目录])", "Plot Description": "剧情描述 (:blue[可从 https://www.tvmao.com/ 获取])", "Generate Video Keywords": "点击使用AI根据**文案**生成【视频关键词】", "Please Enter the Video Subject": "请先填写视频文案", "Generating Video Script and Keywords": "AI正在生成视频文案和关键词...", "Generating Video Keywords": "AI正在生成视频关键词...", "Video Keywords": "视频关键词(:blue[对于长视频配合剧情描述效果更好])", "Video Settings": "**视频设置**", "Video Concat Mode": "视频拼接模式", "Random": "随机拼接(推荐)", "Sequential": "顺序拼接", "Video Ratio": "视频比例", "Portrait": "竖屏 9:16(抖音视频)", "Landscape": "横屏 16:9(西瓜视频)", "Clip Duration": "视频片段最大时长(秒)(**不是视频总长度**,是指每个**合成片段**的长度)", "Number of Videos Generated Simultaneously": "同时生成视频数量", "Audio Settings": "**音频设置**", "Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意:V2版效果更好,但是需要API KEY;SoulVoice 提供高质量中文语音])", "Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Key": "API Key (:red[必填,密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])", "Speech Volume": "朗读音量(1.0表示100%)", "Speech Rate": "朗读速度(1.0表示1倍速)", "Male": "男性", "Female": "女性", "Background Music": "背景音乐", "No Background Music": "无背景音乐", "Random Background Music": "随机背景音乐", "Custom Background Music": "自定义背景音乐", "Custom Background Music File": "请输入自定义背景音乐的文件路径", "Background Music Volume": "背景音乐音量(0.2表示20%,背景声音不宜过高)", "Subtitle Settings": "**字幕设置**", "Enable Subtitles": "启用字幕(若取消勾选,下面的设置都将不生效)", "Font": "字幕字体", "Position": "字幕位置", "Top": "顶部", "Center": "中间", "Bottom": "底部(推荐)", "Custom":
"自定义位置(70,表示离顶部70%的位置)", "Font Size": "字幕大小", "Font Color": "字幕颜色", "Stroke Color": "描边颜色", "Stroke Width": "描边粗细", "Generate Video": "生成视频", "Video Script and Subject Cannot Both Be Empty": "视频主题 和 视频文案,不能同时为空", "Generating Video": "正在生成视频,请稍候...", "Start Generating Video": "开始生成视频", "Video Generation Completed": "视频生成完成", "Video Generation Failed": "视频生成失败", "You can download the generated video from the following links": "你可以从以下链接下载生成的视频", "Basic Settings": "**基础设置** (:blue[点击展开])", "Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用,再选择Pixabay]", "Video LLM Provider": "视频转录大模型", "LLM Provider": "大语言模型", "API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])", "Base Url": "Base Url (可选)", "Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])", "Please Enter the LLM API Key": "请先填写大模型 **API Key**", "Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**", "Get Help": "一站式 AI 影视解说+自动化剪辑工具\uD83C\uDF89\uD83C\uDF89\uD83C\uDF89\n\n有任何问题或建议,可以加入 **社区频道** 求助或讨论:https://github.com/linyqh/NarratoAI/wiki", "Video Source": "视频来源", "TikTok": "抖音 (TikTok 支持中,敬请期待)", "Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)", "Xiaohongshu": "小红书 (Xiaohongshu 支持中,敬请期待)", "Local file": "本地文件", "Play Voice": "试听语音合成", "Voice Example": "这是一段测试语音合成的示例文本", "Synthesizing Voice": "语音合成中,请稍候...", "TTS Provider": "语音合成提供商", "Hide Log": "隐藏日志", "Upload Local Files": "上传本地文件", "File Uploaded Successfully": "文件上传成功", "timestamp": "时间戳", "Picture description": "图片描述", "Narration": "视频文案", "Rebuild": "重新生成", "Load Video Script": "加载视频脚本", "Speech Pitch": "语调", "Please Select Script File": "请选择脚本文件", "Check Format": "脚本格式检查", "Script Loaded Successfully": "脚本加载成功", "Script format check passed": "脚本格式检查通过", "Script format check failed": "脚本格式检查失败", "Failed to Load Script": "加载脚本失败", "Failed to Save Script": "保存脚本失败", "Script saved successfully": "脚本保存成功", "Video Script": "视频脚本", "Video Quality": "视频质量", "Custom prompt for LLM, leave empty to use 
default prompt": "自定义提示词,留空则使用默认提示词", "Proxy Settings": "代理设置", "HTTP_PROXY": "HTTP 代理", "HTTPs_PROXY": "HTTPS 代理", "Vision Model Settings": "视频分析模型设置", "Vision Model Provider": "视频分析模型提供商", "Vision API Key": "视频分析 API 密钥", "Vision Base URL": "视频分析接口地址", "Vision Model Name": "视频分析模型名称", "Text Generation Model Settings": "文案生成模型设置", "LLM Model Name": "大语言模型名称", "LLM Model API Key": "大语言模型 API 密钥", "Text Model Provider": "文案生成模型提供商", "Text API Key": "文案生成 API 密钥", "Text Base URL": "文案生成接口地址", "Text Model Name": "文案生成模型名称", "Account ID": "账户 ID", "Skip the first few seconds": "跳过开头多少秒", "Difference threshold": "差异阈值", "Vision processing batch size": "视觉处理批次大小", "Test Connection": "测试连接", "gemini model is available": "Gemini 模型可用", "gemini model is not available": "Gemini 模型不可用", "Unsupported provider": "不支持的提供商", "0: Keep the audio only, 1: Keep the original sound only, 2: Keep the original sound and audio": "0: 仅保留音频,1: 仅保留原声,2: 保留原声和音频", "Text model is not available": "文案生成模型不可用", "Text model is available": "文案生成模型可用", "Upload Script": "上传脚本", "Upload Script File": "上传脚本文件", "Script Uploaded Successfully": "脚本上传成功", "Invalid JSON format": "无效的JSON格式", "Upload failed": "上传失败", "Enable Proxy": "启用代理", "QwenVL model is available": "QwenVL 模型可用", "QwenVL model is not available": "QwenVL 模型不可用", "System settings": "系统设置", "Clear Cache": "清理缓存", "Cache cleared": "缓存清理完成", "storage directory does not exist": "storage目录不存在", "Failed to clear cache": "清理缓存失败", "Clear frames": "清理关键帧", "Clear clip videos": "清理裁剪视频", "Clear tasks": "清理任务", "Directory cleared": "目录清理完成", "Directory does not exist": "目录不存在", "Failed to clear directory": "清理目录失败", "Subtitle Preview": "字幕预览", "One-Click Transcribe": "一键转录", "Transcribing...": "正在转录中...", "Transcription Complete!": "转录完成!", "Transcription Failed. Please try again.": "转录失败,请重试。", "API rate limit exceeded. Please wait about an hour and try again.": "API 调用次数已达到限制,请等待约一小时后再试。", "Resources exhausted. 
Please try again later.": "资源已耗尽,请稍后再试。", "Transcription Failed": "转录失败", "Short Generate": "短剧混剪", "Generate Short Video Script": "AI生成短剧混剪脚本", "Adjust the volume of the original audio": "调整原始音频的音量", "Original Volume": "视频音量", "Auto Generate": "逐帧解说", "Frame Interval (seconds)": "帧间隔 (秒)", "Frame Interval (seconds) (More keyframes consume more tokens)": "帧间隔 (秒) (更多关键帧消耗更多令牌)", "Batch Size": "批处理大小", "Batch Size (More keyframes consume more tokens)": "批处理大小, 每批处理越少消耗 token 越多", "Short Drama Summary": "短剧解说", "Video Type": "视频类型", "Select/Upload Script": "选择/上传脚本" } } ================================================ FILE: webui/tools/base.py ================================================ import os import requests import streamlit as st from loguru import logger from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from app.config import config # 导入新的LLM服务模块 - 确保提供商被注册 import app.services.llm # 这会触发提供商注册 from app.services.llm.migration_adapter import create_vision_analyzer as create_vision_analyzer_new # 保留旧的导入以确保向后兼容 from app.utils import gemini_analyzer, qwenvl_analyzer def create_vision_analyzer(provider, api_key, model, base_url): """ 创建视觉分析器实例 - 已重构为使用新的LLM服务架构 Args: provider: 提供商名称 ('gemini', 'gemini(openai)', 'qwenvl', 'siliconflow') api_key: API密钥 model: 模型名称 base_url: API基础URL Returns: 视觉分析器实例 """ try: # 优先使用新的LLM服务架构 return create_vision_analyzer_new(provider, api_key, model, base_url) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") # 回退到旧的实现以确保兼容性 if provider == 'gemini': return gemini_analyzer.VisionAnalyzer(model_name=model, api_key=api_key, base_url=base_url) elif provider == 'gemini(openai)': from app.utils.gemini_openai_analyzer import GeminiOpenAIAnalyzer return GeminiOpenAIAnalyzer(model_name=model, api_key=api_key, base_url=base_url) else: # 只传入必要的参数 return qwenvl_analyzer.QwenAnalyzer( model_name=model, api_key=api_key, base_url=base_url ) def get_batch_timestamps(batch_files, 
prev_batch_files=None): """ 解析一批文件的时间戳范围,支持毫秒级精度 Args: batch_files: 当前批次的文件列表 prev_batch_files: 上一个批次的文件列表,用于处理单张图片的情况 Returns: tuple: (first_timestamp, last_timestamp, timestamp_range) 时间戳格式: HH:MM:SS,mmm (时:分:秒,毫秒) 例如: 00:00:50,100 表示50秒100毫秒 示例文件名格式: keyframe_001253_000050100.jpg 其中 000050100 表示 00:00:50,100 (50秒100毫秒) """ if not batch_files: logger.warning("Empty batch files") return "00:00:00,000", "00:00:00,000", "00:00:00,000-00:00:00,000" def get_frame_files(): """获取首帧和尾帧文件名""" if len(batch_files) == 1 and prev_batch_files and prev_batch_files: # 单张图片情况:使用上一批次最后一帧作为首帧 first = os.path.basename(prev_batch_files[-1]) last = os.path.basename(batch_files[0]) logger.debug(f"单张图片批次,使用上一批次最后一帧作为首帧: {first}") else: first = os.path.basename(batch_files[0]) last = os.path.basename(batch_files[-1]) return first, last def extract_time(filename): """从文件名提取时间信息""" try: # 提取类似 000050100 的时间戳部分 time_str = filename.split('_')[2].replace('.jpg', '') if len(time_str) < 9: # 处理旧格式 time_str = time_str.ljust(9, '0') return time_str except (IndexError, AttributeError) as e: logger.warning(f"Invalid filename format: {filename}, error: {e}") return "000000000" def format_timestamp(time_str): """ 将时间字符串转换为 HH:MM:SS,mmm 格式 Args: time_str: 9位数字字符串,格式为 HHMMSSMMM 例如: 000010000 表示 00时00分10秒000毫秒 000043039 表示 00时00分43秒039毫秒 Returns: str: HH:MM:SS,mmm 格式的时间戳 """ try: if len(time_str) < 9: logger.warning(f"Invalid timestamp format: {time_str}") return "00:00:00,000" # 从时间戳中提取时、分、秒和毫秒 hours = int(time_str[0:2]) # 前2位作为小时 minutes = int(time_str[2:4]) # 第3-4位作为分钟 seconds = int(time_str[4:6]) # 第5-6位作为秒数 milliseconds = int(time_str[6:]) # 最后3位作为毫秒 return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}" except ValueError as e: logger.warning(f"时间戳格式转换失败: {time_str}, error: {e}") return "00:00:00,000" # 获取首帧和尾帧文件名 first_frame, last_frame = get_frame_files() # 从文件名中提取时间信息 first_time = extract_time(first_frame) last_time = extract_time(last_frame) # 转换为标准时间戳格式 first_timestamp = 
format_timestamp(first_time) last_timestamp = format_timestamp(last_time) timestamp_range = f"{first_timestamp}-{last_timestamp}" # logger.debug(f"解析时间戳: {first_frame} -> {first_timestamp}, {last_frame} -> {last_timestamp}") return first_timestamp, last_timestamp, timestamp_range def get_batch_files(keyframe_files, result, batch_size=5): """ 获取当前批次的图片文件 """ batch_start = result['batch_index'] * batch_size batch_end = min(batch_start + batch_size, len(keyframe_files)) return keyframe_files[batch_start:batch_end] ================================================ FILE: webui/tools/generate_script_docu.py ================================================ # 纪录片脚本生成 import os import json import time import asyncio import traceback import streamlit as st from loguru import logger from datetime import datetime from app.config import config from app.utils import utils, video_processor from webui.tools.base import create_vision_analyzer, get_batch_files, get_batch_timestamps def generate_script_docu(params): """ 生成 纪录片 视频脚本 要求: 原视频无字幕无配音 适合场景: 纪录片、动物搞笑解说、荒野建造等 """ progress_bar = st.progress(0) status_text = st.empty() def update_progress(progress: float, message: str = ""): progress_bar.progress(progress) if message: status_text.text(f"🎬 {message}") else: status_text.text(f"📊 进度: {progress}%") try: with st.spinner("正在生成脚本..."): if not params.video_origin_path: st.error("请先选择视频文件") return """ 1. 
提取键帧 """ update_progress(10, "正在提取关键帧...") # 创建临时目录用于存储关键帧 keyframes_dir = os.path.join(utils.temp_dir(), "keyframes") video_hash = utils.md5(params.video_origin_path + str(os.path.getmtime(params.video_origin_path))) video_keyframes_dir = os.path.join(keyframes_dir, video_hash) # 检查是否已经提取过关键帧 keyframe_files = [] if os.path.exists(video_keyframes_dir): # 取已有的关键帧文件 for filename in sorted(os.listdir(video_keyframes_dir)): if filename.endswith('.jpg'): keyframe_files.append(os.path.join(video_keyframes_dir, filename)) if keyframe_files: logger.info(f"使用已缓存的关键帧: {video_keyframes_dir}") st.info(f"✅ 使用已缓存关键帧,共 {len(keyframe_files)} 帧") update_progress(20, f"使用已缓存关键帧,共 {len(keyframe_files)} 帧") # 如果没有缓存的关键帧,则进行提取 if not keyframe_files: try: # 确保目录存在 os.makedirs(video_keyframes_dir, exist_ok=True) # 初始化视频处理器 processor = video_processor.VideoProcessor(params.video_origin_path) # 显示视频信息 st.info(f"📹 视频信息: {processor.width}x{processor.height}, {processor.fps:.1f}fps, {processor.duration:.1f}秒") # 处理视频并提取关键帧 - 直接使用超级兼容性方案 update_progress(15, "正在提取关键帧(使用超级兼容性方案)...") try: # 使用优化的关键帧提取方法 processor.extract_frames_by_interval_ultra_compatible( output_dir=video_keyframes_dir, interval_seconds=st.session_state.get('frame_interval_input'), ) except Exception as extract_error: logger.error(f"关键帧提取失败: {extract_error}") # 提供详细的错误信息和解决建议 error_msg = str(extract_error) if "权限" in error_msg or "permission" in error_msg.lower(): suggestion = "建议:检查输出目录权限,或更换输出位置" elif "空间" in error_msg or "space" in error_msg.lower(): suggestion = "建议:检查磁盘空间是否足够" else: suggestion = "建议:检查视频文件是否损坏,或尝试转换为标准格式" raise Exception(f"关键帧提取失败: {error_msg}\n{suggestion}") # 获取所有关键文件路径 for filename in sorted(os.listdir(video_keyframes_dir)): if filename.endswith('.jpg'): keyframe_files.append(os.path.join(video_keyframes_dir, filename)) if not keyframe_files: # 检查目录中是否有其他文件 all_files = os.listdir(video_keyframes_dir) logger.error(f"关键帧目录内容: {all_files}") raise Exception("未提取到任何关键帧文件,请检查视频文件格式") update_progress(20, 
f"关键帧提取完成,共 {len(keyframe_files)} 帧") st.success(f"✅ 成功提取 {len(keyframe_files)} 个关键帧") except Exception as e: # 如果提取失败,清理创建的目录 try: if os.path.exists(video_keyframes_dir): import shutil shutil.rmtree(video_keyframes_dir) except Exception as cleanup_err: logger.error(f"清理失败的关键帧目录时出错: {cleanup_err}") raise Exception(f"关键帧提取失败: {str(e)}") """ 2. 视觉分析(批量分析每一帧) """ # 最佳实践:使用 get() 的默认值参数 + 从 config 获取备用值 vision_llm_provider = ( st.session_state.get('vision_llm_provider') or config.app.get('vision_llm_provider', 'litellm') ).lower() logger.info(f"使用 {vision_llm_provider.upper()} 进行视觉分析") try: # ===================初始化视觉分析器=================== update_progress(30, "正在初始化视觉分析器...") # 使用统一的配置键格式获取配置(支持所有 provider) vision_api_key = ( st.session_state.get(f'vision_{vision_llm_provider}_api_key') or config.app.get(f'vision_{vision_llm_provider}_api_key') ) vision_model = ( st.session_state.get(f'vision_{vision_llm_provider}_model_name') or config.app.get(f'vision_{vision_llm_provider}_model_name') ) vision_base_url = ( st.session_state.get(f'vision_{vision_llm_provider}_base_url') or config.app.get(f'vision_{vision_llm_provider}_base_url', '') ) # 验证必需配置 if not vision_api_key or not vision_model: raise ValueError( f"未配置 {vision_llm_provider} 的 API Key 或模型名称。" f"请在设置页面配置 vision_{vision_llm_provider}_api_key 和 vision_{vision_llm_provider}_model_name" ) # 创建视觉分析器实例(使用统一接口) llm_params = { "vision_provider": vision_llm_provider, "vision_api_key": vision_api_key, "vision_model_name": vision_model, "vision_base_url": vision_base_url, } logger.debug(f"视觉分析器配置: provider={vision_llm_provider}, model={vision_model}") analyzer = create_vision_analyzer( provider=vision_llm_provider, api_key=vision_api_key, model=vision_model, base_url=vision_base_url ) update_progress(40, "正在分析关键帧...") # ===================创建异步事件循环=================== loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) # 执行异步分析 vision_batch_size = st.session_state.get('vision_batch_size') or 
config.frames.get("vision_batch_size") vision_analysis_prompt = """ 我提供了 %s 张视频帧,它们按时间顺序排列,代表一个连续的视频片段。请仔细分析每一帧的内容,并关注帧与帧之间的变化,以理解整个片段的活动。 首先,请详细描述每一帧的关键视觉信息(包含:主要内容、人物、动作和场景)。 然后,基于所有帧的分析,请用**简洁的语言**总结整个视频片段中发生的主要活动或事件流程。 请务必使用 JSON 格式输出你的结果。JSON 结构应如下: { "frame_observations": [ { "frame_number": 1, // 或其他标识帧的方式 "observation": "描述每张视频帧中的主要内容、人物、动作和场景。" }, // ... 更多帧的观察 ... ], "overall_activity_summary": "在这里填写你总结的整个片段的主要活动,保持简洁。" } 请务必不要遗漏视频帧,我提供了 %s 张视频帧,frame_observations 必须包含 %s 个元素 请只返回 JSON 字符串,不要包含任何其他解释性文字。 """ results = loop.run_until_complete( analyzer.analyze_images( images=keyframe_files, prompt=vision_analysis_prompt, batch_size=vision_batch_size ) ) loop.close() """ 3. 处理分析结果(格式化为 json 数据) """ # ===================处理分析结果=================== update_progress(60, "正在整理分析结果...") # 合并所有批次的分析结果 frame_analysis = "" merged_frame_observations = [] # 合并所有批次的帧观察 overall_activity_summaries = [] # 合并所有批次的整体总结 prev_batch_files = None frame_counter = 1 # 初始化帧计数器,用于给所有帧分配连续的序号 # 确保分析目录存在 analysis_dir = os.path.join(utils.storage_dir(), "temp", "analysis") os.makedirs(analysis_dir, exist_ok=True) origin_res = os.path.join(analysis_dir, "frame_analysis.json") with open(origin_res, 'w', encoding='utf-8') as f: json.dump(results, f, ensure_ascii=False, indent=2) # 开始处理 for result in results: if 'error' in result: logger.warning(f"批次 {result['batch_index']} 处理出现警告: {result['error']}") continue # 获取当前批次的文件列表 batch_files = get_batch_files(keyframe_files, result, vision_batch_size) # 获取批次的时间戳范围 first_timestamp, last_timestamp, timestamp_range = get_batch_timestamps(batch_files, prev_batch_files) # 解析响应中的JSON数据 response_text = result['response'] try: # 处理可能包含```json```格式的响应 if "```json" in response_text: json_content = response_text.split("```json")[1].split("```")[0].strip() elif "```" in response_text: json_content = response_text.split("```")[1].split("```")[0].strip() else: json_content = response_text.strip() response_data = json.loads(json_content) # 
提取frame_observations和overall_activity_summary if "frame_observations" in response_data: frame_obs = response_data["frame_observations"] overall_summary = response_data.get("overall_activity_summary", "") # 添加时间戳信息到每个帧观察 for i, obs in enumerate(frame_obs): if i < len(batch_files): # 从文件名中提取时间戳 file_path = batch_files[i] file_name = os.path.basename(file_path) # 提取时间戳字符串 (格式如: keyframe_000675_000027000.jpg) # 格式解析: keyframe_帧序号_毫秒时间戳.jpg timestamp_parts = file_name.split('_') if len(timestamp_parts) >= 3: timestamp_str = timestamp_parts[-1].split('.')[0] try: # 修正时间戳解析逻辑 # 格式为000100000,表示00:01:00,000,即1分钟 # 需要按照对应位数进行解析: # 前两位是小时,中间两位是分钟,后面是秒和毫秒 if len(timestamp_str) >= 9: # 确保格式正确 hours = int(timestamp_str[0:2]) minutes = int(timestamp_str[2:4]) seconds = int(timestamp_str[4:6]) milliseconds = int(timestamp_str[6:9]) # 计算总秒数 timestamp_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000 formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳 else: # 兼容旧的解析方式 timestamp_seconds = int(timestamp_str) / 1000 # 转换为秒 formatted_time = utils.format_time(timestamp_seconds) # 格式化时间戳 except ValueError: logger.warning(f"无法解析时间戳: {timestamp_str}") timestamp_seconds = 0 formatted_time = "00:00:00,000" else: logger.warning(f"文件名格式不符合预期: {file_name}") timestamp_seconds = 0 formatted_time = "00:00:00,000" # 添加额外信息到帧观察 obs["frame_path"] = file_path obs["timestamp"] = formatted_time obs["timestamp_seconds"] = timestamp_seconds obs["batch_index"] = result['batch_index'] # 使用全局递增的帧计数器替换原始的frame_number if "frame_number" in obs: obs["original_frame_number"] = obs["frame_number"] # 保留原始编号作为参考 obs["frame_number"] = frame_counter # 赋值连续的帧编号 frame_counter += 1 # 增加帧计数器 # 添加到合并列表 merged_frame_observations.append(obs) # 添加批次整体总结信息 if overall_summary: # 从文件名中提取时间戳数值 first_time_str = first_timestamp.split('_')[-1].split('.')[0] last_time_str = last_timestamp.split('_')[-1].split('.')[0] # 转换为毫秒并计算持续时间(秒) try: # 修正解析逻辑,与上面相同的方式解析时间戳 if len(first_time_str) >= 9 and 
len(last_time_str) >= 9: # 解析第一个时间戳 first_hours = int(first_time_str[0:2]) first_minutes = int(first_time_str[2:4]) first_seconds = int(first_time_str[4:6]) first_ms = int(first_time_str[6:9]) first_time_seconds = first_hours * 3600 + first_minutes * 60 + first_seconds + first_ms / 1000 # 解析第二个时间戳 last_hours = int(last_time_str[0:2]) last_minutes = int(last_time_str[2:4]) last_seconds = int(last_time_str[4:6]) last_ms = int(last_time_str[6:9]) last_time_seconds = last_hours * 3600 + last_minutes * 60 + last_seconds + last_ms / 1000 batch_duration = last_time_seconds - first_time_seconds else: # 兼容旧的解析方式 first_time_ms = int(first_time_str) last_time_ms = int(last_time_str) batch_duration = (last_time_ms - first_time_ms) / 1000 except ValueError: # 使用 utils.time_to_seconds 函数处理格式化的时间戳 first_time_seconds = utils.time_to_seconds(first_time_str.replace('_', ':').replace('-', ',')) last_time_seconds = utils.time_to_seconds(last_time_str.replace('_', ':').replace('-', ',')) batch_duration = last_time_seconds - first_time_seconds overall_activity_summaries.append({ "batch_index": result['batch_index'], "time_range": f"{first_timestamp}-{last_timestamp}", "duration_seconds": batch_duration, "summary": overall_summary }) except Exception as e: logger.error(f"解析批次 {result['batch_index']} 的响应数据失败: {str(e)}") # 添加原始响应作为回退 frame_analysis += f"\n=== {first_timestamp}-{last_timestamp} ===\n" frame_analysis += response_text frame_analysis += "\n" # 更新上一个批次的文件 prev_batch_files = batch_files # 将合并后的结果转为JSON字符串 merged_results = { "frame_observations": merged_frame_observations, "overall_activity_summaries": overall_activity_summaries } # 使用当前时间创建文件名 now = datetime.now() timestamp_str = now.strftime("%Y%m%d_%H%M") # 保存完整的分析结果为JSON analysis_filename = f"frame_analysis_{timestamp_str}.json" analysis_json_path = os.path.join(analysis_dir, analysis_filename) with open(analysis_json_path, 'w', encoding='utf-8') as f: json.dump(merged_results, f, ensure_ascii=False, indent=2) 
logger.info(f"分析结果已保存到: {analysis_json_path}") """ 4. 生成文案 """ logger.info("开始生成解说文案") update_progress(80, "正在生成解说文案...") from app.services.generate_narration_script import parse_frame_analysis_to_markdown, generate_narration # 从配置中获取文本生成相关配置 text_provider = config.app.get('text_llm_provider', 'gemini').lower() text_api_key = config.app.get(f'text_{text_provider}_api_key') text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = config.app.get(f'text_{text_provider}_base_url') llm_params.update({ "text_provider": text_provider, "text_api_key": text_api_key, "text_model_name": text_model, "text_base_url": text_base_url }) # 整理帧分析数据 markdown_output = parse_frame_analysis_to_markdown(analysis_json_path) # 生成解说文案 narration = generate_narration( markdown_output, text_api_key, base_url=text_base_url, model=text_model ) # 使用增强的JSON解析器 from webui.tools.generate_short_summary import parse_and_fix_json narration_data = parse_and_fix_json(narration) if not narration_data or 'items' not in narration_data: logger.error(f"解说文案JSON解析失败,原始内容: {narration[:200]}...") raise Exception("解说文案格式错误,无法解析JSON或缺少items字段") narration_dict = narration_data['items'] # 为 narration_dict 中每个 item 新增一个 OST: 2 的字段, 代表保留原声和配音 narration_dict = [{**item, "OST": 2} for item in narration_dict] logger.info(f"解说文案生成完成,共 {len(narration_dict)} 个片段") # 结果转换为JSON字符串 script = json.dumps(narration_dict, ensure_ascii=False, indent=2) except Exception as e: logger.exception(f"大模型处理过程中发生错误\n{traceback.format_exc()}") raise Exception(f"分析失败: {str(e)}") if script is None: st.error("生成脚本失败,请检查日志") st.stop() logger.info(f"纪录片解说脚本生成完成") if isinstance(script, list): st.session_state['video_clip_json'] = script elif isinstance(script, str): st.session_state['video_clip_json'] = json.loads(script) update_progress(100, "脚本生成完成") time.sleep(0.1) progress_bar.progress(100) status_text.text("🎉 脚本生成完成!") st.success("✅ 视频脚本生成成功!") except Exception as err: st.error(f"❌ 生成过程中发生错误: {str(err)}") 
logger.exception(f"生成脚本时发生错误\n{traceback.format_exc()}") finally: time.sleep(2) progress_bar.empty() status_text.empty() ================================================ FILE: webui/tools/generate_script_short.py ================================================ import os import json import time import traceback import streamlit as st from loguru import logger from app.config import config from app.services.upload_validation import ensure_existing_file, InputValidationError from app.utils import utils def generate_script_short(tr, params, custom_clips=5): """ 生成短视频脚本 Args: tr: 翻译函数 params: 视频参数对象 custom_clips: 自定义片段数量,默认为5 """ progress_bar = st.progress(0) status_text = st.empty() def update_progress(progress: float, message: str = ""): progress_bar.progress(progress) if message: status_text.text(f"{progress}% - {message}") else: status_text.text(f"进度: {progress}%") try: with st.spinner("正在生成脚本..."): # ========== 严格验证:必须上传视频和字幕(与短剧解说保持一致)========== # 1. 验证视频文件 video_path = getattr(params, "video_origin_path", None) if not video_path or not str(video_path).strip(): st.error("请先选择视频文件") st.stop() try: ensure_existing_file( str(video_path), label="视频", allowed_exts=(".mp4", ".mov", ".avi", ".flv", ".mkv"), ) except InputValidationError as e: st.error(str(e)) st.stop() # 2. 
验证字幕文件(移除推断逻辑,必须上传) subtitle_path = st.session_state.get("subtitle_path") if not subtitle_path or not str(subtitle_path).strip(): st.error("请先上传字幕文件") st.stop() try: subtitle_path = ensure_existing_file( str(subtitle_path), label="字幕", allowed_exts=(".srt",), ) except InputValidationError as e: st.error(str(e)) st.stop() logger.info(f"使用用户上传的字幕文件: {subtitle_path}") # ========== 获取 LLM 配置 ========== text_provider = config.app.get('text_llm_provider', 'gemini').lower() text_api_key = config.app.get(f'text_{text_provider}_api_key') text_model = config.app.get(f'text_{text_provider}_model_name') text_base_url = config.app.get(f'text_{text_provider}_base_url') vision_llm_provider = st.session_state.get('vision_llm_providers') or config.app.get('vision_llm_provider', 'gemini') vision_llm_provider = vision_llm_provider.lower() vision_api_key = st.session_state.get(f'vision_{vision_llm_provider}_api_key') or config.app.get(f'vision_{vision_llm_provider}_api_key', "") vision_model = st.session_state.get(f'vision_{vision_llm_provider}_model_name') or config.app.get(f'vision_{vision_llm_provider}_model_name', "") vision_base_url = st.session_state.get(f'vision_{vision_llm_provider}_base_url') or config.app.get(f'vision_{vision_llm_provider}_base_url', "") update_progress(20, "开始准备生成脚本") # ========== 调用后端生成脚本 ========== from app.services.SDP.generate_script_short import generate_script_result output_path = os.path.join(utils.script_dir(), "merged_subtitle.json") subtitle_content = st.session_state.get("subtitle_content") subtitle_kwargs = ( {"subtitle_content": str(subtitle_content)} if subtitle_content is not None and str(subtitle_content).strip() else {"subtitle_file_path": subtitle_path} ) result = generate_script_result( api_key=text_api_key, model_name=text_model, output_path=output_path, base_url=text_base_url, custom_clips=custom_clips, provider=text_provider, **subtitle_kwargs, ) if result.get("status") != "success": st.error(result.get("message", "生成脚本失败,请检查日志")) 
# Matches a double-quoted JSON string literal, or a "//" / "#" comment.
# String literals are listed first so comment markers inside a value
# (URLs, hashtags) are consumed as part of the string, never as a comment.
_JSON_STRING_OR_COMMENT_RE = re.compile(r'"(?:\\.|[^"\\])*"|//[^\n]*|#[^\n]*')
_JSON_STRING_RE = re.compile(r'"(?:\\.|[^"\\])*"')


def _fix_json_syntax_segment(segment):
    """Repair one stretch of JSON text that lies outside any double-quoted string."""
    # 'key':  ->  "key":
    segment = re.sub(r"'([^']*)'(\s*):", r'"\1"\2:', segment)
    # key:  ->  "key":   (the lookbehind keeps already-quoted names untouched)
    segment = re.sub(r'(?<![\w"])(\w+)(\s*):', r'"\1"\2:', segment)
    # Trailing commas before a closing brace / bracket.
    segment = re.sub(r',\s*}', '}', segment)
    segment = re.sub(r',\s*]', ']', segment)
    return segment


def _repair_json_text(text):
    """Strip comments, quote bare keys and drop trailing commas without touching string values."""
    without_comments = _JSON_STRING_OR_COMMENT_RE.sub(
        lambda m: m.group(0) if m.group(0).startswith('"') else '',
        text,
    )
    parts = []
    cursor = 0
    for literal in _JSON_STRING_RE.finditer(without_comments):
        parts.append(_fix_json_syntax_segment(without_comments[cursor:literal.start()]))
        parts.append(literal.group(0))  # string values are copied verbatim
        cursor = literal.end()
    parts.append(_fix_json_syntax_segment(without_comments[cursor:]))
    return "".join(parts)


def _combined_repair_candidates(text):
    """Yield progressively more aggressive rewrites of *text* for the last-resort repair."""
    start = text.find('{')
    end = text.rfind('}')
    if start != -1 and end > start:
        text = text[start:end + 1]
    yield text
    # Doubled braces are only collapsed as a fallback: collapsing them blindly
    # destroys the legitimate "}}" that closes nested objects.
    collapsed = text.replace('{{', '{').replace('}}', '}')
    if collapsed != text:
        yield collapsed
    # Doubled quotes (""key"") are handled last because the pattern can also
    # match across two adjacent empty strings.
    deduped = re.sub(r'""([^"]*?)""', r'"\1"', collapsed)
    if deduped != collapsed:
        yield deduped


def parse_and_fix_json(json_string):
    """
    Parse JSON text produced by an LLM, repairing common formatting mistakes.

    Strategies are tried in order and the first successful parse is returned:
      1. the stripped text as-is;
      2. the text with doubled braces ("{{" / "}}") collapsed;
      3. the content of a ```json fenced code block;
      4. the span from the first "{" to the last "}";
      5. a combined repair of that span: comments removed, bare or
         single-quoted keys quoted, trailing commas dropped. These repairs are
         applied only outside double-quoted strings, so values such as
         "00:00:01,000" timestamps or URLs are left intact.

    Args:
        json_string: Raw text expected to contain a JSON document.

    Returns:
        The parsed JSON value (normally a dict). None when the input is empty
        or whitespace only. When every strategy fails, a placeholder
        ``{"items": [...]}`` dict with a single item carrying the first 100
        characters of the input as its narration is returned, so callers that
        test for None will not see this failure.
    """
    if not json_string or not json_string.strip():
        logger.error("JSON字符串为空")
        return None

    json_string = json_string.strip()

    # 1. Direct parse.
    try:
        return json.loads(json_string)
    except json.JSONDecodeError as e:
        logger.warning(f"直接JSON解析失败: {e}")

    # 2. Doubled braces, a common artefact of escaped prompt templates.
    try:
        fixed_braces = json_string.replace('{{', '{').replace('}}', '}')
        logger.info("修复双大括号格式")
        return json.loads(fixed_braces)
    except json.JSONDecodeError:
        pass

    # 3. Fenced ```json code block.
    try:
        json_match = re.search(r'```json\s*(.*?)\s*```', json_string, re.DOTALL)
        if json_match:
            json_content = json_match.group(1).strip()
            logger.info("从代码块中提取JSON内容")
            return json.loads(json_content)
    except json.JSONDecodeError:
        pass

    # 4. Outermost brace-delimited span.
    try:
        start_idx = json_string.find('{')
        end_idx = json_string.rfind('}')
        if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
            json_content = json_string[start_idx:end_idx + 1]
            logger.info("提取大括号包围的JSON内容")
            return json.loads(json_content)
    except json.JSONDecodeError:
        pass

    # 5. Combined, string-aware repair.
    logger.info("尝试综合修复JSON格式问题后解析")
    for candidate in _combined_repair_candidates(json_string):
        try:
            return json.loads(_repair_json_text(candidate))
        except json.JSONDecodeError as e:
            logger.debug(f"综合修复失败: {e}")

    # Nothing worked: hand back a minimal structure so the UI still has one item.
    logger.error(f"所有JSON解析方法都失败,原始内容: {json_string[:200]}...")
    narration = (json_string[:100] + "...") if len(json_string) > 100 else json_string
    return {
        "items": [
            {
                "_id": 1,
                "timestamp": "00:00:00,000-00:00:10,000",
                "picture": "解析失败,使用默认内容",
                "narration": narration,
                "OST": 0
            }
        ]
    }
根据剧情生成解说文案 """ if analysis_result["status"] == "success": logger.info("字幕分析成功!") update_progress(60, "正在生成文案...") # 根据剧情生成解说文案 - 使用新的LLM服务架构 try: # 优先使用新的LLM服务架构 logger.info("使用新的LLM服务架构生成解说文案") narration_result = analyzer.generate_narration_script( short_name=video_theme, plot_analysis=analysis_result["analysis"], subtitle_content=subtitle_content, # 传递原始字幕内容 temperature=temperature ) except Exception as e: logger.warning(f"使用新LLM服务失败,回退到旧实现: {str(e)}") # 回退到旧的实现 narration_result = generate_narration_script( short_name=video_theme, plot_analysis=analysis_result["analysis"], subtitle_content=subtitle_content, # 传递原始字幕内容 api_key=text_api_key, model=text_model, base_url=text_base_url, save_result=True, temperature=temperature, provider=text_provider ) if narration_result["status"] == "success": logger.info("\n解说文案生成成功!") logger.info(narration_result["narration_script"]) else: logger.info(f"\n解说文案生成失败: {narration_result['message']}") st.error("生成脚本失败,请检查日志") st.stop() else: logger.error(f"分析失败: {analysis_result['message']}") st.error("生成脚本失败,请检查日志") st.stop() """ 4. 
def get_fonts_cache(font_dir):
    """
    Return the sorted font file names found under *font_dir*, cached per session.

    The directory tree is walked only when 'fonts_cache' is missing from
    ``st.session_state``; later calls return the cached list unchanged.
    Only bare file names (not paths) ending in .ttf, .ttc or .otf are kept;
    the extension test is case-insensitive.
    """
    if 'fonts_cache' in st.session_state:
        return st.session_state['fonts_cache']

    font_suffixes = (".ttf", ".ttc", ".otf")
    st.session_state['fonts_cache'] = sorted(
        name
        for _root, _dirs, names in os.walk(font_dir)
        for name in names
        if name.lower().endswith(font_suffixes)
    )
    return st.session_state['fonts_cache']


def get_video_files_cache():
    """
    Return the video paths found directly in ``utils.video_dir()``, cached per session.

    Files matching *.mp4, *.mov, *.avi and *.mkv are globbed in that order
    and the combined list is stored reversed under 'video_files_cache'.
    """
    if 'video_files_cache' in st.session_state:
        return st.session_state['video_files_cache']

    matches = [
        path
        for pattern in ("*.mp4", "*.mov", "*.avi", "*.mkv")
        for path in glob.glob(os.path.join(utils.video_dir(), pattern))
    ]
    matches.reverse()
    st.session_state['video_files_cache'] = matches
    return st.session_state['video_files_cache']
def open_task_folder(root_dir, task_id):
    """Open a task's output folder in the platform file manager.

    The folder is ``<root_dir>/storage/tasks/<task_id>``. Nothing happens when
    it does not exist or the platform is not Windows, macOS or Linux.
    The path is handed to the opener as a single argument (no shell), so
    folders whose names contain spaces or shell metacharacters are handled.

    Args:
        root_dir: Project root directory.
        task_id: Task identifier (name of the folder under storage/tasks).
    """
    try:
        import subprocess

        path = os.path.join(root_dir, "storage", "tasks", task_id)
        if not os.path.exists(path):
            return

        system_name = platform.system()
        if system_name == 'Windows':
            os.startfile(path)  # only defined on Windows
        elif system_name == 'Darwin':
            subprocess.run(["open", path], check=False)
        elif system_name == 'Linux':
            subprocess.run(["xdg-open", path], check=False)
    except Exception as e:
        # Best effort: a missing opener must not break the UI.
        logger.error(f"打开任务文件夹失败: {e}")


def cleanup_temp_files(temp_dir, max_age=3600):
    """Delete entries directly inside *temp_dir* that are older than *max_age*.

    Age is taken from ``os.path.getctime``. Files are removed with
    ``os.remove`` and directories with ``shutil.rmtree``. A failure on one
    entry is logged and does not stop the sweep. A missing *temp_dir* is a
    no-op.

    Args:
        temp_dir: Directory to sweep (not recursive; sub-directories are
            removed as a whole).
        max_age: Maximum age in seconds before an entry is removed.
    """
    if not os.path.exists(temp_dir):
        return

    cutoff = time.time() - max_age
    for entry in os.listdir(temp_dir):
        file_path = os.path.join(temp_dir, entry)
        try:
            if os.path.getctime(file_path) >= cutoff:
                continue
            if os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
            logger.debug(f"已清理临时文件: {file_path}")
        except Exception as e:
            logger.error(f"清理临时文件失败: {file_path}, 错误: {e}")
def save_uploaded_file(uploaded_file, save_dir, allowed_types=None):
    """Save an uploaded file into *save_dir*.

    The upload's ``name`` comes from the client, so only its final path
    component is used; a name such as ``../../x.mp4`` can no longer escape
    *save_dir*. The extension check is case-insensitive on both sides.
    When a file with the same name already exists, a ``_YYYYmmddHHMMSS``
    suffix is inserted before the extension.

    Args:
        uploaded_file: Object exposing ``name`` and ``read()`` (a Streamlit
            uploaded file).
        save_dir: Destination directory; created when missing.
        allowed_types: Optional list of allowed extensions such as
            ['.mp4', '.mov'].

    Returns:
        str: Path of the saved file, or None when the name is unusable, the
        type is not allowed, or writing fails.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)

        # Normalise Windows separators first so basename works on any platform.
        safe_name = os.path.basename(str(uploaded_file.name).replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            logger.error(f"保存上传文件失败: 无效的文件名 {uploaded_file.name!r}")
            return None

        file_name, file_extension = os.path.splitext(safe_name)

        if allowed_types:
            allowed = {str(ext).lower() for ext in allowed_types}
            if file_extension.lower() not in allowed:
                logger.error(f"不支持的文件类型: {file_extension}")
                return None

        save_path = os.path.join(save_dir, safe_name)
        if os.path.exists(save_path):
            timestamp = time.strftime("%Y%m%d%H%M%S")
            new_file_name = f"{file_name}_{timestamp}{file_extension}"
            save_path = os.path.join(save_dir, new_file_name)

        with open(save_path, "wb") as f:
            f.write(uploaded_file.read())

        logger.info(f"文件保存成功: {save_path}")
        return save_path

    except Exception as e:
        logger.error(f"保存上传文件失败: {e}")
        return None


def create_temp_file(prefix='tmp', suffix='', directory=None):
    """Build a unique temporary file path (the file itself is not created).

    Args:
        prefix: File name prefix.
        suffix: File extension appended after the UUID.
        directory: Target directory, created when missing. Defaults to
            ``utils.storage_dir("temp", create=True)``.

    Returns:
        str: ``<directory>/<prefix>-<uuid4><suffix>``, or None on failure.
    """
    try:
        if directory is None:
            directory = utils.storage_dir("temp", create=True)

        os.makedirs(directory, exist_ok=True)

        return os.path.join(directory, f"{prefix}-{str(uuid4())}{suffix}")

    except Exception as e:
        logger.error(f"创建临时文件失败: {e}")
        return None
def ensure_directory(directory):
    """Make sure *directory* exists, creating intermediate directories as needed.

    Args:
        directory: Directory path.

    Returns:
        bool: True when the directory exists afterwards. False when it could
        not be created, including when the path is occupied by a regular file.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        return True
    except Exception as e:
        logger.error(f"创建目录失败: {directory}, 错误: {e}")
        return False


def create_zip(files: list, zip_path: str, base_dir: str = None, folder_name: str = "demo") -> bool:
    """
    Pack *files* into a deflate-compressed zip archive.

    Args:
        files: Paths of the files to pack; paths that do not exist are
            skipped with a warning.
        zip_path: Where to write the archive. Its parent directory is created
            when missing; a bare file name writes into the current directory.
        base_dir: When given, entries keep their path relative to this
            directory; otherwise only the file's base name is used.
        folder_name: Top-level folder inside the archive; defaults to "demo".

    Returns:
        bool: True when the archive was written (individual files may have
        been skipped), False when it could not be created.
    """
    try:
        import zipfile

        # dirname is '' for a bare file name, and os.makedirs('') raises.
        target_dir = os.path.dirname(zip_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for file in files:
                if not os.path.exists(file):
                    logger.warning(f"文件不存在,跳过: {file}")
                    continue

                # Entry name inside the archive, prefixed with folder_name.
                if base_dir:
                    arcname = os.path.join(folder_name, os.path.relpath(file, base_dir))
                else:
                    arcname = os.path.join(folder_name, os.path.basename(file))

                try:
                    zipf.write(file, arcname)
                except Exception as e:
                    logger.error(f"添加文件到zip失败: {file}, 错误: {e}")
                    continue

        return True

    except Exception as e:
        logger.error(f"创建zip文件失败: {e}")
        return False
def create_vision_analyzer(provider: str, **kwargs) -> VisionAnalyzer:
    """
    Build a VisionAnalyzer and initialise it for the requested provider.

    Args:
        provider: Provider name, matched case-insensitively against
            'gemini' and 'qwenvl'.
        **kwargs: ``api_key``, ``model`` and ``base_url`` are read (missing
            keys become None) and forwarded to the provider initialiser.

    Returns:
        VisionAnalyzer: The initialised analyzer.

    Raises:
        ValueError: When *provider* is neither 'gemini' nor 'qwenvl'.
    """
    analyzer = VisionAnalyzer()
    initializers = {
        'gemini': analyzer.initialize_gemini,
        'qwenvl': analyzer.initialize_qwenvl,
    }

    initialize = initializers.get(provider.lower())
    if initialize is None:
        raise ValueError(f"不支持的视觉分析提供商: {provider}")

    initialize(
        api_key=kwargs.get('api_key'),
        model=kwargs.get('model'),
        base_url=kwargs.get('base_url')
    )
    return analyzer
def init_log():
    """Configure loguru with a single filtered stdout sink at INFO level.

    All existing handlers are removed, then one stdout sink is added that
    shortens file paths relative to ``config.root_dir`` and drops known
    start-up noise messages. Five seconds later a timer re-installs the same
    sink, which also discards any handler added in the meantime.

    The timer callback uses the public ``logger.remove()`` (no argument
    removes every handler) instead of iterating loguru's private handler
    table. Removing by id could raise when two timers from consecutive
    Streamlit reruns overlapped, which then added a second, unfiltered sink.
    The timer thread is a daemon so it never delays interpreter shutdown.
    """
    logger.remove()
    _lvl = "INFO"  # INFO hides DEBUG output

    ignore_patterns = (
        "Examining the path of torch.classes raised",
        "torch.cuda.is_available()",
        "CUDA initialization",
    )

    def format_record(record):
        # Rewrite the record in place so paths are shown relative to the project root.
        file_path = record["file"].path
        relative_path = os.path.relpath(file_path, config.root_dir)
        record["file"].path = f"./{relative_path}"
        record['message'] = record['message'].replace(config.root_dir, ".")

        _format = '{time:%Y-%m-%d %H:%M:%S} | ' + \
                  '{level} | ' + \
                  '"{file.path}:{line}": {function} ' + \
                  '- {message}' + "\n"
        return _format

    def log_filter(record):
        """Return False for start-up noise messages so they are not emitted."""
        return not any(pattern in record["message"] for pattern in ignore_patterns)

    def add_stdout_sink(record_filter=None):
        logger.add(
            sys.stdout,
            level=_lvl,
            format=format_record,
            colorize=True,
            filter=record_filter
        )

    add_stdout_sink(log_filter)

    def setup_advanced_filters():
        """Re-install the filtered sink once the application has started."""
        try:
            logger.remove()
            add_stdout_sink(log_filter)
        except Exception as e:
            # Keep logging usable even if the filtered sink cannot be installed.
            add_stdout_sink()
            logger.error(f"设置高级日志过滤器失败: {e}")

    import threading
    timer = threading.Timer(5.0, setup_advanced_filters)
    timer.daemon = True
    timer.start()


def init_global_state():
    """Seed ``st.session_state`` with the keys the UI expects, without overwriting existing values."""
    for key, default in (('video_clip_json', []), ('video_plot', '')):
        if key not in st.session_state:
            st.session_state[key] = default
    if 'ui_language' not in st.session_state:
        st.session_state['ui_language'] = config.ui.get("language", utils.get_system_locale())


def tr(key):
    """Translate *key* for the current UI language, falling back to *key* itself.

    Locales are loaded from ``webui/i18n`` next to this file via
    ``utils.load_locales`` on each call, and the entry is looked up under the
    locale's "Translation" mapping.
    """
    i18n_dir = os.path.join(os.path.dirname(__file__), "webui", "i18n")
    locales = utils.load_locales(i18n_dir)
    loc = locales.get(st.session_state['ui_language'], {})
    return loc.get("Translation", {}).get(key, key)
def _register_llm_providers_once():
    """Register the LLM providers the first time a session runs the script."""
    if 'llm_providers_registered' in st.session_state:
        return
    try:
        from app.services.llm.providers import register_all_providers
        register_all_providers()
        st.session_state['llm_providers_registered'] = True
        logger.info("✅ LLM 提供商注册成功")
    except Exception as e:
        # Not re-raised: the app keeps running, only LLM features are unavailable.
        logger.error(f"❌ LLM 提供商注册失败: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        st.error(f"⚠️ LLM 初始化失败: {str(e)}\n\n请检查配置文件和依赖是否正确安装。")


def _log_hwaccel_once():
    """Probe FFmpeg hardware acceleration and log the result once per session."""
    if 'hwaccel_logged' not in st.session_state:
        st.session_state['hwaccel_logged'] = False

    # The probe itself runs on every call; only the log line is emitted once.
    hwaccel_info = ffmpeg_utils.detect_hardware_acceleration()
    if st.session_state['hwaccel_logged']:
        return

    if hwaccel_info["available"]:
        logger.info(f"FFmpeg硬件加速检测结果: 可用 | 类型: {hwaccel_info['type']} | 编码器: {hwaccel_info['encoder']} | 独立显卡: {hwaccel_info['is_dedicated_gpu']}")
    else:
        logger.warning(f"FFmpeg硬件加速不可用: {hwaccel_info['message']}, 将使用CPU软件编码")
    st.session_state['hwaccel_logged'] = True


def main():
    """Application entry point: initialise logging and state, then render the page.

    Order: logging, session defaults, LLM provider registration, hardware
    acceleration probe, resource initialisation (failures are logged as
    warnings), then the UI: title, basic settings, the three-column panel
    (script / audio / video, subtitle and system settings) and finally the
    generate button.
    """
    init_log()
    init_global_state()

    _register_llm_providers_once()
    _log_hwaccel_once()

    try:
        utils.init_resources()
    except Exception as e:
        logger.warning(f"资源初始化时出现警告: {e}")

    st.title("Narrato:blue[AI]:sunglasses: 📽️")
    st.write(tr("Get Help"))

    # Basic settings first, then the three main columns.
    basic_settings.render_basic_settings(tr)

    script_col, audio_col, video_col = st.columns(3)
    with script_col:
        script_settings.render_script_panel(tr)
    with audio_col:
        audio_settings.render_audio_panel(tr)
    with video_col:
        video_settings.render_video_panel(tr)
        subtitle_settings.render_subtitle_panel(tr)

    # The system panel is rendered last inside the third column.
    with video_col:
        system_settings.render_system_panel(tr)

    # The generate button and its processing loop come last.
    render_generate_button()


if __name__ == "__main__":
    main()