Repository: z-mio/parse_hub_bot Branch: main Commit: 252be67dc4ec Files: 29 Total size: 107.9 KB Directory structure: gitextract_bn3fr2tv/ ├── .dockerignore ├── .github/ │ └── workflows/ │ └── docker-image.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── bot.py ├── core/ │ ├── __init__.py │ ├── config.py │ ├── platform_config.py │ └── watchdog.py ├── log.py ├── plugins/ │ ├── __init__.py │ ├── filters.py │ ├── helpers.py │ ├── inline_parse.py │ ├── parse.py │ └── start.py ├── pyproject.toml ├── services/ │ ├── __init__.py │ ├── cache.py │ ├── parser.py │ └── pipeline.py └── utils/ ├── __init__.py ├── converter.py ├── event_loop.py ├── helpers.py ├── media_processing_unit.py └── ph.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ .env .venv data downloads logs ================================================ FILE: .github/workflows/docker-image.yml ================================================ name: Docker Image CI on: release: types: [ published ] jobs: build: runs-on: ubuntu-latest permissions: contents: read packages: write steps: - uses: actions/checkout@v4 - uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: 构建&推送镜像 run: | # 获取release标签版本 VERSION=${GITHUB_REF#refs/tags/} # 构建并推送带版本号的镜像 docker build . 
--file Dockerfile \ --tag ghcr.io/z-mio/parse_hub_bot:${VERSION} \ --tag ghcr.io/z-mio/parse_hub_bot:latest docker push ghcr.io/z-mio/parse_hub_bot:${VERSION} docker push ghcr.io/z-mio/parse_hub_bot:latest ================================================ FILE: .gitignore ================================================ /.venv /logs /.idea /downloads .env *.session /data ================================================ FILE: Dockerfile ================================================ FROM python:3.12-slim AS build COPY --from=ghcr.io/astral-sh/uv:0.10.11 /uv /uvx /bin/ WORKDIR /app ENV UV_COMPILE_BYTECODE=1 \ UV_LINK_MODE=copy COPY pyproject.toml uv.lock ./ RUN apt-get update && apt-get install -y --no-install-recommends \ gcc python3-dev \ && rm -rf /var/lib/apt/lists/* RUN --mount=type=cache,target=/root/.cache/uv \ uv sync --no-install-project --frozen COPY . . RUN --mount=type=cache,target=/root/.cache/uv \ uv sync --frozen FROM python:3.12-slim AS runtime RUN apt-get update && apt-get install -y --no-install-recommends \ libglib2.0-0 \ ffmpeg \ media-types \ curl unzip ca-certificates \ && curl -fsSL https://deno.land/install.sh | sh \ && rm -rf /var/lib/apt/lists/* ENV DENO_INSTALL="/root/.deno" ENV PATH="/app/.venv/bin:$DENO_INSTALL/bin:$PATH" WORKDIR /app COPY --from=build /app /app ENV PATH="/app/.venv/bin:$PATH" CMD ["python", "bot.py"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 梓澪 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission 
notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================
{text}" else: return text def progress(current: int, total: int, unit: str) -> str | None: if unit == "bytes": if total <= 0: return None text = f"下 载 中... | {current * 100 / total:.0f}%" if round(current * 100 / total, 1) % 25 == 0: return text else: text = f"下 载 中... | {current}/{total}" if (current + 1) % 3 == 0 or (current + 1) == total: return text return None async def create_telegraph_page(html_content: str, cli: Client, parse_result: AnyParseResult) -> str: """创建 Telegraph 页面,返回页面 URL""" logger.debug(f"创建 Telegraph 页面: title={parse_result.title}") me = await cli.get_me() page = await Telegraph().create_page( parse_result.title or "无标题", html_content=html_content, author_name=me.full_name, author_url=parse_result.raw_url, ) logger.debug(f"Telegraph 页面已创建: {page.url}") return page.url async def create_richtext_telegraph(cli: Client, parse_result: RichTextParseResult) -> str: """将富文本解析结果转换为 Telegraph 页面,返回页面 URL""" logger.debug(f"富文本转 Telegraph: platform={parse_result.platform}, md_len={len(parse_result.markdown_content)}") md = parse_result.markdown_content match parse_result.platform: case Platform.WEIXIN: md = md.replace("mmbiz.qpic.cn", "qpic.cn.in/mmbiz.qpic.cn") case Platform.COOLAPK: md = md.replace("image.coolapk.com", "qpic.cn.in/image.coolapk.com") html = clean_article_html(markdown(md)) return await create_telegraph_page(html, cli, parse_result) async def process_media_files(download_result: DownloadResult) -> list[ProcessedMedia]: """对下载结果中的媒体文件进行格式转换,返回 ProcessedMedia 列表""" processed_dir = download_result.output_dir.joinpath("processed") processor = MediaProcessingUnit(processed_dir, segment_height=1920, logger=logger.bind(name="MediaProcessor").debug) media_files = to_list(download_result.media) logger.debug(f"开始媒体格式转换: 文件数={len(media_files)}, output_dir={processed_dir}") processed_list: list[ProcessedMedia] = [] for media_file in media_files: # 对于实况图片只处理图片, 不处理视频 logger.debug(f"处理文件: {media_file.path}") result = await 
def get_supported_platforms() -> str:
    """Return a Markdown list of supported platforms, one per line.

    Each line shows the platform name in bold followed by its supported
    types in italics. Lines are sorted in descending string order.
    """
    lines = sorted(
        (
            f"**{platform['name']}** __({'__, __'.join(platform['supported_types'])})__"
            for platform in ParseHub().get_platforms()
        ),
        reverse=True,
    )
    return "\n".join(lines)
class InlineStatusReporter(StatusReporter):
    """Status reporter that edits an inline message identified by its ``inline_message_id``."""

    # Strong references to pending "restore caption" tasks. The event loop only
    # keeps weak references to tasks, so a task nobody else references may be
    # garbage-collected before it finishes.
    _background_tasks: set[asyncio.Task[None]] = set()

    def __init__(self, cli: Client, inline_message_id: str, caption: str = ""):
        """Store the client, the target inline message id and the base caption.

        Args:
            cli: Client used to edit the inline message.
            inline_message_id: Identifier of the inline message to edit.
            caption: Text kept above every status line; may be empty.
        """
        self._cli = cli
        self._mid = inline_message_id
        self._caption = caption
        # Last text sent, used to skip edits that would not change the message.
        self._last_text: str | None = None

    async def report(self, text: str) -> None:
        """Show ``text`` as a bold status line below the caption.

        Identical consecutive updates are skipped. ``FloodWait`` errors are
        ignored on purpose: status updates are best-effort.
        """
        text = f"**▎{text}**"
        full = f"{self._caption}\n{text}" if self._caption else text
        if full == self._last_text:
            return
        self._last_text = full
        try:
            await self._cli.edit_inline_text(self._mid, full)
        except FloodWait:
            # Best-effort: dropping one progress update is harmless.
            pass

    async def report_error(self, stage: str, error: Exception) -> None:
        """Show an error for ``stage`` and restore the caption 15 seconds later.

        Args:
            stage: Name of the failed stage, shown before the error text.
            error: The exception whose string form is shown to the user.
        """
        await self._cli.edit_inline_text(
            self._mid,
            f"**▎{stage}错误:** \n```\n{error}```",
            link_preview_options=LinkPreviewOptions(is_disabled=True),
        )

        async def fn() -> None:
            await asyncio.sleep(15)
            await self._cli.edit_inline_text(
                self._mid,
                self._caption,
                link_preview_options=LinkPreviewOptions(is_disabled=True),
            )

        task = asyncio.get_running_loop().create_task(fn())
        # Keep the task alive until it completes, then drop the reference.
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    async def dismiss(self) -> None:
        """Do nothing: the inline message itself carries the final result."""
        pass
document_file_id=m.file_id, title=title, caption=caption, description=content, ) ) return results async def build_inline_results(parse_result: AnyParseResult, cli: Client) -> list[InlineQueryResult]: """根据解析结果构建内联查询结果列表""" logger.debug(f"构建 inline 结果: type={parse_result.type}, title={parse_result.title}") title = parse_result.title or "无标题" media_list = to_list(parse_result.media) reply_markup = Ikm([[Ikb("原链接", url=parse_result.raw_url)]]) results: list[InlineQueryResult] = [] # ── 富文本直接 telegraph 发送 ── if parse_result.type == PostType.RICHTEXT: url = await create_richtext_telegraph(cli, parse_result) caption = build_caption(parse_result, url) results.append( InlineQueryResultArticle( title=title, description=parse_result.content, input_message_content=InputTextMessageContent( caption, link_preview_options=LinkPreviewOptions(show_above_text=True), ), ) ) return results caption = build_caption(parse_result) if not media_list: results.append( InlineQueryResultArticle( title=title, description=parse_result.content, input_message_content=InputTextMessageContent( caption, link_preview_options=LinkPreviewOptions(is_disabled=True), ), ) ) return results for index, media_ref in enumerate(media_list): if isinstance(media_ref, ImageRef): results.append( InlineQueryResultPhoto( media_ref.url, thumb_url=media_ref.thumb_url, photo_width=media_ref.width, photo_height=media_ref.height, caption=caption, title=title, description=parse_result.content, ) ) elif isinstance(media_ref, VideoRef): results.append( InlineQueryResultPhoto( media_ref.thumb_url or DEFAULT_THUMB_URL, photo_width=media_ref.width, photo_height=media_ref.height, id=f"download_{index}", title=caption, caption=caption, reply_markup=reply_markup, ) ) elif isinstance(media_ref, AniRef): if media_ref.ext != "gif": results.append( InlineQueryResultVideo( media_ref.url, media_ref.thumb_url or DEFAULT_THUMB_URL, caption=caption, title=title, description=parse_result.content, ) ) else: results.append( 
@Client.on_chosen_inline_result()
@with_request_id
async def inline_result_download(cli: Client, chosen_result: ChosenInlineResult) -> None:
    """Download the video behind a chosen ``download_N`` inline result and upload it.

    Results whose id does not start with ``download_`` are ignored, as are
    results without an ``inline_message_id``. The pipeline is run for the
    original query (reusing a cached parse result when available), then the
    media at index ``N`` replaces the placeholder inline message.

    Any failure after the pipeline has produced a result is logged and shown
    through the reporter, and the downloaded files are always cleaned up.
    """
    if not chosen_result.result_id.startswith("download_"):
        return
    media_index = int(chosen_result.result_id.split("_")[1])
    inline_message_id = chosen_result.inline_message_id
    if inline_message_id is None:
        return

    query = chosen_result.query
    logger.debug(f"inline 下载触发: media_index={media_index}, query={query}")

    raw_url = await ParseService().get_raw_url(query)
    cached_result = await parse_cache.get(raw_url)
    logger.debug(f"缓存命中: {cached_result is not None}")

    caption = build_caption(cached_result) if cached_result else ""
    reporter = InlineStatusReporter(cli, inline_message_id, caption)
    pipeline = ParsePipeline(query, reporter, parse_result=cached_result, singleflight=False)
    if (result := await pipeline.run()) is None:
        return

    # Everything below runs under try/finally so that result.cleanup() is
    # reached even when the index is out of range or a status update fails.
    try:
        parse_result = result.parse_result
        caption = build_caption(parse_result)

        # ── Upload ──
        await reporter.report("上 传 中...")

        processed = result.processed_list[media_index]
        video_ref = parse_result.media[media_index] if isinstance(parse_result.media, list) else parse_result.media

        file_paths = processed.output_paths or [processed.source.path]
        file_path_str = str(file_paths[0])
        logger.debug(f"inline 上传文件: {file_path_str}")
        width, height, duration = resolve_media_info(processed, file_path_str)
        video_cover = str(video_ref.thumb_url) if video_ref and video_ref.thumb_url else None
        # The cover argument is only passed when a thumbnail URL exists.
        media = (
            InputMediaVideo(
                file_path_str,
                caption=caption,
                video_cover=video_cover,
                duration=duration or 0,
                width=width or 0,
                height=height or 0,
                supports_streaming=True,
            )
            if video_cover
            else InputMediaVideo(
                file_path_str,
                caption=caption,
                duration=duration or 0,
                width=width or 0,
                height=height or 0,
                supports_streaming=True,
            )
        )
        await cli.edit_inline_media(inline_message_id, media=media)
    except Exception as e:
        logger.opt(exception=e).debug("详细堆栈")
        logger.error(f"inline 上传失败: {e}")
        await reporter.report_error("上传", e)
    finally:
        logger.debug("inline 下载任务完成")
        result.cleanup()
import FloodWait, SlowmodeWait from pyrogram.types import ( InputMediaAnimation, InputMediaDocument, InputMediaPhoto, InputMediaVideo, LinkPreviewOptions, Message, ) from core import bs from log import logger from plugins.filters import platform_filter from plugins.helpers import ( ProcessedMedia, build_caption, build_caption_by_str, create_richtext_telegraph, resolve_media_info, ) from services import ParseService from services.cache import CacheEntry, CacheMedia, CacheMediaType, CacheParseResult, parse_cache, persistent_cache from services.pipeline import ParsePipeline, PipelineResult, StatusReporter from utils.helpers import pack_dir_to_tar_gz, to_list, with_request_id logger = logger.bind(name="Parse") SKIP_DOWNLOAD_THRESHOLD = 0 MAX_RETRIES = 5 async def _send_with_rate_limit[T]( send_coro_fn: Callable[[], Awaitable[T]], ) -> T: """带自动重试的发送包装器。 Args: send_coro_fn: 返回协程的可调用对象(lambda 或函数),每次重试会重新调用 """ for attempt in range(MAX_RETRIES): try: return await send_coro_fn() except (FloodWait, SlowmodeWait) as e: if attempt < MAX_RETRIES - 1: logger.warning(f"{e.ID} 重试 ({attempt + 1}/{MAX_RETRIES}),等待 {e.value}s") await asyncio.sleep(e.value) else: raise e from e raise RuntimeError("发送重试失败") class MessageStatusReporter(StatusReporter): """基于 Telegram Message 的状态报告器""" def __init__(self, user_msg: Message): self._user_msg = user_msg self._msg: Message | None = None async def report(self, text: str) -> None: await self._edit_text(f"**▎{text}**") async def report_error(self, stage: str, error: Exception) -> None: await self._edit_text( f"**▎{stage}错误:** \n```\n{error}```", link_preview_options=LinkPreviewOptions(is_disabled=True), ) async def fn() -> None: await asyncio.sleep(15) if self._msg: await self._msg.delete() loop = asyncio.get_running_loop() loop.create_task(fn()) async def dismiss(self) -> None: if self._msg: await self._msg.delete() async def _edit_text(self, text: str, **kwargs: Any) -> None: try: if self._msg is None: self._msg = await 
self._user_msg.reply_text(text, **kwargs) else: if self._msg.text != text: await self._msg.edit_text(text, **kwargs) except (FloodWait, SlowmodeWait): pass # ── Handler ────────────────────────────────────────────────────────── @Client.on_message(filters.command(["jx", "raw", "zip"]) | ((filters.text | filters.caption) & platform_filter)) async def jx(cli: Client, msg: Message) -> None: mode = "preview" if msg.command: match msg.command[0]: case "raw": mode = "raw" case "jx": mode = "preview" case "zip": mode = "zip" text = " ".join(msg.command[1:]) if msg.command[1:] else "" if not text and msg.reply_to_message: text = msg.reply_to_message.text or msg.reply_to_message.caption or "" if not text: await msg.reply_text("**▎请加上链接或回复一条消息**") return else: text = msg.text or msg.caption or "" tokens = text.strip().split() urls = list({i for i in tokens if ParseService().parser.get_platform(i)})[:10] if not urls: await msg.reply_text("**▎不支持的平台**") return tasks = [handle_parse(cli, msg, url, mode) for url in urls] await asyncio.gather(*tasks) # ── 主流程 ─────────────────────────────────────────────────────────── @with_request_id async def handle_parse( cli: Client, msg: Message, url: str, mode: Literal["raw", "preview", "zip"] | str = "preview" ) -> None: chat_id = msg.chat.id if msg.chat else None logger.info(f"收到解析请求: url={url}, chat_id={chat_id}, msg_id={msg.id}, mode={mode}") reporter = MessageStatusReporter(msg) match mode: case "raw": use_caching = False skip_media_processing = True singleflight = False save_metadata = False case "zip": use_caching = False skip_media_processing = True singleflight = False save_metadata = True case _: use_caching = True skip_media_processing = False singleflight = True save_metadata = False try: raw_url = await ParseService().get_raw_url(url) except Exception as e: await reporter.report_error("获取原始链接", e) return if use_caching and (cached := await persistent_cache.get(raw_url)): logger.debug("file_id 缓存命中, 直接发送") await _send_cached(msg, 
cached, raw_url) return cached_parse_result = await parse_cache.get(raw_url) pipeline = ParsePipeline( url, reporter, parse_result=cached_parse_result, singleflight=singleflight, skip_media_processing=skip_media_processing, skip_download_threshold=SKIP_DOWNLOAD_THRESHOLD, save_metadata=save_metadata, ) if (result := await pipeline.run()) is None: if pipeline.waited: logger.debug("Singleflight 等待完成, 重新检查缓存") if cached := await persistent_cache.get(raw_url): await _send_cached(msg, cached, raw_url) else: await handle_parse(cli, msg, url, mode=mode) return else: logger.debug("Pipeline 返回 None, 跳过后续处理") return parse_result = result.parse_result await parse_cache.set(raw_url, parse_result) # ── 富文本 → Telegraph ── if parse_result.type == PostType.RICHTEXT: logger.debug(f"富文本类型, 创建 Telegraph 页面: title={parse_result.title}") try: await msg.reply_chat_action(enums.ChatAction.TYPING) ph_url = await create_richtext_telegraph(cli, parse_result) logger.debug(f"Telegraph 页面创建完成: {ph_url}") caption = build_caption(parse_result, ph_url) await msg.reply_text( caption, link_preview_options=LinkPreviewOptions(show_above_text=True), ) await persistent_cache.set( raw_url, CacheEntry( parse_result=CacheParseResult(title=parse_result.title, content=parse_result.content), telegraph_url=ph_url, ), ) await reporter.dismiss() return finally: pipeline.finish() caption = build_caption(parse_result) if not result.processed_list: logger.debug("无媒体文件, 仅发送文本") await msg.reply_chat_action(enums.ChatAction.TYPING) await msg.reply_text( caption, link_preview_options=LinkPreviewOptions(is_disabled=True), ) cache_entry = CacheEntry(parse_result=CacheParseResult(title=parse_result.title, content=parse_result.content)) await persistent_cache.set(raw_url, cache_entry) await reporter.dismiss() pipeline.finish() return if mode == "raw": await _send_raw(msg, result, reporter) return if mode == "zip": await _send_zip(msg, result, reporter) return # ── 上传媒体 ── logger.debug(f"开始上传媒体: 
def _build_input_media(
    media_refs: list[AnyMediaRef],
    processed_list: list[ProcessedMedia],
) -> tuple[list[InputMediaPhoto | InputMediaVideo], list[InputMediaAnimation]]:
    """Turn processed media into Telegram ``InputMedia`` objects.

    Media references and processed items are paired positionally; surplus
    entries on either side are ignored. Each output path of a processed item
    (or its source path when there are no outputs) yields one entry.

    Returns:
        ``(photos_videos, animations)``: photos and videos in one list,
        animations in another.
    """
    grouped: list[InputMediaPhoto | InputMediaVideo] = []
    anis: list[InputMediaAnimation] = []

    for ref, item in zip(media_refs, processed_list, strict=False):
        source = item.source
        for path in item.output_paths or [source.path]:
            path_str = str(path)
            width, height, duration = resolve_media_info(item, path_str)

            if isinstance(source, ImageFile):
                grouped.append(InputMediaPhoto(media=path_str))
            elif isinstance(source, AniFile):
                anis.append(InputMediaAnimation(media=path_str))
            elif isinstance(source, VideoFile):
                video = InputMediaVideo(
                    media=path_str,
                    video_cover=ref.thumb_url,
                    duration=duration,
                    width=width,
                    height=height,
                    supports_streaming=True,
                )
                grouped.append(video)
            elif isinstance(source, LivePhotoFile):
                # Live photo: the video part is sent, with the image path as its cover.
                live = InputMediaVideo(
                    media=source.video_path,
                    video_cover=path_str,
                    duration=duration,
                    width=width,
                    height=height,
                    supports_streaming=True,
                )
                grouped.append(live)

    return grouped, anis
def _make_cache_entry(parse_result: AnyParseResult, media_list: list[CacheMedia]) -> CacheEntry:
    """Bundle the title and content of ``parse_result`` with ``media_list`` into a cache entry."""
    summary = CacheParseResult(title=parse_result.title, content=parse_result.content)
    return CacheEntry(parse_result=summary, media=media_list)
async def _send_zip(
    msg: Message,
    result: PipelineResult,
    reporter: MessageStatusReporter,
) -> None:
    """Zip mode: pack the pipeline output directory and send it as one document.

    Runs in two phases, each with its own error handling:

    1. Pack ``result.output_dir`` in a worker thread. The pipeline result is
       cleaned up afterwards whether or not packing succeeded.
    2. Upload the archive with the caption. The archive file is removed
       afterwards unless ``bs.debug_skip_cleanup`` is set.

    Args:
        msg: Message to reply to.
        result: Pipeline result providing the parse result and output directory.
        reporter: Reporter used for progress and error messages; dismissed
            only after a successful upload.
    """
    logger.debug("Zip 模式, 开始打包")
    await reporter.report("打 包 中...")
    try:
        caption = build_caption(result.parse_result)
        if result.output_dir is None:
            raise ValueError("缺少打包目录")
        # Packing is delegated to a thread so the event loop is not blocked.
        pack_path = await asyncio.to_thread(pack_dir_to_tar_gz, result.output_dir)
    except Exception as e:
        logger.opt(exception=e).debug("详细堆栈")
        logger.error(f"打包失败: {e}")
        # The real error is only logged; the user sees a placeholder message.
        # NOTE(review): presumably intentional to avoid exposing local paths - confirm.
        await reporter.report_error("打包", Exception("..."))
        return
    finally:
        # Runs on success and on failure, including before the early return above.
        result.cleanup()

    await reporter.report("上 传 中...")
    try:
        await msg.reply_chat_action(enums.ChatAction.UPLOAD_DOCUMENT)
        await _send_with_rate_limit(lambda: msg.reply_document(str(pack_path), caption=caption))
    except Exception as e:
        logger.opt(exception=e).debug("详细堆栈")
        logger.error(f"上传失败: {e}")
        await reporter.report_error("上传", e)
        return
    finally:
        # Remove the archive after the upload attempt unless debugging keeps it.
        if not bs.debug_skip_cleanup:
            logger.debug("清理压缩包")
            os.remove(pack_path)
    await reporter.dismiss()
msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) sent = await _send_with_rate_limit(lambda: msg.reply_animation(animations[0].media, caption=caption)) else: single = photos_videos[0] match single: case InputMediaPhoto(): await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) sent = await _send_with_rate_limit(lambda: msg.reply_photo(single.media, caption=caption)) case InputMediaVideo(): await msg.reply_chat_action(enums.ChatAction.UPLOAD_VIDEO) sent = await _send_with_rate_limit( lambda: msg.reply_video( single.media, caption=caption, video_cover=single.video_cover, duration=single.duration, width=single.width, height=single.height, supports_streaming=True, ) ) if sent and (cm := _cache_media_from_message(sent)): media_list.append(cm) except Exception as e: logger.warning(f"上传失败 {e}, 使用兼容模式上传") await msg.reply_chat_action(enums.ChatAction.UPLOAD_DOCUMENT) await _send_with_rate_limit( lambda: msg.reply_document(all_media[0].media, caption=caption, force_document=True) ) return None return media_list async def _send_multi( msg: Message, photos_videos: list[InputMediaPhoto | InputMediaVideo], animations: list[InputMediaAnimation], caption: str, ) -> list[CacheMedia] | None: """发送多个媒体(动图逐条、图片视频分批),返回 CacheMedia 列表。 返回 None 表示不缓存 """ media_list: list[CacheMedia] = [] not_cache = False for ani in animations: await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) caption_ = caption if ani == animations[-1] and not photos_videos else "" try: sent = await _send_with_rate_limit( lambda a=ani, c=caption_: msg.reply_animation( # type: ignore[misc] a.media, caption=c, ) ) except Exception as e: logger.warning(f"上传失败 {e}, 使用兼容模式上传") not_cache = True await msg.reply_chat_action(enums.ChatAction.UPLOAD_DOCUMENT) await _send_with_rate_limit( lambda a=ani, c=caption_: msg.reply_document(a.media, caption=c, force_document=True) # type: ignore[misc] ) else: # 过大的 GIF 会返回 document if sent and sent.document: media_list.append(CacheMedia(type=CacheMediaType.DOCUMENT, 
async def _send_media(
    msg: Message, parse_result: AnyParseResult, processed_list: list[ProcessedMedia], caption: str
) -> CacheEntry | None:
    """Build the input media, send it, and return the matching cache entry.

    A single media item goes through the single-send path; anything else goes
    through the multi-send path.

    Returns:
        The cache entry for what was sent, or ``None`` when the result must
        not be cached.
    """
    refs: list[AnyMediaRef] = to_list(parse_result.media)
    photos_videos, animations = _build_input_media(refs, processed_list)

    ani_count = len(animations)
    pv_count = len(photos_videos)
    logger.debug(f"媒体分类完成: animations={ani_count}, photos_videos={pv_count}")

    total = pv_count + ani_count
    if total != 1:
        logger.debug(f"多媒体模式发送: total={total}")
        sent_media = await _send_multi(msg, photos_videos, animations, caption)
    else:
        logger.debug("单媒体模式发送")
        sent_media = await _send_single(msg, photos_videos, animations, caption)

    return None if sent_media is None else _make_cache_entry(parse_result, sent_media)
CacheEntry, url: str) -> None: """从 file_id 缓存直接发送,跳过解析/下载/转码""" logger.debug(f"缓存发送: media={entry.media}") if entry.parse_result is None: await persistent_cache.remove(url) return caption = build_caption_by_str(entry.parse_result.title, entry.parse_result.content, url, entry.telegraph_url) # 富文本类型 if entry.telegraph_url: await msg.reply_text( caption, link_preview_options=LinkPreviewOptions(show_above_text=True), ) return if not entry.media: await msg.reply_text( caption, link_preview_options=LinkPreviewOptions(is_disabled=True), ) return if len(entry.media) == 1: await _send_cached_single(msg, entry.media[0], caption) else: await _send_cached_multi(msg, entry.media, caption) async def _send_cached_single(msg: Message, m: CacheMedia, caption: str) -> None: """从缓存发送单个媒体。""" match m.type: case CacheMediaType.PHOTO: await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) await _send_with_rate_limit(lambda: msg.reply_photo(m.file_id, caption=caption)) case CacheMediaType.VIDEO: await msg.reply_chat_action(enums.ChatAction.UPLOAD_VIDEO) await _send_with_rate_limit( lambda: msg.reply_video( m.file_id, caption=caption, supports_streaming=True, video_cover=m.cover_file_id ) ) case CacheMediaType.ANIMATION: await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) await _send_with_rate_limit(lambda: msg.reply_animation(m.file_id, caption=caption)) case CacheMediaType.DOCUMENT: await msg.reply_chat_action(enums.ChatAction.UPLOAD_DOCUMENT) await _send_with_rate_limit(lambda: msg.reply_document(m.file_id, caption=caption, force_document=True)) async def _send_cached_multi(msg: Message, media: list[CacheMedia], caption: str) -> None: """从缓存发送多个媒体。""" animations = [m for m in media if m.type == CacheMediaType.ANIMATION] others = [m for m in media if m.type != CacheMediaType.ANIMATION] for ani in animations: await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) await _send_with_rate_limit( lambda a=ani: msg.reply_animation( # type: ignore[misc] a.file_id, 
caption=caption if a == animations[-1] and not others else "", ) ) media_group = _build_cached_media_group(others) for batch in batched(media_group, 10): if batch[-1] == media_group[-1]: batch[0].caption = caption await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) # noinspection PyDefaultArgument await _send_with_rate_limit(lambda m=list(batch): msg.reply_media_group(m)) # type: ignore[misc] def _build_cached_media_group( media: list[CacheMedia], ) -> list[InputMediaPhoto | InputMediaVideo | InputMediaDocument]: """从 CacheMedia 列表构建 Telegram media group。""" group: list[InputMediaPhoto | InputMediaVideo | InputMediaDocument] = [] for m in media: match m.type: case CacheMediaType.PHOTO: group.append(InputMediaPhoto(media=m.file_id)) case CacheMediaType.VIDEO: if m.cover_file_id: group.append(InputMediaVideo(media=m.file_id, supports_streaming=True, video_cover=m.cover_file_id)) else: group.append(InputMediaVideo(media=m.file_id, supports_streaming=True)) case CacheMediaType.DOCUMENT: group.append(InputMediaDocument(media=m.file_id)) return group ================================================ FILE: plugins/start.py ================================================ from pyrogram import Client, filters from pyrogram.types import LinkPreviewOptions, Message from plugins.helpers import build_start_text @Client.on_message(filters.command(["start", "help"])) async def start(_: Client, msg: Message) -> None: await msg.reply( build_start_text(), link_preview_options=LinkPreviewOptions(is_disabled=True), ) ================================================ FILE: pyproject.toml ================================================ [project] name = "parsehubbot" version = "0.1.0" description = "Add your description here" readme = "README.md" requires-python = ">=3.12" dependencies = [ "haishoku>=1.1.8", "httpx>=0.28.1", "kurigram>=2.2.7", "loguru>=0.6.0", "lxml-html-clean>=0.4.1", "markdown>=3.7", "parsehub>=2.0.17", "pickledb>=1.6", "pillow>=12.1.1", "pillow-heif>=1.1.1", 
"pydantic>=2.12.5", "pydantic-settings>=2.11.0", "python-dotenv>=1.0.1", "pyyaml>=6.0.3", "telegraph>=2.2.0", "tgcrypto>=1.2.5", "uvloop>=0.22.1 ; sys_platform != 'win32'", "winloop>=0.3.1 ; sys_platform == 'win32'", ] [tool.ruff] line-length = 120 [tool.ruff.lint] select = [ "E", # pycodestyle 错误检查 "W", # pycodestyle 警告检查 "F", # pyflakes 错误检查 "I", # isort 导入排序 "B", # flake8-bugbear 常见错误检查 "C4", # flake8-comprehensions 列表/字典推导式检查 "UP", # pyupgrade 自动升级语法 ] ignore = [ "B008", # 不在参数默认值中执行函数调用 "C901", # 函数复杂度过高 ] [dependency-groups] dev = [ "mypy>=2.1.0", ] [tool.mypy] python_version = "3.12" files = ["./"] ignore_missing_imports = true warn_return_any = true warn_unused_ignores = true check_untyped_defs = true disallow_untyped_defs = true no_implicit_optional = true ================================================ FILE: services/__init__.py ================================================ from .cache import CacheEntry, CacheMedia, CacheMediaType, CacheParseResult, parse_cache, persistent_cache from .parser import ParseService from .pipeline import ParsePipeline, PipelineProgressCallback, PipelineResult, StatusReporter __all__ = [ "ParseService", "parse_cache", "persistent_cache", "CacheEntry", "CacheMedia", "CacheMediaType", "CacheParseResult", "ParsePipeline", "PipelineResult", "PipelineProgressCallback", "StatusReporter", ] ================================================ FILE: services/cache.py ================================================ import asyncio import time from enum import StrEnum from typing import Any from pickledb import PickleDB from pydantic import BaseModel from core import bs from log import logger class TTLCache: def __init__(self, ttl: float = 300, cleanup_interval: float = 60, maxsize: int = 0): self._ttl = ttl self._store: dict[str, tuple[Any, float]] = {} self._lock = asyncio.Lock() self.logger = logger.bind(name="TTLCache") self._cleanup_interval = cleanup_interval self._cleanup_task: asyncio.Task | None = None self._maxsize = maxsize 
async def get(self, key: str) -> Any | None: async with self._lock: entry = self._store.get(key) if entry is None: self.logger.debug(f"缓存未命中: key={key}") return None value, expire_at = entry if time.monotonic() > expire_at: self.logger.debug(f"缓存已过期: key={key}") del self._store[key] return None self.logger.debug(f"缓存命中: key={key}") return value async def set(self, key: str, value: Any, ttl: float | None = None) -> None: async with self._lock: effective_ttl = ttl or self._ttl self.logger.debug(f"缓存写入: key={key}, ttl={effective_ttl}s") if key in self._store: del self._store[key] self._store[key] = (value, time.monotonic() + effective_ttl) await self._evict_overflow_locked() async def _evict_overflow_locked(self) -> None: if self._maxsize <= 0: return overflow = len(self._store) - self._maxsize if overflow <= 0: return for key in list(self._store)[:overflow]: del self._store[key] self.logger.debug(f"缓存数量超限, 淘汰最旧缓存: {overflow} 条") async def pop(self, key: str) -> Any | None: async with self._lock: entry = self._store.pop(key, None) if entry is None: self.logger.debug(f"缓存 pop 未命中: key={key}") return None value, expire_at = entry if time.monotonic() > expire_at: self.logger.debug(f"缓存 pop 已过期: key={key}") return None self.logger.debug(f"缓存 pop 命中: key={key}") return value def start_cleanup(self) -> None: """启动后台清理任务(需在事件循环运行后调用)""" if self._cleanup_task is None: self._cleanup_task = asyncio.create_task(self._periodic_cleanup()) self.logger.debug(f"后台清理任务已启动, interval={self._cleanup_interval}s") async def _periodic_cleanup(self) -> None: while True: await asyncio.sleep(self._cleanup_interval) async with self._lock: now = time.monotonic() expired_keys = [k for k, (_, exp) in self._store.items() if now > exp] for k in expired_keys: del self._store[k] if expired_keys: self.logger.debug(f"定时清理过期缓存: {len(expired_keys)} 条") class CacheMediaType(StrEnum): PHOTO = "photo" VIDEO = "video" ANIMATION = "animation" DOCUMENT = "document" class CacheParseResult(BaseModel): title: str 
= "" content: str = "" class CacheMedia(BaseModel): type: CacheMediaType file_id: str cover_file_id: str | None = None class CacheEntry(BaseModel): parse_result: CacheParseResult | None = None media: list[CacheMedia] | None = None telegraph_url: str | None = None class _StorageWrapper(BaseModel): entry: CacheEntry exp: int = 0 class PersistentCache: def __init__( self, db_path: str, ttl: int, save_interval: float = 5 * 60, cleanup_interval: float = 60 * 60, max_entries: int = 30000, ): self._db = PickleDB(db_path) self._ttl = ttl self.logger = logger.bind(name="PersistentCache") self.logger.debug(f"缓存已初始化: {db_path}") self._save_interval = save_interval self._cleanup_interval = cleanup_interval self._max_entries = max_entries self._cleanup_task: asyncio.Task | None = None self._lock = asyncio.Lock() self._loaded = False self._dirty = False self._last_cleanup_at = 0.0 @property def enabled(self) -> bool: return self._ttl > 0 async def _ensure_loaded_locked(self) -> None: if self._loaded: return await self._db.load() self._loaded = True self._last_cleanup_at = time.monotonic() removed = await self._evict_overflow_locked() if removed: self._dirty = True self.logger.debug(f"缓存已加载: {self._db.location}, evicted={removed}") async def _save_locked(self) -> None: if not self._loaded or not self._dirty: return await self._db.save() self._dirty = False self.logger.debug("缓存已保存") async def get(self, url: str) -> CacheEntry | None: if not self.enabled: return None async with self._lock: await self._ensure_loaded_locked() data = await self._db.get(url) if data is None: return None if data.get("exp", 0) <= time.time(): self.logger.debug(f"缓存过期: key={url}") if await self._db.remove(url): self._dirty = True return None self.logger.debug(f"缓存命中: key={url}") return _StorageWrapper.model_validate(data).entry async def set(self, url: str, entry: CacheEntry) -> None: if not self.enabled: return sw = _StorageWrapper(entry=entry, exp=int(time.time() + self._ttl)) async with self._lock: 
await self._ensure_loaded_locked() await self._db.remove(url) await self._db.set(url, sw.model_dump()) removed = await self._evict_overflow_locked() self._dirty = True self.logger.debug(f"缓存写入: key={url}, evicted={removed}") async def remove(self, url: str) -> None: if not self.enabled: return async with self._lock: await self._ensure_loaded_locked() if await self._db.remove(url): self._dirty = True def start_cleanup(self) -> None: """启动后台清理任务""" if not self.enabled: self.logger.debug("持久缓存已禁用, 跳过后台任务") return if self._cleanup_task is None: self._cleanup_task = asyncio.create_task(self._periodic_cleanup()) self.logger.debug( f"后台缓存任务已启动, save_interval={self._save_interval}s, cleanup_interval={self._cleanup_interval}s" ) async def close(self) -> None: if self._cleanup_task: self._cleanup_task.cancel() try: await self._cleanup_task except asyncio.CancelledError: pass self._cleanup_task = None if not self.enabled: return async with self._lock: await self._save_locked() async def _periodic_cleanup(self) -> None: while True: await asyncio.sleep(self._save_interval) if not self._loaded: continue async with self._lock: now = time.monotonic() if now - self._last_cleanup_at >= self._cleanup_interval: expired = await self._remove_expired_locked() overflow = await self._evict_overflow_locked() if expired or overflow: self._dirty = True self.logger.debug(f"定时清理缓存: expired={expired}, overflow={overflow}") self._last_cleanup_at = now await self._save_locked() async def _remove_expired_locked(self) -> int: now = time.time() removed = 0 all_keys = await self._db.all() for key in all_keys: data = await self._db.get(key) if data and data.get("exp", 0) <= now: await self._db.remove(key) removed += 1 return removed async def _evict_overflow_locked(self) -> int: if self._max_entries <= 0: return 0 keys = await self._db.all() overflow = len(keys) - self._max_entries if overflow <= 0: return 0 for key in keys[:overflow]: await self._db.remove(key) return overflow parse_cache = 
TTLCache(ttl=30 * 60, maxsize=1000) # 解析结果缓存 30 分钟 persistent_cache = PersistentCache( str(bs.cache_path / "cache.json"), ttl=bs.cache_time * 60, save_interval=bs.cache_save_interval * 60, cleanup_interval=bs.cache_cleanup_interval * 60, max_entries=bs.cache_max_entries, ) ================================================ FILE: services/parser.py ================================================ from typing import Self from parsehub import ParseHub, Platform from parsehub.types import ( AnyParseResult, ) from core import pl_cfg from log import logger logger = logger.bind(name="ParseService") class ParseService: _instance: Self | None = None def __new__(cls) -> Self: if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self) -> None: self.parser = ParseHub() def get_platform(self, url: str) -> Platform: p = self.parser.get_platform(url) if not p: raise ValueError("不支持的平台") return p async def parse(self, url: str) -> AnyParseResult: logger.debug(f"开始解析 {url}") p = self.get_platform(url) max_retries = 3 for attempt in range(1, max_retries + 1): try: cookie = pl_cfg.roll_cookie(p.id) proxy = pl_cfg.roll_parser_proxy(p.id) logger.debug(f"使用配置: proxy={proxy}, cookie={cookie}, attempt={attempt}/{max_retries}") pr = await self.parser.parse(url, cookie=cookie, proxy=proxy) logger.debug(f"解析完成: {pr}") return pr except Exception as e: logger.warning(f"解析失败, attempt={attempt}/{max_retries}, err={e}") if attempt >= max_retries: raise Exception(e) from e raise async def get_raw_url(self, url: str, clean_all: bool = True) -> str: p = self.get_platform(url) max_retries = 3 for attempt in range(1, max_retries + 1): try: proxy = pl_cfg.roll_parser_proxy(p.id) logger.debug(f"使用配置: proxy={proxy}, attempt={attempt}/{max_retries}") raw_url = await self.parser.get_raw_url(url, proxy=proxy, clean_all=clean_all) logger.debug(f"原始 URL: {raw_url}") return str(raw_url) except Exception as e: logger.warning(f"获取原始 URL 失败, 
attempt={attempt}/{max_retries}, err={e}") if attempt >= max_retries: raise Exception(e) from e raise ================================================ FILE: services/pipeline.py ================================================ import asyncio import shutil from collections.abc import Awaitable, Callable from dataclasses import dataclass, field from pathlib import Path from typing import Any, Protocol from parsehub import DownloadResult from parsehub.types import AnyParseResult, PostType, ProgressUnit from core import bs, pl_cfg from log import logger from plugins.helpers import ProcessedMedia, process_media_files from services import ParseService from utils.helpers import to_list logger = logger.bind(name="Pipeline") _inflight: dict[str, asyncio.Event] = {} class StatusReporter(Protocol): """抽象状态通知,由调用方实现""" async def report(self, text: str) -> None: ... async def report_error(self, stage: str, error: Exception) -> None: ... async def dismiss(self) -> None: ... @dataclass class PipelineResult: parse_result: AnyParseResult processed_list: list[ProcessedMedia] = field(default_factory=list) output_dir: Path | None = None def cleanup(self) -> None: if bs.debug_skip_cleanup: logger.debug("debug_skip_cleanup=True 跳过清理") return if self.output_dir: logger.debug("清理资源") shutil.rmtree(self.output_dir, ignore_errors=True) class PipelineProgressCallback: """统一的下载进度回调,依赖 StatusReporter""" def __init__(self, reporter: StatusReporter): self._reporter = reporter self._last_text: str | None = None async def __call__(self, current: int, total: int, unit: ProgressUnit, *args: Any, **kwargs: Any) -> None: from plugins.helpers import progress as fmt_progress text = fmt_progress(current, total, unit) if not text or text == self._last_text: return self._last_text = text await self._reporter.report(text) class ParsePipeline: """ 将 解析 → 下载 → 格式转换 封装为一条流水线。 上传逻辑仍由调用方负责。 内置 Singleflight 机制:对同一 URL 的并发调用只会执行一次流水线, 其余调用等待 Event 完成后返回 None(调用方应重新检查缓存)。 首个调用方在完成上传+缓存后必须调用 finish() 以释放等待者。 """ def 
__init__( self, url: str, reporter: StatusReporter, parse_result: AnyParseResult | None = None, *, singleflight: bool = True, skip_media_processing: bool = False, skip_download_threshold: int = 0, richtext_skip_download: bool = True, save_metadata: bool = False, ): self._url = url self._reporter = reporter self._parse_result = parse_result self._waited = False self._singleflight = singleflight self._skip_media_processing = skip_media_processing self._skip_download_threshold = skip_download_threshold self._richtext_skip_download = richtext_skip_download self._save_metadata = save_metadata @property def waited(self) -> bool: """是否因 singleflight 而等待了其他流水线""" return self._waited def finish(self) -> None: """首个调用方完成上传+缓存后调用,释放所有等待者""" event = _inflight.pop(self._url, None) if event is not None: event.set() async def run(self) -> PipelineResult | None: """执行流水线,返回 PipelineResult 或 None(失败时已通知)""" if self._singleflight: key = self._url existing = _inflight.get(key) if existing is not None: self._waited = True logger.debug(f"Singleflight 命中, 等待已有流水线: url={key}") await self._reporter.report("已有相同任务正在解析, 等待解析完成...") await existing.wait() await self._reporter.dismiss() return None event = asyncio.Event() _inflight[key] = event try: result = await self._execute() if result is None: self.finish() # 流水线失败,立即释放等待者 return result except BaseException: self.finish() # 流水线异常,立即释放等待者 raise async def _execute(self) -> PipelineResult | None: """实际执行流水线逻辑""" logger.debug(f"流水线启动: url={self._url}, has_cached_result={self._parse_result is not None}") ps = ParseService() # ── 1. 
解析 ── if self._parse_result is not None: logger.debug("使用缓存的解析结果") parse_result = self._parse_result else: await self._reporter.report("解 析 中...") parse_result = await self._step("解析", lambda: ps.parse(self._url)) if parse_result is None: return None if self._richtext_skip_download and parse_result.type == PostType.RICHTEXT: logger.debug("富文本跳过下载") return PipelineResult(parse_result=parse_result) if self._skip_download_threshold and len(to_list(parse_result.media)) > self._skip_download_threshold: logger.debug( f"媒体数量({len(to_list(parse_result.media))})大于设定值({self._skip_download_threshold}), 跳过下载" ) return PipelineResult(parse_result=parse_result) # ── 2. 下载 ── await self._reporter.report("下 载 中...") p = ps.parser.get_platform(self._url) proxy = pl_cfg.roll_downloader_proxy(p.id) logger.debug(f"使用配置: proxy={proxy}") progress_cb = PipelineProgressCallback(self._reporter) download_result: DownloadResult = await self._step( "下载", lambda: parse_result.download( bs.download_dir, callback=progress_cb, callback_args=(), proxy=proxy, save_metadata=self._save_metadata ), timeout=60 * 30, # 30分钟 ) if download_result is None: return None logger.debug(f"下载完成: output_dir={download_result.output_dir}") # ── 3. 
格式转换 ── if self._skip_media_processing: logger.debug(f"流水线完成: download_result={download_result}") processed_list = [ProcessedMedia(i, [i.path]) for i in to_list(download_result.media)] return PipelineResult( parse_result=parse_result, processed_list=processed_list, output_dir=download_result.output_dir ) await self._reporter.report("处 理 中...") maybe_processed_list = await self._step( "格式转换", lambda: process_media_files(download_result), cleanup=lambda: shutil.rmtree(download_result.output_dir, ignore_errors=True), ) if maybe_processed_list is None: return None processed_list = maybe_processed_list logger.debug(f"流水线完成: processed_count={len(processed_list)}") return PipelineResult( parse_result=parse_result, processed_list=processed_list, output_dir=download_result.output_dir, ) async def _step[T]( self, stage: str, action: Callable[[], Awaitable[T]], cleanup: Callable[[], None] | None = None, timeout: float | None = None, ) -> T | None: """执行单个步骤,失败时统一处理""" logger.debug(f"执行步骤: {stage}") try: coro = action() if timeout is not None: return await asyncio.wait_for(coro, timeout=timeout) return await coro except TimeoutError: logger.error(f"{stage}超时 (>{timeout}s)") await self._reporter.report_error(stage, TimeoutError(f"{stage}超时 (>{timeout}s)")) if cleanup: cleanup() return None except Exception as e: logger.exception(e) logger.error(f"{stage}失败, 以上为错误信息") await self._reporter.report_error(stage, e) if cleanup: cleanup() return None ================================================ FILE: utils/__init__.py ================================================ ================================================ FILE: utils/converter.py ================================================ # FROM https://github.com/mercuree/html-telegraph-poster/blob/7212225e28a0206803c32e67d1185bbfbd1fc181/html_telegraph_poster/converter.py import re from lxml.html.clean import Cleaner allowed_tags = ( "a", "aside", "b", "blockquote", "br", "code", "em", "figcaption", "figure", "h3", "h4", "hr", 
"i", "iframe", "img", "li", "ol", "p", "pre", "s", "strong", "u", "ul", "video", ) telegram_embed_script_re = re.compile( r"""""", re.IGNORECASE, ) pre_content_re = re.compile(r"<(pre|code)(>|\s[^>]*>)[\s\S]*?\1>") line_breaks_inside_pre = re.compile(r"
start/end postion
for x in pre_content_re.finditer(html_string):
start, end = x.start(), x.end()
pre_ranges.extend((start, end))
pre_ranges.append(len(html_string))
# all odd elements are , leave them untouched
for k in range(1, len(pre_ranges)):
part = html_string[pre_ranges[k - 1] : pre_ranges[k]]
if k % 2 == 0:
out += line_breaks_inside_pre.sub("\n", part)
else:
out += line_breaks_and_empty_strings.sub(replace_by, part)
return out
================================================
FILE: utils/event_loop.py
================================================
import importlib
import sys
from log import logger
def setup_optimized_event_loop() -> bool:
    """Install winloop (on Windows) or uvloop (elsewhere) as the event loop.

    Returns:
        True when the optimized loop module was imported and installed,
        False when it is missing or installing it raised.
    """
    loop_module = "winloop" if sys.platform == "win32" else "uvloop"

    try:
        # Import by name so the module that does not apply to this platform is never required.
        importlib.import_module(loop_module).install()
        logger.debug(f"{loop_module} 已启用")
        return True
    except ImportError:
        failure = f"{loop_module} 未安装"
    except Exception as e:
        failure = f"启用 {loop_module} 时出错: {e}"

    # Both failure paths log their reason and fall back to the default loop.
    logger.debug(failure)
    logger.debug("使用标准 asyncio 事件循环")
    return False
================================================
FILE: utils/helpers.py
================================================
import asyncio
import functools
import tarfile
import uuid
from collections.abc import Awaitable, Callable
from pathlib import Path
from typing import Any, overload
from log import logger
async def run_cmd(*cmd: str, timeout: float = 30) -> str:
"""运行外部命令并异步读取输出"""
proc = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.DEVNULL,
)
try:
stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=timeout)
except TimeoutError:
proc.kill()
await proc.wait()
return ""
return stdout.decode().strip()
@overload
def to_list[T](v: list[T]) -> list[T]: ...
@overload
def to_list[T](v: T) -> list[T]: ...
def to_list[T](v: T | list[T]) -> list[T]:
return v if isinstance(v, list) else [v]
def pack_dir_to_tar_gz(dir_path: str | Path, output_path: str | Path | None = None) -> Path:
"""
将目录打包为 tar.gz,返回压缩包路径。
Args:
dir_path: 要打包的目录
output_path: 输出压缩包路径;不传则默认生成同名 .tar.gz
Returns:
生成的 tar.gz 文件路径
"""
source_dir = Path(dir_path).resolve()
if not source_dir.is_dir():
raise ValueError(f"不是有效目录: {source_dir}")
if output_path is None:
output_path = source_dir.with_suffix(".tar.gz")
else:
output_path = Path(output_path).resolve()
with tarfile.open(output_path, "w:gz") as tar:
tar.add(source_dir, arcname=source_dir.name)
return output_path
def with_request_id[T](func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
@functools.wraps(func)
async def wrapper(*args: Any, **kwargs: Any) -> T:
request_id = str(uuid.uuid4())[:8]
with logger.contextualize(req_id=request_id):
return await func(*args, **kwargs)
return wrapper
================================================
FILE: utils/media_processing_unit.py
================================================
"""媒体处理器 — 将图片/视频转换为 Telegram 兼容格式"""
import asyncio
import math
import mimetypes
import os
import time
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from haishoku.haishoku import Haishoku
from loguru import logger
from PIL import Image, ImageOps
from PIL.Image import Resampling
from utils.helpers import run_cmd
@dataclass
class MediaProcessResult:
    """Uniform result returned by the media processing methods."""

    # Paths of the files produced by processing, or the input path itself when
    # the media was passed through unchanged.
    output_paths: list[Path]
    # Directory created to hold split outputs (image segments or video parts);
    # None when processing did not create one.
    temp_dir: Path | None = None
class MediaProcessingUnit:
"""媒体处理器,将媒体转换为 Telegram 兼容的格式
Telegram 限制:
- 图片宽高比 / 高宽比不能超过 20:1
- 单次最多发送 10 张图片
用法:
mpu = MediaProcessingUnit(output_dir=Path("./output"))
result = await mpu.process("media.mp4")
"""
def __init__(
    self,
    output_dir: str | Path,
    segment_height: int = 1400,
    medium_threshold: int = 2,
    overlap: int = 100,
    logger: Callable = logger.info,
):
    """Store the processing settings and make sure the output directory exists.

    Args:
        output_dir: Directory that receives generated files; created if missing.
        segment_height: Height in pixels of each slice when a long image is split.
        medium_threshold: Images whose height holds fewer than this many full
            segments are split in halves instead of into fixed-height slices.
        overlap: Number of pixels each slice after the first extends upward
            into the previous slice.
        logger: Callable invoked with each log message string.
    """
    self.output_dir = Path(output_dir)
    self.output_dir.mkdir(parents=True, exist_ok=True)

    self.logger = logger
    self.overlap = overlap
    self.medium_threshold = medium_threshold
    self.segment_height = segment_height
# ------------------------------------------------------------------ #
# 公共入口
# ------------------------------------------------------------------ #
async def process(self, file_path: str | Path) -> MediaProcessResult:
media_type = self.get_media_type_by_mime(file_path)
self.logger(f"开始处理媒体: path={file_path}, type={media_type}")
if media_type == "image":
return await self.process_image(Path(file_path))
elif media_type == "video":
return await self.process_video(Path(file_path))
else:
raise ValueError(f"Unsupported media type: {file_path}")
# ------------------------------------------------------------------ #
# 图片处理
# ------------------------------------------------------------------ #
async def process_image(self, file_path: Path) -> MediaProcessResult:
    """Convert, adapt and downscale an image as needed.

    HEIF/HEIC/AVIF inputs are first converted to WebP. The image is then
    padded or split if its aspect ratio requires it; otherwise it is
    downscaled when a side is too long. Intermediate files that are not part
    of the returned result are deleted before returning.
    """
    suffix = file_path.suffix.lower()
    temp_files: list[Path] = []  # every intermediate file created along the way
    try:
        source = file_path
        if suffix in {".heif", ".heic", ".avif"}:
            self.logger(f"图片格式需转换: {suffix} -> webp")
            source = await asyncio.to_thread(self._img2webp, file_path)
            temp_files.append(source)

        adapted = await asyncio.to_thread(self._adapt_image, source)
        if adapted:
            return adapted

        # No padding or splitting was needed; fall back to a plain downscale.
        downscaled = await asyncio.to_thread(self._downscale_image, source)
        if downscaled:
            temp_files.append(downscaled)
            source = downscaled

        # The file being returned must survive the cleanup below.
        temp_files = [path for path in temp_files if path != source]
        return MediaProcessResult(output_paths=[source])
    finally:
        for leftover in temp_files:
            if leftover.exists():
                self.logger(f"删除中间文件: {leftover}")
                os.remove(leftover)
def _adapt_image(self, file_path: Path) -> MediaProcessResult | None:
    """Pad or split an image whose aspect ratio is out of range.

    Landscape images wider than 20:1 are padded. Narrow portrait images
    (width below 200 px) taller than 20:1 are padded. Other portrait images
    taller than 5:1 are split into segments.

    Returns:
        The processing result, or None when the image needs no adaptation.
    """
    with Image.open(file_path) as img:
        w, h = img.width, img.height
    wh_ratio = w / h
    hw_ratio = h / w
    self.logger(f"图片尺寸: {w}x{h}, wh_ratio={wh_ratio:.2f}, hw_ratio={hw_ratio:.2f}")

    if w >= h:
        # Landscape (or square) image.
        if wh_ratio <= 20:
            self.logger("横图比例正常,跳过处理")
            return None
        self.logger("横图比例超限,需要填充")
        padding = self._calc_padding_horizontal(w, h)
        with Image.open(file_path) as img:
            return self._pad_image(file_path, img, padding)

    # Portrait image.
    # A ratio of exactly 20 is within the limit, matching the landscape check;
    # previously it fell through both tests and the narrow image was split.
    if hw_ratio <= 5 or (w < 200 and hw_ratio <= 20):
        self.logger("竖图比例正常,跳过处理")
        return None
    if w < 200 and hw_ratio > 20:
        self.logger("窄竖图比例超限,需要填充")
        padding = self._calc_padding_vertical(w, h)
        with Image.open(file_path) as img:
            return self._pad_image(file_path, img, padding)

    # Long image: split into segments.
    segments = h // self.segment_height
    # Round the half up so an odd height yields two segments rather than two
    # halves plus a one-pixel remainder segment.
    seg_h = (h + 1) // 2 if segments < self.medium_threshold else self.segment_height
    self.logger(f"长图切割: segments={segments}, seg_h={seg_h}")
    return self._split_image(file_path, seg_h)
def _img2webp(self, file_path: Path) -> Path:
    """Save an RGBA WebP copy of the image in the output directory and return its path."""
    with Image.open(file_path) as pil_img:
        if pil_img.mode == "RGBA":
            img = pil_img
        else:
            img = pil_img.convert("RGBA")
        # Same file name as the input, with the extension swapped for .webp.
        output = self.output_dir / file_path.with_suffix(".webp").name
        img.save(output, format="WEBP")
    self.logger(f"webp 转换完成: {output}")
    return output
def _downscale_image(self, file_path: Path, max_side: int = 2560) -> Path | None:
    """Shrink an image proportionally so its longer side equals ``max_side``.

    Returns:
        Path of the resized copy in the output directory, or None when
        neither side exceeds ``max_side``.
    """
    with Image.open(file_path) as img:
        w, h = img.size
        longest = max(w, h)
        if longest <= max_side:
            return None

        scale = max_side / longest
        new_w, new_h = int(w * scale), int(h * scale)
        self.logger(f"图片长边超限({longest}px > {max_side}px),缩放: {w}x{h} -> {new_w}x{new_h}")
        resized = img.resize((new_w, new_h), Resampling.LANCZOS)

        # Name the copy after the detected format, falling back to the input's extension.
        if img.format:
            ext = f".{img.format.lower()}"
        else:
            ext = file_path.suffix
        out_path = self.output_dir / f"downscaled_{time.time_ns()}{ext}"
        resized.save(out_path)
    return out_path
# -- 图片辅助 --------------------------------------------------------- #
@staticmethod
def _calc_padding_horizontal(w: int, h: int) -> tuple[int, int, int, int]:
h_padding = w // 20 - h // 2
return 0, h_padding, 0, h_padding
@staticmethod
def _calc_padding_vertical(w: int, h: int) -> tuple[int, int, int, int]:
w_padding = h // 20 - w // 2
return w_padding, 0, w_padding, 0
@staticmethod
def _get_dominant_color(file_path: Path) -> tuple[int, ...]:
    """Return the first palette colour reported by Haishoku, with every channel scaled to 80%."""
    palette = Haishoku.loadHaishoku(str(file_path)).palette
    first_color = palette[0][1]
    return tuple(int(channel * 0.8) for channel in first_color)
def _pad_image(
    self,
    file_path: Path,
    img: Image.Image,
    padding: tuple[int, int, int, int],
) -> MediaProcessResult:
    """Expand the image by ``padding``, filled with its darkened dominant colour, and save it as PNG."""
    fill_color = self._get_dominant_color(file_path)
    expanded = ImageOps.expand(img, padding, fill=fill_color)

    # A nanosecond timestamp is used as the output file name.
    out_path = self.output_dir / f"padded_{time.time_ns()}.png"
    expanded.save(out_path)

    self.logger(f"填充完成: padding={padding}, color={fill_color}, output={out_path}")
    return MediaProcessResult(output_paths=[out_path])
def _split_image(self, file_path: Path, segment_height: int) -> MediaProcessResult:
    """Cut the image into segments stored in a new ``split_<timestamp>`` directory under the output directory."""
    split_dir = self.output_dir / f"split_{time.time_ns()}"
    split_dir.mkdir(parents=True, exist_ok=True)

    pieces = self._do_split(file_path, split_dir, segment_height)
    self.logger(f"图片切割完成: {len(pieces)} 段, output_dir={split_dir}")
    return MediaProcessResult(temp_dir=split_dir, output_paths=pieces)
def _do_split(
    self,
    input_path: Path,
    output_dir: Path,
    segment_height: int,
) -> list[Path]:
    """Crop the image into full-width horizontal slices and save each as a numbered PNG.

    Every slice after the first starts ``self.overlap`` pixels higher, so
    neighbouring slices share a band of rows.

    Returns:
        Paths of the saved slices, top to bottom.
    """
    with Image.open(input_path) as img:
        width, height = img.size
        num_segments = math.ceil(height / segment_height)
        self.logger(f"切割参数: size={width}x{height}, segment_h={segment_height}, num={num_segments}")

        saved: list[Path] = []
        for number in range(1, num_segments + 1):
            start = (number - 1) * segment_height
            top = start - self.overlap if number > 1 else start
            # The last slice is clipped to the bottom of the image.
            bottom = min(number * segment_height, height)

            out_path = output_dir / f"segment_{number:03d}.png"
            img.crop((0, top, width, bottom)).save(out_path)
            saved.append(out_path)
    return saved
# ------------------------------------------------------------------ #
# 视频处理
# ------------------------------------------------------------------ #
async def process_video(self, file_path: Path) -> MediaProcessResult:
    """Transcode a video to H.264 when needed and split it when it exceeds 2 GiB.

    Returns:
        A result holding either the single (possibly transcoded) video, or
        the split parts together with the directory that contains them.
    """
    codec = await self.get_video_codec(file_path)
    self.logger(f"视频编码: codec={codec}, path={file_path}")

    source = file_path
    if codec != "h264":
        self.logger("编码非 h264,开始转码")
        source = await self.ensure_h264(file_path)
        self.logger(f"转码完成: {source}")
    # ensure_h264 hands back the input path when transcoding fails, so only a
    # different path is a temporary file that this method owns.
    transcoded = source != file_path

    video_size = source.stat().st_size
    self.logger(f"视频大小: {video_size / 1024 / 1024:.1f} MB")

    if video_size > 2 * 1024**3:  # 2 GiB
        self.logger("视频超过 2 GiB,开始分割")
        output_paths, output_dir = await self.split_video(source, self.output_dir)
        if transcoded:
            # Remove only the transcoded intermediate, never the caller's input file.
            os.remove(source)
        return MediaProcessResult(output_paths=output_paths, temp_dir=output_dir)

    return MediaProcessResult(output_paths=[source])
@staticmethod
async def get_video_codec(file_path: Path) -> str:
    """Ask ffprobe for the codec name of the first video stream; returns it lower-cased, or "" when ffprobe printed nothing."""
    probe_args = (
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=codec_name",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(file_path),
    )
    out = await run_cmd(*probe_args)
    if not out:
        return ""
    return out.strip().lower()
@staticmethod
async def get_duration(file_path: Path) -> float:
    """Ask ffprobe for the container duration in seconds.

    Returns:
        The duration, or 0.0 when ffprobe printed nothing or printed a value
        that is not a number.
    """
    out = await run_cmd(
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(file_path),
    )
    text = out.strip() if out else ""
    if not text:
        return 0.0
    try:
        return float(text)
    except ValueError:
        # Non-numeric output means the duration is unknown; fall back to 0.0
        # the same way _get_video_height falls back to 0.
        return 0.0
async def ensure_h264(self, file_path: Path) -> Path:
    """Transcode a video to H.264/AAC with software libx264.

    The output is written to ``self.output_dir`` as ``<stem>_h264<suffix>``.

    Args:
        file_path: Source video.

    Returns:
        The transcoded file on success; ``file_path`` itself when ffmpeg
        fails, so callers can tell the two apart by comparing paths.
    """
    out = self.output_dir / (file_path.stem + "_h264" + file_path.suffix)
    duration = await self.get_duration(file_path)
    height = await self._get_video_height(file_path)
    cmd = self._build_sw_transcode_cmd(file_path, out, duration, height)
    self.logger(f"h264 转码: {file_path.name} -> {out.name}, duration={duration:.0f}s, encoder=SW:libx264")
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    returncode = await proc.wait()
    # A non-empty file is not enough: an ffmpeg run that aborts midway can
    # leave a truncated output behind, so the exit status must be 0 as well.
    if returncode == 0 and out.exists() and out.stat().st_size > 0:
        self.logger(f"h264 转码成功: size={out.stat().st_size / 1024 / 1024:.1f}MB")
        return out
    try:
        # Best-effort cleanup of a partial output from the failed run.
        out.unlink(missing_ok=True)
    except OSError:
        pass
    self.logger(f"h264 转码失败,返回原文件: {file_path}")
    return file_path
@staticmethod
async def _get_video_height(file_path: Path) -> int:
    """Ask ffprobe for the pixel height of the first video stream.

    Returns 0 when ffprobe yields nothing or something that is not a plain
    run of digits.
    """
    raw = await run_cmd(
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=height",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(file_path),
    )
    text = raw.strip() if raw else ""
    if text.isdigit():
        return int(text)
    return 0
def _build_sw_transcode_cmd(self, file_path: Path, out: Path, duration: float, height: int) -> list[str]:
    """Assemble the ffmpeg argv for a libx264 + AAC software transcode.

    Longer videos get a faster preset and a higher CRF; videos longer than
    1800 s and taller than 720 px are additionally scaled down to 720p.
    """
    # (upper duration bound in seconds, x264 preset, CRF) — shortest first.
    tiers = (
        (30, "slow", "18"),
        (60, "medium", "20"),
        (600, "fast", "23"),
        (1800, "veryfast", "26"),
    )
    preset, crf = next(
        ((tier_preset, tier_crf) for limit, tier_preset, tier_crf in tiers if duration <= limit),
        ("ultrafast", "28"),
    )

    scale: list[str] = []
    if duration > 1800 and height > 720:
        scale = ["-vf", "scale=-2:720"]

    self.logger(f"SW 转码策略: preset={preset}, crf={crf}, scale={'720p' if scale else 'original'}")

    command = ["ffmpeg", "-i", str(file_path), "-c:v", "libx264", "-preset", preset, "-crf", crf]
    command += scale
    command += ["-c:a", "aac", "-y", str(out)]
    return command
async def split_video(
    self,
    file_path: Path,
    output_dir: Path,
    size_limit: int = 2_000_000_000,
    ffmpeg_args: list[str] | None = None,
    keep_sec: float = 1.0,
) -> tuple[list[Path], Path]:
    """Cut a video into consecutive segments no larger than ``size_limit``.

    Each segment is produced by one ffmpeg run that seeks to the current
    offset (``-ss``) and stops writing at ``size_limit`` bytes (``-fs``).
    The next offset is the previous one plus the measured segment duration,
    moved back by ``keep_sec`` whole seconds so adjacent segments overlap.

    Args:
        file_path: Video to split.
        output_dir: Parent directory; segments go into ``<stem>_split`` below it.
        size_limit: Maximum size of a single segment in bytes.
        ffmpeg_args: Extra ffmpeg output options; defaults to ``["-c", "copy"]``.
        keep_sec: Overlap between adjacent segments (truncated to whole seconds).

    Returns:
        ``(segment_paths, split_dir)``. Only segments with a positive probed
        duration are listed.
    """
    if ffmpeg_args is None:
        ffmpeg_args = ["-c", "copy"]
    base = file_path.stem
    split_dir = output_dir / f"{base}_split"
    split_dir.mkdir(parents=True, exist_ok=True)
    ext = file_path.suffix.lstrip(".")
    total_duration = int(await self.get_duration(file_path))
    self.logger(f"视频分割: duration={total_duration}s, size_limit={size_limit}")

    overlap = int(keep_sec)
    cur, part = 0, 1
    output_paths: list[Path] = []
    while cur < total_duration:
        out_file = split_dir / f"{base}_part_{part:03d}.{ext}"
        cmd = [
            "ffmpeg",
            "-ss",
            str(cur),
            "-i",
            str(file_path),
            "-fs",
            str(size_limit),
            *ffmpeg_args,
            "-y",
            str(out_file),
        ]
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await proc.wait()
        segment_duration = await self.get_duration(out_file)
        new_dur = int(segment_duration)
        self.logger(f"分割 part {part}: offset={cur}s, duration={new_dur}s, file={out_file}")
        if segment_duration <= 0:
            # ffmpeg produced nothing usable; do not hand this path to callers.
            break
        output_paths.append(out_file)
        if new_dur <= 0:
            # Sub-second segment: keep it, but the offset cannot move forward.
            break
        next_cur = cur + new_dur
        if next_cur < total_duration:
            # Rewind for overlap, but always advance by at least one second so
            # an overlap >= segment length cannot stall the loop.
            next_cur = max(next_cur - overlap, cur + 1)
        cur = next_cur
        part += 1
    self.logger(f"视频分割完成: {len(output_paths)} 段")
    return output_paths, split_dir
# ------------------------------------------------------------------ #
# 工具方法
# ------------------------------------------------------------------ #
@staticmethod
def get_media_type_by_mime(file_path: str | Path) -> str:
mime, _ = mimetypes.guess_type(str(file_path))
if mime:
if mime.startswith("image/"):
return "image"
if mime.startswith("video/"):
return "video"
return "unknown"
async def main() -> None:
    """Manual smoke test: process one hard-coded local file and print the outputs."""
    unit = MediaProcessingUnit(output_dir=Path(r"D:\Downloads\新建文件夹"))
    outcome = await unit.process(r"D:\Downloads\36751083810-1-30066.mp4")
    print(outcome.output_paths)
# Run the manual smoke test only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
================================================
FILE: utils/ph.py
================================================
import random
from dataclasses import dataclass
from typing import Any
from telegraph.aio import Telegraph as TelegraphAPI
class Telegraph:
    """Thin async wrapper around the ``telegraph`` client library."""

    def __init__(self, token: str | None = None, domain: str = "telegra.ph"):
        self.token = token
        self.domain = domain
        self.telegraph = TelegraphAPI(access_token=token, domain=domain)

    async def create_account(
        self, short_name: str, author_name: str | None = None, author_url: str | None = None
    ) -> "TelegraphAccount":
        """Create a Telegraph account and remember its access token."""
        created = await self.telegraph.create_account(short_name, author_name, author_url)
        account = await self.get_account_info(created)
        self.token = account.access_token
        return account

    async def get_account_info(self, account_info: dict[str, str] | None = None) -> "TelegraphAccount":
        """Build a ``TelegraphAccount`` from ``account_info``.

        When no (or an empty) mapping is supplied, the account fields are
        fetched from the API first.
        """
        if not account_info:
            account_info = await self.telegraph.get_account_info(
                [
                    "short_name",
                    "author_name",
                    "author_url",
                    "auth_url",
                ]
            )
        return TelegraphAccount(
            access_token=self.telegraph.get_access_token(),
            short_name=account_info["short_name"],
            author_name=account_info["author_name"],
            author_url=account_info["author_url"],
            auth_url=account_info["auth_url"],
        )

    async def create_page(
        self,
        title: str,
        content: list[dict[str, Any]] | None = None,
        html_content: str | None = None,
        author_name: str | None = None,
        author_url: str | None = None,
        return_content: bool = False,
        auto_create_account: bool = True,
    ) -> "TelegraphPage":
        """Create a Telegraph page and return it together with its account.

        If no token is known yet and ``auto_create_account`` is true, an
        account with a random ``tg_<6 digits>`` short name is created first.
        """
        needs_account = auto_create_account and not self.token
        if needs_account:
            # Random short name for the throwaway account.
            await self.create_account("tg_" + str(random.randint(100000, 999999)))

        page = await self.telegraph.create_page(
            title,
            content,
            html_content,
            author_name,
            author_url,
            return_content,
        )
        # Read every response field before the follow-up account request.
        path, url, page_title, description, views, can_edit = (
            page[key] for key in ("path", "url", "title", "description", "views", "can_edit")
        )
        owner = await self.get_account_info()
        return TelegraphPage(
            path=path,
            url=url,
            title=page_title,
            description=description,
            views=views,
            can_edit=can_edit,
            account=owner,
        )
@dataclass
class TelegraphAccount:
    """Telegraph account details, as assembled by ``Telegraph.get_account_info``."""

    # Token obtained from the underlying client's get_access_token().
    access_token: str
    # Value of the "short_name" key in the account payload.
    short_name: str
    # Value of the "author_name" key in the account payload.
    author_name: str
    # Value of the "author_url" key in the account payload.
    author_url: str
    # Value of the "auth_url" key in the account payload.
    auth_url: str
@dataclass
class TelegraphPage:
    """A created Telegraph page, as assembled by ``Telegraph.create_page``."""

    # The fields below mirror the same-named keys of the create_page response.
    path: str
    url: str
    title: str
    description: str
    views: int
    can_edit: bool
    # Account info fetched right after the page was created.
    account: TelegraphAccount