Repository: SocialSisterYi/bcut-asr Branch: main Commit: 458313e2c9cc Files: 7 Total size: 22.1 KB Directory structure: gitextract_pq110vcu/ ├── .gitignore ├── LICENSE ├── README.md ├── bcut_asr/ │ ├── __init__.py │ ├── __main__.py │ └── orm.py └── pyproject.toml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 社会易姐QwQ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

Bcut-ASR

使用必剪 API 进行云端语音字幕识别,支持 CLI 和 module 调用 ## ✨Feature - 可直接上传`flac`, `aac`, `m4a`, `mp3`, `wav`音频格式 - 自动调用 ffmpeg, 实现视频伴音和其他音频格式转码 - 支持`srt`, `json`, `lrc`, `txt`格式字幕输出 - 字幕支持断句和首尾时间标记 - 可使用 stdout 输出字幕文本 ## 🚀Install 首先确保 ffmpeg 已安装,且 PATH 中可以访问,若未安装可以使用如下命令(已安装请无视): Linux: ```bash sudo apt install ffmpeg ``` Windows: ```powershell winget install ffmpeg ``` 本项目暂时未发布 pypi,应使用本地安装,Python 版本应 >= 3.10,需要安装 poetry ```bash git clone https://github.com/SocialSisterYi/bcut-asr cd bcut-asr poetry lock poetry build -f wheel pip install dist/bcut_asr-0.0.3-py3-none-any.whl # Example ``` ## 📃Usage ### CLI Interface ```bash bcut_asr video.mp4 ``` 或 ```bash bcut_asr video.mp4 subtitle.srt ``` 或 ```bash bcut_asr video.mp4 -f srt - > subtitle.srt ``` 长音频指定任务状态轮询间隔(秒),避免接口频繁调用 ```bash bcut_asr video.mp4 -f srt -i 30 - > subtitle.srt ``` ``` bcut_asr -h usage: bcut-asr [-h] [-f [{srt,json,lrc,txt}]] [-i [1.0]] input [output] 必剪语音识别 positional arguments: input 输入媒体文件 output 输出字幕文件, 可stdout options: -h, --help show this help message and exit -f [{srt,json,lrc,txt}], --format [{srt,json,lrc,txt}] 输出字幕格式 -i [1.0], --interval [1.0] 任务状态轮询间隔(秒) 支持输入音频格式: flac, aac, m4a, mp3, wav 支持自动调用ffmpeg提取视频伴音 ``` ### Module ```python from bcut_asr import BcutASR from bcut_asr.orm import ResultStateEnum asr = BcutASR('voice.mp3') asr.upload() # 上传文件 asr.create_task() # 创建任务 # 轮询检查结果 while True: result = asr.result() # 判断识别成功 if result.state == ResultStateEnum.COMPLETE: break # 解析字幕内容 subtitle = result.parse() # 判断是否存在字幕 if subtitle.has_data(): # 输出srt格式 print(subtitle.to_srt()) ``` 输入视频 ```python from bcut_asr import run_everywhere from argparse import Namespace f = open("file.mp4", "rb") argg = Namespace(format="srt", interval=30.0, input=f, output=None) run_everywhere(argg) ``` ================================================ FILE: bcut_asr/__init__.py ================================================ import logging import sys import time from os import PathLike from pathlib import Path from typing 
import Literal, Optional import ffmpeg import requests from .orm import ( ResourceCompleteRspSchema, ResourceCreateRspSchema, ResultRspSchema, ResultStateEnum, TaskCreateRspSchema, ) __version__ = "0.0.3" API_BASE_URL = "https://member.bilibili.com/x/bcut/rubick-interface" # 申请上传 API_REQ_UPLOAD = API_BASE_URL + "/resource/create" # 提交上传 API_COMMIT_UPLOAD = API_BASE_URL + "/resource/create/complete" # 创建任务 API_CREATE_TASK = API_BASE_URL + "/task" # 查询结果 API_QUERY_RESULT = API_BASE_URL + "/task/result" SUPPORT_SOUND_FORMAT = Literal["flac", "aac", "m4a", "mp3", "wav"] INFILE_FMT = ["flac", "aac", "m4a", "mp3", "wav"] OUTFILE_FMT = ["srt", "json", "lrc", "txt"] def ffmpeg_render(media_file: str) -> bytes: "提取视频伴音并转码为aac格式" out, err = ( ffmpeg.input(media_file, v="warning") .output("pipe:", ac=1, format="adts") .run(capture_stdout=True) ) return out def run_everywhere(argg): logging.basicConfig( format="%(asctime)s - [%(levelname)s] %(message)s", level=logging.INFO ) # 处理输入文件情况 infile = argg.input infile_name = infile.name if infile_name == "": logging.error("输入文件错误") sys.exit(-1) suffix = infile_name.rsplit(".", 1)[-1] if suffix in INFILE_FMT: infile_fmt = suffix infile_data = infile.read() else: # ffmpeg分离视频伴音 logging.info("非标准音频文件, 尝试调用ffmpeg转码") try: infile_data = ffmpeg_render(infile_name) except ffmpeg.Error: logging.error("ffmpeg转码失败") sys.exit(-1) else: logging.info("ffmpeg转码完成") infile_fmt = "aac" # 处理输出文件情况 outfile = argg.output if outfile is None: # 未指定输出文件,默认为文件名同输入,可以 -t 传参,默认str格式 if argg.format is not None: outfile_fmt = argg.format else: outfile_fmt = "srt" else: # 指定输出文件 outfile_name = outfile.name if outfile.name == "": # stdout情况,可以 -t 传参,默认str格式 if argg.format is not None: outfile_fmt = argg.format else: outfile_fmt = "srt" else: suffix = outfile_name.rsplit(".", 1)[-1] if suffix in OUTFILE_FMT: outfile_fmt = suffix else: logging.error("输出格式错误") sys.exit(-1) interval = argg.interval if interval is None: interval = 30.0 # 开始执行转换逻辑 asr = BcutASR() 
asr.set_data(raw_data=infile_data, data_fmt=infile_fmt) try: # 上传文件 asr.upload() # 创建任务 task_id = asr.create_task() while True: # 轮询检查任务状态 task_resp = asr.result() match task_resp.state: case ResultStateEnum.STOP: logging.info(f"等待识别开始") case ResultStateEnum.RUNING: logging.info(f"识别中-{task_resp.remark}") case ResultStateEnum.ERROR: logging.error(f"识别失败-{task_resp.remark}") sys.exit(-1) case ResultStateEnum.COMPLETE: outfile_name = f"{infile_name.rsplit('.', 1)[-2]}.{outfile_fmt}" outfile = open(outfile_name, "w", encoding="utf8") logging.info(f"识别成功") # 识别成功, 回读字幕数据 result = task_resp.parse() break time.sleep(interval) if not result.has_data(): logging.error("未识别到语音") sys.exit(-1) match outfile_fmt: case "srt": outfile.write(result.to_srt()) case "lrc": outfile.write(result.to_lrc()) case "json": outfile.write(result.json()) case "txt": outfile.write(result.to_txt()) outfile.close() logging.info(f"转换成功: {outfile_name}") except APIError as err: logging.error(f"接口错误: {err.__str__()}") sys.exit(-1) class APIError(Exception): "接口调用错误" def __init__(self, code, msg) -> None: self.code = code self.msg = msg super().__init__() def __str__(self) -> str: return f"{self.code}:{self.msg}" class BcutASR: "必剪 语音识别接口" session: requests.Session sound_name: str sound_bin: bytes sound_fmt: SUPPORT_SOUND_FORMAT __in_boss_key: str __resource_id: str __upload_id: str __upload_urls: list[str] __per_size: int __clips: int __etags: list[str] __download_url: str task_id: str def __init__(self, file: Optional[str | PathLike] = None) -> None: self.session = requests.Session() self.task_id = None self.__etags = [] if file: self.set_data(file) def set_data( self, file: Optional[str | PathLike] = None, raw_data: Optional[bytes] = None, data_fmt: Optional[SUPPORT_SOUND_FORMAT] = None, ) -> None: "设置欲识别的数据" if file: if not isinstance(file, (str, PathLike)): raise TypeError("unknow file ptr") # 文件类 file = Path(file) self.sound_bin = open(file, "rb").read() suffix = data_fmt or file.suffix[1:] 
def upload(self) -> None:
    """Request an upload slot, then upload and commit the loaded audio.

    Posts the file name, size and format to the upload-request endpoint,
    stores the returned upload parameters on the instance, and then runs
    the part upload followed by the commit step.

    Raises:
        ValueError: if no audio data has been loaded with ``set_data``.
        APIError: if the API answers with a non-zero ``code``.

    HTTP-level failures are propagated by ``raise_for_status()``.
    """
    # sound_bin / sound_fmt are only assigned by set_data(); getattr makes a
    # fresh instance raise the intended ValueError, not AttributeError.
    if not getattr(self, "sound_bin", None) or not getattr(self, "sound_fmt", None):
        raise ValueError("none set data")
    resp = self.session.post(
        API_REQ_UPLOAD,
        data={
            "type": 2,
            "name": self.sound_name,
            "size": len(self.sound_bin),
            "resource_file_type": self.sound_fmt,
            "model_id": 7,
        },
    )
    resp.raise_for_status()
    payload = resp.json()
    code = payload["code"]
    if code:
        raise APIError(code, payload["message"])
    # model_validate is the pydantic v2 API (parse_obj is deprecated) and is
    # what the other methods of this class already use.
    resp_data = ResourceCreateRspSchema.model_validate(payload["data"])
    self.__in_boss_key = resp_data.in_boss_key
    self.__resource_id = resp_data.resource_id
    self.__upload_id = resp_data.upload_id
    self.__upload_urls = resp_data.upload_urls
    self.__per_size = resp_data.per_size
    # One clip per upload URL handed out by the server.
    self.__clips = len(resp_data.upload_urls)
    logging.info(
        f"申请上传成功, 总计大小{resp_data.size // 1024}KB, {self.__clips}分片, 分片大小{resp_data.per_size // 1024}KB: {self.__in_boss_key}"
    )
    # Start from an empty etag list so that uploading again on the same
    # instance does not commit etags left over from a previous upload.
    self.__etags = []
    self.__upload_part()
    self.__commit_upload()
def result(self, task_id: Optional[str] = None) -> ResultRspSchema:
    """Query the state (and, once finished, the result) of a task.

    Args:
        task_id: Task to query; when falsy, the id stored on the instance
            by ``create_task`` is used instead.

    Returns:
        The validated ``data`` section of the API response.

    Raises:
        APIError: if the API answers with a non-zero ``code``.
    """
    query = {"model_id": 7, "task_id": task_id or self.task_id}
    http_resp = self.session.get(API_QUERY_RESULT, params=query)
    http_resp.raise_for_status()
    payload = http_resp.json()
    status = payload["code"]
    if not status:
        return ResultRspSchema.model_validate(payload["data"])
    raise APIError(status, payload["message"])
import APIError, BcutASR, ResultStateEnum logging.basicConfig( format="%(asctime)s - [%(levelname)s] %(message)s", level=logging.INFO, ) INFILE_FMT = ["flac", "aac", "m4a", "mp3", "wav"] OUTFILE_FMT = ["srt", "json", "lrc", "txt"] parser = ArgumentParser( prog="bcut-asr", description="必剪语音识别\n", epilog=f"支持输入音频格式: {', '.join(INFILE_FMT)} 支持自动调用ffmpeg提取视频伴音", ) parser.add_argument( "-f", "--format", nargs="?", default="srt", choices=OUTFILE_FMT, help="输出字幕格式" ) parser.add_argument( "-i", "--interval", nargs="?", type=float, default="1.0", metavar="1.0", help="任务状态轮询间隔(秒)", ) parser.add_argument("input", type=FileType("rb"), help="输入媒体文件") parser.add_argument( "output", nargs="?", type=FileType("w", encoding="utf8"), help="输出字幕文件, 可stdout", ) def ffmpeg_render(media_file: str) -> bytes: "提取视频伴音并转码为aac格式" out, err = ( ffmpeg.input(media_file, v="warning") .output("pipe:", ac=1, format="adts") .run(capture_stdout=True) ) return out def main(): # 处理输入文件情况 args = parser.parse_args() infile = args.input infile_name = infile.name if infile_name == "": logging.error("输入文件错误") return -1 suffix = infile_name.rsplit(".", 1)[-1] if suffix in INFILE_FMT: infile_fmt = suffix infile_data = infile.read() else: # ffmpeg分离视频伴音 logging.info("非标准音频文件, 尝试调用ffmpeg转码") try: infile_data = ffmpeg_render(infile_name) except ffmpeg.Error: logging.error("ffmpeg转码失败") return -1 else: logging.info("ffmpeg转码完成") infile_fmt = "aac" # 处理输出文件情况 outfile = args.output if outfile is None: # 未指定输出文件,默认为文件名同输入,可以 -t 传参,默认str格式 if args.format is not None: outfile_fmt = args.format else: outfile_fmt = "srt" else: # 指定输出文件 outfile_name = outfile.name if outfile.name == "": # stdout情况,可以 -t 传参,默认str格式 if args.format is not None: outfile_fmt = args.format else: outfile_fmt = "srt" else: suffix = outfile_name.rsplit(".", 1)[-1] if suffix in OUTFILE_FMT: outfile_fmt = suffix else: logging.error("输出格式错误") return -1 interval = args.interval if interval is None: interval = 1.0 # 开始执行转换逻辑 asr = BcutASR() 
asr.set_data(raw_data=infile_data, data_fmt=infile_fmt) try: # 上传文件 asr.upload() # 创建任务 task_id = asr.create_task() while True: # 轮询检查任务状态 task_resp = asr.result() match task_resp.state: case ResultStateEnum.STOP: logging.info(f"等待识别开始") case ResultStateEnum.RUNING: logging.info(f"识别中 {task_resp.remark}") case ResultStateEnum.ERROR: logging.error(f"识别失败 {task_resp.remark}") sys.exit(-1) case ResultStateEnum.COMPLETE: logging.info(f"识别成功") outfile_name = f"{infile_name.rsplit('.', 1)[-2]}.{outfile_fmt}" outfile = open(outfile_name, "w", encoding="utf8") # 识别成功, 回读字幕数据 result = task_resp.parse() break time.sleep(interval) if not result.has_data(): logging.error("未识别到语音") return -1 match outfile_fmt: case "srt": outfile.write(result.to_srt()) case "lrc": outfile.write(result.to_lrc()) case "json": outfile.write(result.model_dump_json()) case "txt": outfile.write(result.to_txt()) outfile.close() logging.info(f"转换成功: {outfile_name}") except APIError as err: logging.error(f"接口错误: {err.__str__()}") return -1 if __name__ == "__main__": sys.exit(main()) ================================================ FILE: bcut_asr/orm.py ================================================ from enum import Enum from typing import Optional from pydantic import BaseModel class ASRDataSeg(BaseModel): """文字识别-断句""" class ASRDataWords(BaseModel): """文字识别-逐字""" label: str start_time: int end_time: int start_time: int end_time: int transcript: str words: list[ASRDataWords] def to_srt_ts(self) -> str: """转换为srt时间戳""" def _conv(ms: int) -> tuple[int, int, int, int]: return ms // 3600000, ms // 60000 % 60, ms // 1000 % 60, ms % 1000 s_h, s_m, s_s, s_ms = _conv(self.start_time) e_h, e_m, e_s, e_ms = _conv(self.end_time) return f"{s_h:02d}:{s_m:02d}:{s_s:02d},{s_ms:03d} --> {e_h:02d}:{e_m:02d}:{e_s:02d},{e_ms:03d}" def to_lrc_ts(self) -> str: """转换为lrc时间戳""" def _conv(ms: int) -> tuple[int, int, int]: return ms // 60000, ms // 1000 % 60, ms % 1000 // 10 s_m, s_s, s_ms = _conv(self.start_time) return 
class ResultStateEnum(Enum):
    """Task state codes carried in the ``state`` field of a result response."""

    STOP = 0  # not started yet
    RUNING = 1  # running (original spelling, remains the canonical member)
    # Correctly spelled alias: a repeated value makes RUNNING the very same
    # member as RUNING, so existing code and value lookups are unaffected.
    RUNNING = 1
    ERROR = 3  # failed
    COMPLETE = 4  # finished
["poetry-core"] build-backend = "poetry.core.masonry.api"