Repository: rhasspy/wyoming-faster-whisper Branch: main Commit: df3e5bac007b Files: 32 Total size: 51.6 KB Directory structure: gitextract_nf4x59hb/ ├── .dockerignore ├── .github/ │ └── workflows/ │ ├── publish.yml │ └── test.yml ├── .gitignore ├── .isort.cfg ├── .projectile ├── CHANGELOG.md ├── Dockerfile ├── LICENSE.md ├── README.md ├── docker_run.sh ├── mypy.ini ├── pylintrc ├── pyproject.toml ├── script/ │ ├── format │ ├── lint │ ├── package │ ├── run │ ├── setup │ └── test ├── setup.cfg ├── tests/ │ ├── __init__.py │ └── test_faster_whisper.py └── wyoming_faster_whisper/ ├── __init__.py ├── __main__.py ├── const.py ├── dispatch_handler.py ├── faster_whisper_handler.py ├── models.py ├── onnx_asr_handler.py ├── sherpa_handler.py └── transformers_whisper.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ * !wyoming_faster_whisper/*.py !pyproject.toml !docker_run.sh ================================================ FILE: .github/workflows/publish.yml ================================================ --- name: Release (DockerHub + GitHub) on: workflow_dispatch: push: tags: - "v*.*.*" # e.g. v1.2.3 jobs: build: runs-on: ubuntu-latest outputs: tag: ${{ steps.read_ver.outputs.tag }} file_ver: ${{ steps.read_ver.outputs.file_ver }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.13" - name: Install build tools run: pip install tomli setuptools wheel build - name: Read version from pyproject.toml & verify tag id: read_ver shell: python run: | import os, sys try: import tomllib as toml # Python 3.11+ except Exception: import tomli as toml # fallback # Read pyproject version with open("pyproject.toml","rb") as f: ver = toml.load(f)["project"]["version"] # Extract tag from GITHUB_REF (e.g. 
"refs/tags/v1.2.3" -> "1.2.3") ref = os.environ["GITHUB_REF"] # Safety checks & strip prefix prefix = "refs/tags/v" if not ref.startswith(prefix): print(f"Unexpected GITHUB_REF: {ref}", file=sys.stderr) sys.exit(1) tag = ref[len(prefix):] if tag != ver: print(f"Tag ({tag}) != pyproject.toml ({ver})", file=sys.stderr) sys.exit(1) # Expose outputs with open(os.environ["GITHUB_OUTPUT"], "a") as fh: fh.write(f"tag={tag}\n") fh.write(f"file_ver={ver}\n") print(f"Version OK: tag={tag} matches pyproject.toml={ver}") - name: Build wheel run: | python -m build --wheel --sdist - name: Upload wheels and sdist uses: actions/upload-artifact@v4 with: name: dist-wyoming-faster-whisper path: | dist/*.whl dist/*.tar.gz if-no-files-found: error # Enables building foreign architectures (e.g., arm64 on amd64 runner) - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Log in to DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build uses: docker/build-push-action@v5 with: context: . 
file: ./Dockerfile platforms: linux/amd64,linux/arm64 push: true tags: | rhasspy/wyoming-whisper:latest rhasspy/wyoming-whisper:${{ steps.read_ver.outputs.tag }} cache-from: type=gha cache-to: type=gha,mode=max changelog: needs: build runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Extract release notes (fail if missing) id: notes shell: python env: TAG: ${{ needs.build.outputs.tag }} run: | import os, sys, re, pathlib ver = os.environ["TAG"] # e.g., 1.0.2 chlog_path = pathlib.Path("CHANGELOG.md") if not chlog_path.exists(): print("CHANGELOG.md not found", file=sys.stderr) sys.exit(2) lines = chlog_path.read_text(encoding="utf-8").splitlines() def is_heading_for_version(s: str) -> bool: s = s.strip() if not s.startswith("##"): return False s = s[2:].strip() # drop "##" s = s.strip("[]") # allow [1.0.2] s = re.sub(r"\s*-\s*.*$", "", s) # drop " - date" s = s.lstrip("v") # allow v1.0.2 return s == ver start_idx = None for i, line in enumerate(lines): if is_heading_for_version(line): start_idx = i + 1 # start AFTER heading break if start_idx is None: print(f"No changelog section found for {ver}", file=sys.stderr) sys.exit(3) end_idx = len(lines) for j in range(start_idx, len(lines)): if lines[j].lstrip().startswith("## "): end_idx = j break section_lines = lines[start_idx:end_idx] while section_lines and not section_lines[0].strip(): section_lines.pop(0) while section_lines and not section_lines[-1].strip(): section_lines.pop() section = "\n".join(section_lines) only_links = re.fullmatch(r"(?:\[[^\]]+\]:\s*\S+\s*(?:\n|$))*", section or "", flags=re.MULTILINE) if not section or only_links: print(f"Changelog section for {ver} is empty", file=sys.stderr) sys.exit(4) pathlib.Path("RELEASE_NOTES.md").write_text(section, encoding="utf-8") - name: Upload release notes uses: actions/upload-artifact@v4 with: name: release-notes path: RELEASE_NOTES.md publish: needs: [build, changelog] runs-on: ubuntu-latest environment: pypi permissions: id-token: write # 
required for PyPI Trusted Publishing (OIDC) contents: read steps: - name: Download all dists uses: actions/download-artifact@v4 with: pattern: dist-* merge-multiple: true path: dist - name: Publish to PyPI via OIDC uses: pypa/gh-action-pypi-publish@release/v1 with: verbose: true github_release: needs: [build, changelog] runs-on: ubuntu-latest permissions: contents: write steps: - name: Download all dists uses: actions/download-artifact@v4 with: pattern: dist-* merge-multiple: true path: dist - uses: actions/download-artifact@v4 with: name: release-notes path: . - name: Create GitHub Release uses: softprops/action-gh-release@v2 with: tag_name: ${{ github.ref_name }} name: ${{ github.ref_name }} body_path: RELEASE_NOTES.md files: | dist/* ================================================ FILE: .github/workflows/test.yml ================================================ --- name: Test on: workflow_dispatch: pull_request: branches: - main jobs: build_wheels: name: Test on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.13" - name: Install run: | script/setup --dev --transformers --sherpa - name: Run tests run: script/test ================================================ FILE: .gitignore ================================================ .DS_Store .idea *.log tmp/ *.py[cod] *.egg *.egg-info/ build htmlcov /.venv/ .mypy_cache/ __pycache__/ /dist/ /local/ ================================================ FILE: .isort.cfg ================================================ [settings] multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 use_parentheses=True line_length=88 ================================================ FILE: .projectile ================================================ - /.venv/ ================================================ FILE: CHANGELOG.md ================================================ # Changelog ## 3.1.1 (unreleased) - 
Fix transformers language - Add initial prompt to transformers ## 3.1.0 - Refactor to dynamically load models - Only prefer Parakeet for English (other languages don't detect reliably) - Add `--vad-filter`, `--vad-threshold`, `--vad-min-speech-ms`, `--vad-min-silence-ms` (thanks @lmoe) - Add `zeroconf` to Docker image ## 3.0.2 - Set `--data-dir /data` in Docker run script ## 3.0.1 - Fix model auto selection logic ## 3.0.0 - Add support for `sherpa-onnx` and Nvidia's parakeet model - Add support for [GigaAM](https://github.com/salute-developers/GigaAM) for Russian via [`onnx-asr`](https://github.com/istupakov/onnx-asr) - Add `--stt-library` to select speech-to-text library (deprecate `--use-transformers`) - Default `--model` to "auto" (prefer parakeet) - Add Docker build here - Default `--language` to "auto" - Add `--cpu-threads` for faster-whisper (@Zerwin) ## 2.5.0 - Add support for HuggingFace transformers Whisper models (--use-transformers) ## 2.4.0 - Add "auto" for model and beam size (0) to select values based on CPU ## 2.3.0 - Bump faster-whisper package to 1.1.0 - Supports model `turbo` for faster processing ## 2.2.0 - Bump faster-whisper package to 1.0.3 ## 2.1.0 - Added `--initial-prompt` (see https://github.com/openai/whisper/discussions/963) ## 2.0.0 - Use faster-whisper PyPI package - `--model` can now be a HuggingFace model like `Systran/faster-distil-whisper-small.en` ## 1.1.0 - Fix enum use for Python 3.11+ - Add tests and Github actions - Bump tokenizers to 0.15 - Bump wyoming to 1.5.2 ## 1.0.0 - Initial release ================================================ FILE: Dockerfile ================================================ FROM debian:bookworm-slim ARG TARGETARCH ARG TARGETVARIANT # Install faster-whisper WORKDIR /usr/src COPY ./pyproject.toml ./ RUN \ apt-get update \ && apt-get install -y --no-install-recommends \ python3 \ python3-pip \ python3-venv \ \ && python3 -m venv .venv \ && .venv/bin/pip3 install --no-cache-dir -U \ setuptools \ wheel 
\ && .venv/bin/pip3 install --no-cache-dir \ --extra-index-url 'https://download.pytorch.org/whl/cpu' \ 'torch==2.6.0' \ \ && .venv/bin/pip3 install --no-cache-dir \ --extra-index-url https://www.piwheels.org/simple \ -e '.[zeroconf,transformers,sherpa,onnx-asr]' \ \ && rm -rf /var/lib/apt/lists/* COPY ./ ./ EXPOSE 10400 ENTRYPOINT ["bash", "docker_run.sh"] ================================================ FILE: LICENSE.md ================================================ MIT License Copyright (c) 2025 Michael Hansen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Wyoming Faster Whisper [Wyoming protocol](https://github.com/rhasspy/wyoming) server for the [faster-whisper](https://github.com/guillaumekln/faster-whisper/) speech to text system. 
## Home Assistant Add-on [![Show add-on](https://my.home-assistant.io/badges/supervisor_addon.svg)](https://my.home-assistant.io/redirect/supervisor_addon/?addon=core_whisper) [Source](https://github.com/home-assistant/addons/tree/master/whisper) ## Local Install Clone the repository and set up Python virtual environment: ``` sh git clone https://github.com/rhasspy/wyoming-faster-whisper.git cd wyoming-faster-whisper script/setup ``` Run a server anyone can connect to: ```sh script/run --model tiny-int8 --language en --uri 'tcp://0.0.0.0:10300' --data-dir /data --download-dir /data ``` The `--model` can also be a HuggingFace model like `Systran/faster-distil-whisper-small.en` **NOTE**: Models are downloaded to the first `--data-dir` directory. ## Docker Image ``` sh docker run -it -p 10300:10300 -v /path/to/local/data:/data rhasspy/wyoming-whisper \ --model tiny-int8 --language en ``` **NOTE**: Models are downloaded to `/data`, so make sure this points to a Docker volume. [Source](https://github.com/rhasspy/wyoming-addons/tree/master/whisper) ================================================ FILE: docker_run.sh ================================================ #!/usr/bin/env bash cd /usr/src .venv/bin/python3 -m wyoming_faster_whisper \ --uri 'tcp://0.0.0.0:10300' --data-dir '/data' "$@" ================================================ FILE: mypy.ini ================================================ [mypy] ignore_missing_imports = true [mypy-setuptools.*] ignore_missing_imports = True ================================================ FILE: pylintrc ================================================ [MASTER] ignore=faster_whisper [MESSAGES CONTROL] disable= format, abstract-method, cyclic-import, duplicate-code, global-statement, import-outside-toplevel, inconsistent-return-statements, locally-disabled, not-context-manager, too-few-public-methods, too-many-arguments, too-many-branches, too-many-instance-attributes, too-many-lines, too-many-locals, too-many-public-methods, 
too-many-return-statements, too-many-statements, too-many-boolean-expressions, unnecessary-pass, unused-argument, broad-except, too-many-nested-blocks, invalid-name, unused-import, fixme, useless-super-delegation, missing-module-docstring, missing-class-docstring, missing-function-docstring, import-error, consider-using-with, too-many-positional-arguments [FORMAT] expected-line-ending-format=LF ================================================ FILE: pyproject.toml ================================================ [project] name = "wyoming-faster-whisper" version = "3.1.1" description = "Wyoming Server for Faster Whisper" readme = "README.md" requires-python = ">=3.8" license = {text = "MIT"} authors = [ {name = "Michael Hansen", email = "mike@rhasspy.org"} ] keywords = ["rhasspy", "wyoming", "whisper", "stt"] classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "Topic :: Multimedia :: Sound/Audio :: Speech", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", ] dependencies = [ "wyoming>=1.8,<2", "faster-whisper>=1.2.1,<2", ] [project.urls] Homepage = "http://github.com/rhasspy/wyoming-faster-whisper" [tool.setuptools] platforms = ["any"] zip-safe = true include-package-data = true [tool.setuptools.packages.find] include = ["wyoming_faster_whisper"] exclude = ["tests"] [project.scripts] wyoming-faster-whisper = "wyoming_faster_whisper.__main__:run" [build-system] requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" [tool.black] line-length = 88 [tool.isort] profile = "black" [tool.pytest.ini_options] asyncio_mode = "auto" [tool.mypy] check_untyped_defs = true disallow_untyped_defs = true [project.optional-dependencies] dev = [ "black", "flake8", "isort", "mypy", "pylint", 
"pytest", "pytest-asyncio", ] transformers = [ "transformers[torch]==4.52.4", ] sherpa = [ "sherpa-onnx>=1.12.19,<2", ] onnx_asr = [ "onnx-asr[cpu,hub]==0.7.0", ] zeroconf = [ "wyoming[zeroconf]", ] ================================================ FILE: script/format ================================================ #!/usr/bin/env python3 import subprocess import venv from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _VENV_DIR = _PROGRAM_DIR / ".venv" _MODULE = _PROGRAM_DIR.name.replace("-", "_") _MODULE_DIR = _PROGRAM_DIR / _MODULE if _VENV_DIR.exists(): context = venv.EnvBuilder().ensure_directories(_VENV_DIR) python_exe = context.env_exe else: python_exe = "python3" subprocess.check_call([python_exe, "-m", "black", str(_MODULE_DIR)]) subprocess.check_call([python_exe, "-m", "isort", str(_MODULE_DIR)]) ================================================ FILE: script/lint ================================================ #!/usr/bin/env python3 import subprocess import venv from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _VENV_DIR = _PROGRAM_DIR / ".venv" _MODULE = _PROGRAM_DIR.name.replace("-", "_") _MODULE_DIR = _PROGRAM_DIR / _MODULE if _VENV_DIR.exists(): context = venv.EnvBuilder().ensure_directories(_VENV_DIR) python_exe = context.env_exe else: python_exe = "python3" subprocess.check_call([python_exe, "-m", "black", str(_MODULE_DIR), "--check"]) subprocess.check_call([python_exe, "-m", "isort", str(_MODULE_DIR), "--check"]) subprocess.check_call([python_exe, "-m", "flake8", str(_MODULE_DIR)]) subprocess.check_call([python_exe, "-m", "pylint", str(_MODULE_DIR)]) subprocess.check_call([python_exe, "-m", "mypy", str(_MODULE_DIR)]) ================================================ FILE: script/package ================================================ #!/usr/bin/env python3 import shutil import subprocess import sys import platform import venv from pathlib import Path _DIR = Path(__file__).parent 
_PROGRAM_DIR = _DIR.parent _VENV_DIR = _PROGRAM_DIR / ".venv" _MODULE = _PROGRAM_DIR.name.replace("-", "_") _MODULE_DIR = _PROGRAM_DIR / _MODULE _SRC_LIB_DIR = _PROGRAM_DIR / "lib" _DEST_LIB_DIR = _MODULE_DIR / "lib" if _VENV_DIR.exists(): context = venv.EnvBuilder().ensure_directories(_VENV_DIR) python_exe = context.env_exe else: python_exe = "python3" machine = platform.machine().lower() is_arm = ("arm" in machine) or ("aarch" in machine) is_amd64 = machine in ("x86_64", "amd64") system = sys.platform platform_name = "" if system.startswith("linux"): if is_arm: platform_name = "linux_arm64" if is_amd64: platform_name = "linux_amd64" if system == "win32": if is_amd64: platform_name = "windows_amd64" if system == "darwin": if is_arm: platform_name = "darwin_arm64" if not platform_name: raise SystemExit("Unsupported platform") platform_src_dir = _SRC_LIB_DIR / platform_name platform_dest_dir = _DEST_LIB_DIR if _DEST_LIB_DIR.exists(): shutil.rmtree(_DEST_LIB_DIR) shutil.copytree(platform_src_dir, platform_dest_dir) subprocess.check_call([python_exe, "-m", "build", "--wheel", "--sdist"]) ================================================ FILE: script/run ================================================ #!/usr/bin/env python3 import sys import subprocess import venv from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _MODULE = _PROGRAM_DIR.name.replace("-", "_") _VENV_DIR = _PROGRAM_DIR / ".venv" if _VENV_DIR.exists(): context = venv.EnvBuilder().ensure_directories(_VENV_DIR) python_exe = context.env_exe else: python_exe = "python3" subprocess.check_call([python_exe, "-m", _MODULE] + sys.argv[1:]) ================================================ FILE: script/setup ================================================ #!/usr/bin/env python3 import argparse import subprocess import venv from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _VENV_DIR = _PROGRAM_DIR / ".venv" parser = argparse.ArgumentParser() 
parser.add_argument("--dev", action="store_true", help="Install dev requirements") parser.add_argument( "--transformers", action="store_true", help="Install transformers requirements" ) parser.add_argument("--sherpa", action="store_true", help="Install sherpa requirements") args = parser.parse_args() # Create virtual environment builder = venv.EnvBuilder(with_pip=True) context = builder.ensure_directories(_VENV_DIR) builder.create(_VENV_DIR) # Upgrade dependencies pip = [context.env_exe, "-m", "pip"] subprocess.check_call(pip + ["install", "--upgrade", "pip"]) subprocess.check_call(pip + ["install", "--upgrade", "setuptools", "wheel"]) extras = [] if args.dev: extras.append("dev") if args.transformers: extras.append("transformers") if args.sherpa: extras.append("sherpa") extras_str = "" if extras: extras_str = "[" + ",".join(extras) + "]" # Install requirements subprocess.check_call(pip + ["install", "-e", f"{_PROGRAM_DIR}{extras_str}"]) ================================================ FILE: script/test ================================================ #!/usr/bin/env python3 import subprocess import venv from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _VENV_DIR = _PROGRAM_DIR / ".venv" _TESTS_DIR = _PROGRAM_DIR / "tests" if _VENV_DIR.exists(): context = venv.EnvBuilder().ensure_directories(_VENV_DIR) python_exe = context.env_exe else: python_exe = "python3" subprocess.check_call([python_exe, "-m", "pytest", str(_TESTS_DIR)]) ================================================ FILE: setup.cfg ================================================ [flake8] # To work with Black max-line-length = 88 # E501: line too long # W503: Line break occurred before a binary operator # E203: Whitespace before ':' # D202 No blank lines allowed after function docstring # W504 line break after binary operator ignore = E501, W503, E203, D202, W504 exclude = wyoming_faster_whisper/faster_whisper [isort] multi_line_output = 3 include_trailing_comma=True 
force_grid_wrap=0 use_parentheses=True line_length=88 indent = " " ================================================ FILE: tests/__init__.py ================================================ """Tests for wyoming-faster-whisper.""" from pathlib import Path _DIR = Path(__file__).parent _PROGRAM_DIR = _DIR.parent _LOCAL_DIR = _PROGRAM_DIR / "local" _SAMPLES_PER_CHUNK = 1024 # Need to give time for the model to download _START_TIMEOUT = 60 _TRANSCRIBE_TIMEOUT = 60 ================================================ FILE: tests/test_faster_whisper.py ================================================ """Tests for faster-whisper.""" import asyncio import re import sys import wave from asyncio.subprocess import PIPE import pytest from wyoming.asr import Transcribe, Transcript from wyoming.audio import AudioStart, AudioStop, wav_to_chunks from wyoming.event import async_read_event, async_write_event from wyoming.info import Describe, Info from . import _LOCAL_DIR, _SAMPLES_PER_CHUNK, _START_TIMEOUT, _TRANSCRIBE_TIMEOUT, _DIR @pytest.mark.parametrize( ("stt_library", "model"), [ ("faster-whisper", "base-int8"), ("transformers", "openai/whisper-base.en"), ("sherpa", "auto"), ], ) @pytest.mark.asyncio async def test_faster_whisper(stt_library: str, model: str) -> None: proc = await asyncio.create_subprocess_exec( sys.executable, "-m", "wyoming_faster_whisper", "--uri", "stdio://", "--stt-library", stt_library, "--model", model, "--data-dir", str(_LOCAL_DIR), "--language", "en", "--vad-filter", stdin=PIPE, stdout=PIPE, ) assert proc.stdin is not None assert proc.stdout is not None # Check info await async_write_event(Describe().event(), proc.stdin) while True: event = await asyncio.wait_for( async_read_event(proc.stdout), timeout=_START_TIMEOUT ) assert event is not None if not Info.is_type(event.type): continue info = Info.from_event(event) assert len(info.asr) == 1, "Expected one asr service" break await async_write_event(Transcribe().event(), proc.stdin) # Test known WAV with 
wave.open(str(_DIR / "turn_on_the_living_room_lamp.wav"), "rb") as example_wav: await async_write_event( AudioStart( rate=example_wav.getframerate(), width=example_wav.getsampwidth(), channels=example_wav.getnchannels(), ).event(), proc.stdin, ) for chunk in wav_to_chunks(example_wav, _SAMPLES_PER_CHUNK): await async_write_event(chunk.event(), proc.stdin) await async_write_event(AudioStop().event(), proc.stdin) while True: event = await asyncio.wait_for( async_read_event(proc.stdout), timeout=_TRANSCRIBE_TIMEOUT ) assert event is not None if not Transcript.is_type(event.type): continue transcript = Transcript.from_event(event) text = transcript.text.lower().strip() text = re.sub(r"[^a-z ]", "", text) assert text == "turn on the living room lamp" break # Need to close stdin for graceful termination proc.stdin.close() _, stderr = await proc.communicate() assert proc.returncode == 0, stderr.decode() ================================================ FILE: wyoming_faster_whisper/__init__.py ================================================ """Wyoming server for faster-whisper.""" from importlib.metadata import version __version__ = version("wyoming_faster_whisper") __all__ = ["__version__"] ================================================ FILE: wyoming_faster_whisper/__main__.py ================================================ #!/usr/bin/env python3 import argparse import asyncio import logging import platform import re from functools import partial from typing import Any, Dict, Optional import faster_whisper from wyoming.info import AsrModel, AsrProgram, Attribution, Info from wyoming.server import AsyncServer, AsyncTcpServer from . 
import __version__ from .const import AUTO_LANGUAGE, AUTO_MODEL, PARAKEET_LANGUAGES, SttLibrary from .dispatch_handler import DispatchEventHandler from .models import ModelLoader _LOGGER = logging.getLogger(__name__) async def main() -> None: """Main entry point.""" parser = argparse.ArgumentParser() parser.add_argument("--uri", required=True, help="unix:// or tcp://") # parser.add_argument( "--zeroconf", nargs="?", const="faster-whisper", help="Enable discovery over zeroconf with optional name (default: faster-whisper)", ) # parser.add_argument( "--model", default=AUTO_MODEL, help=f"Name of model to use (or {AUTO_MODEL})" ) parser.add_argument( "--data-dir", required=True, action="append", help="Data directory to check for downloaded models", ) parser.add_argument( "--download-dir", help="Directory to download models into (default: first data dir)", ) parser.add_argument( "--device", default="cpu", help="Device to use for inference (default: cpu)", ) parser.add_argument( "--language", default=AUTO_LANGUAGE, help=f"Default language to set for transcription (default: {AUTO_LANGUAGE})", ) parser.add_argument( "--compute-type", default="default", help="Compute type (float16, int8, etc.)", ) parser.add_argument( "--beam-size", type=int, default=0, help="Size of beam during decoding (0 for auto)", ) parser.add_argument( "--cpu-threads", default=4, type=int, help="Number of CPU threads to use for inference (default: 4, faster-whisper and sherpa-onnx)", ) parser.add_argument( "--initial-prompt", help="Optional text to provide as a prompt for the first window (faster-whisper only)", ) parser.add_argument( "--vad-filter", action="store_true", help="Enable Silero VAD to filter out non-speech which can reduce hallucinations (default: false, faster-whisper only)", ) parser.add_argument( "--vad-threshold", type=float, default=0.5, help="VAD speech probability threshold (default: 0.5, faster-whisper only)", ) parser.add_argument( "--vad-min-speech-ms", type=int, default=250, 
help="VAD minimum speech duration in ms (default: 250, faster-whisper only)", ) parser.add_argument( "--vad-min-silence-ms", type=int, default=2000, help="VAD minimum silence duration in ms to split (default: 2000, faster-whisper only)", ) parser.add_argument( "--stt-library", choices=[lib.value for lib in SttLibrary], default=SttLibrary.AUTO, help="Set library to use for speech-to-text (may require extra dependencies)", ) parser.add_argument( "--local-files-only", action="store_true", help="Don't check HuggingFace hub for updates every time", ) # parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") parser.add_argument( "--log-format", default=logging.BASIC_FORMAT, help="Format for log messages" ) parser.add_argument( "--version", action="version", version=__version__, help="Print version and exit", ) args = parser.parse_args() if not args.download_dir: # Download to first data dir by default args.download_dir = args.data_dir[0] logging.basicConfig( level=logging.DEBUG if args.debug else logging.INFO, format=args.log_format ) _LOGGER.debug(args) args.stt_library = SttLibrary(args.stt_library) machine = platform.machine().lower() is_arm = ("arm" in machine) or ("aarch" in machine) if args.beam_size <= 0: args.beam_size = 1 if is_arm else 5 _LOGGER.debug("Beam size automatically selected: %s", args.beam_size) # Resolve model name model_name = args.model model_match = re.match(r"^(tiny|base|small|medium)[.-]int8$", args.model) if model_match: # Original models re-uploaded to huggingface model_size = model_match.group(1) model_name = f"{model_size}-int8" args.model = f"rhasspy/faster-whisper-{model_name}" if args.language == AUTO_LANGUAGE: # Whisper does not understand auto args.language = None if args.model == AUTO_MODEL: args.model = None wyoming_info = Info( asr=[ AsrProgram( name="faster-whisper", description="Faster Whisper transcription with CTranslate2", attribution=Attribution( name="Guillaume Klein", 
url="https://github.com/guillaumekln/faster-whisper/", ), installed=True, version=__version__, models=[ AsrModel( name=model_name, description=model_name, attribution=Attribution( name="Systran", url="https://huggingface.co/Systran", ), installed=True, languages=sorted( list( # pylint: disable=protected-access set(faster_whisper.tokenizer._LANGUAGE_CODES).union( PARAKEET_LANGUAGES ) ) ), version=faster_whisper.__version__, ) ], ) ], ) vad_parameters: Optional[Dict[str, Any]] = None if args.vad_filter: vad_parameters = { "threshold": args.vad_threshold, "min_speech_duration_ms": args.vad_min_speech_ms, "min_silence_duration_ms": args.vad_min_silence_ms, } loader = ModelLoader( preferred_stt_library=args.stt_library, preferred_language=args.language, download_dir=args.download_dir, local_files_only=args.local_files_only, model=args.model, compute_type=args.compute_type, device=args.device, beam_size=args.beam_size, cpu_threads=args.cpu_threads, initial_prompt=args.initial_prompt, vad_parameters=vad_parameters, ) # Load model _LOGGER.debug("Pre-loading transcriber") await loader.load_transcriber() server = AsyncServer.from_uri(args.uri) if args.zeroconf: if not isinstance(server, AsyncTcpServer): raise ValueError("Zeroconf requires tcp:// uri") from wyoming.zeroconf import HomeAssistantZeroconf tcp_server: AsyncTcpServer = server hass_zeroconf = HomeAssistantZeroconf( name=args.zeroconf, port=tcp_server.port, host=tcp_server.host ) await hass_zeroconf.register_server() _LOGGER.debug("Zeroconf discovery enabled") _LOGGER.info("Ready") await server.run( partial( DispatchEventHandler, wyoming_info, loader, ) ) # ----------------------------------------------------------------------------- def run() -> None: asyncio.run(main()) if __name__ == "__main__": try: run() except KeyboardInterrupt: pass ================================================ FILE: wyoming_faster_whisper/const.py ================================================ """Constants.""" from abc import ABC, 
abstractmethod from enum import Enum from pathlib import Path from typing import Optional, Union class SttLibrary(str, Enum): """Speech-to-text library.""" AUTO = "auto" FASTER_WHISPER = "faster-whisper" TRANSFORMERS = "transformers" SHERPA = "sherpa" ONNX_ASR = "onnx-asr" AUTO_LANGUAGE = "auto" AUTO_MODEL = "auto" PARAKEET_LANGUAGES = { "bg", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "de", "el", "hu", "it", "lv", "lt", "mt", "pl", "pt", "ro", "sk", "sl", "es", "sv", "ru", "uk", } class Transcriber(ABC): """Base class for transcribers.""" @abstractmethod def transcribe( self, wav_path: Union[str, Path], language: Optional[str], beam_size: int = 5, initial_prompt: Optional[str] = None, ) -> str: pass ================================================ FILE: wyoming_faster_whisper/dispatch_handler.py ================================================ """Event handler for clients of the server.""" import asyncio import logging import os import tempfile import wave from typing import Optional from wyoming.asr import Transcribe, Transcript from wyoming.audio import AudioChunk, AudioChunkConverter, AudioStop from wyoming.event import Event from wyoming.info import Describe, Info from wyoming.server import AsyncEventHandler from .const import Transcriber from .models import ModelLoader _LOGGER = logging.getLogger(__name__) class DispatchEventHandler(AsyncEventHandler): """Dispatches to appropriate transcriber.""" def __init__( self, wyoming_info: Info, loader: ModelLoader, *args, **kwargs, ) -> None: super().__init__(*args, **kwargs) self.wyoming_info_event = wyoming_info.event() self._loader = loader self._transcriber: Optional[Transcriber] = None self._transcriber_future: Optional[asyncio.Future] = None self._language: Optional[str] = None self._wav_dir = tempfile.TemporaryDirectory() self._wav_path = os.path.join(self._wav_dir.name, "speech.wav") self._wav_file: Optional[wave.Wave_write] = None self._audio_converter = AudioChunkConverter(rate=16000, width=2, 
channels=1) async def handle_event(self, event: Event) -> bool: if AudioChunk.is_type(event.type): # Audio is saved to a WAV file for transcription later. # None of the underlying models support streaming. chunk = self._audio_converter.convert(AudioChunk.from_event(event)) if self._wav_file is None: self._wav_file = wave.open(self._wav_path, "wb") self._wav_file.setframerate(chunk.rate) self._wav_file.setsampwidth(chunk.width) self._wav_file.setnchannels(chunk.channels) self._wav_file.writeframes(chunk.audio) if (self._transcriber is None) and (self._transcriber_future is None): # Load the transcriber in the background. # Hopefully it's ready by the time the audio stops. self._transcriber_future = asyncio.create_task( self._loader.load_transcriber(self._language) ) return True if AudioStop.is_type(event.type): _LOGGER.debug("Audio stoppped") if self._transcriber is None: # Get transcriber that was loading in the background assert self._transcriber_future is not None self._transcriber = await self._transcriber_future assert self._transcriber is not None assert self._wav_file is not None self._wav_file.close() self._wav_file = None # Do transcription in a separate thread text = await asyncio.to_thread( self._transcriber.transcribe, self._wav_path, self._language, beam_size=self._loader.beam_size, initial_prompt=self._loader.initial_prompt, ) _LOGGER.info(text) await self.write_event(Transcript(text=text).event()) _LOGGER.debug("Completed request") # Reset self._language = None self._transcriber = None return False if Transcribe.is_type(event.type): transcribe = Transcribe.from_event(event) self._language = transcribe.language or self._loader.preferred_language _LOGGER.debug("Language set to %s", self._language) return True if Describe.is_type(event.type): await self.write_event(self.wyoming_info_event) _LOGGER.debug("Sent info") return True return True ================================================ FILE: wyoming_faster_whisper/faster_whisper_handler.py 
================================================
"""Event handler for clients of the server."""

from pathlib import Path
from typing import Any, Dict, Optional, Union

import faster_whisper

from .const import Transcriber


class FasterWhisperTranscriber(Transcriber):
    """Transcriber backed by the faster-whisper (CTranslate2) library."""

    def __init__(
        self,
        model_id: str,
        cache_dir: Union[str, Path],
        device: str = "cpu",
        compute_type: str = "default",
        cpu_threads: int = 4,
        vad_parameters: Optional[Dict[str, Any]] = None,
    ) -> None:
        # VAD filtering is enabled whenever any parameters were provided.
        self.vad_filter = vad_parameters is not None
        self.vad_parameters = vad_parameters
        self.model = faster_whisper.WhisperModel(
            model_id,
            download_root=str(cache_dir),
            device=device,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
        )

    def transcribe(
        self,
        wav_path: Union[str, Path],
        language: Optional[str],
        beam_size: int = 5,
        initial_prompt: Optional[str] = None,
    ) -> str:
        """Return the transcript of the WAV file, joining all segments."""
        segments, _info = self.model.transcribe(
            str(wav_path),
            beam_size=beam_size,
            language=language,
            initial_prompt=initial_prompt,
            vad_filter=self.vad_filter,
            vad_parameters=self.vad_parameters,
        )
        text = " ".join(segment.text for segment in segments)
        return text


================================================
FILE: wyoming_faster_whisper/models.py
================================================
"""Logic for model selection, loading, and transcription."""

import asyncio
import logging
import platform
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union

from .const import SttLibrary, Transcriber
from .faster_whisper_handler import FasterWhisperTranscriber

_LOGGER = logging.getLogger(__name__)

# Cache key: (library, model id)
TRANSCRIBER_KEY = Tuple[SttLibrary, str]  # model id


class ModelLoader:
    """Load transcribers for models."""

    def __init__(
        self,
        preferred_stt_library: SttLibrary,
        preferred_language: Optional[str],
        download_dir: Union[str, Path],
        local_files_only: bool,
        model: Optional[str],
        compute_type: str,
        device: str,
        beam_size: int,
        cpu_threads: int,
        initial_prompt:
Optional[str],
        vad_parameters: Optional[Dict[str, Any]],
    ) -> None:
        self.preferred_stt_library = preferred_stt_library
        self.preferred_language = preferred_language
        self.download_dir = Path(download_dir)
        self.local_files_only = local_files_only  # faster-whisper only
        self.model = model
        self.compute_type = compute_type
        self.device = device
        self.beam_size = beam_size
        self.cpu_threads = cpu_threads
        self.initial_prompt = initial_prompt
        self.vad_parameters = vad_parameters

        # Loaded transcribers, keyed by (library, model id); a per-key lock
        # prevents the same model being loaded twice concurrently.
        self._transcriber: Dict[TRANSCRIBER_KEY, Transcriber] = {}
        self._transcriber_lock: Dict[TRANSCRIBER_KEY, asyncio.Lock] = defaultdict(
            asyncio.Lock
        )

    async def load_transcriber(self, language: Optional[str] = None) -> Transcriber:
        """Load or get transcriber from cache for a language."""
        language = language or self.preferred_language
        stt_library = self.preferred_stt_library

        # Check dependencies
        try:
            from .sherpa_handler import SherpaTranscriber

            has_sherpa = True
            _LOGGER.debug("Sherpa is available")
        except ImportError:
            has_sherpa = False
            _LOGGER.debug("Sherpa is NOT available")

        try:
            from .transformers_whisper import TransformersTranscriber

            has_transformers = True
            _LOGGER.debug("Transformers library is available")
        except ImportError:
            has_transformers = False
            _LOGGER.debug("Transformers library is NOT available")

        try:
            from .onnx_asr_handler import OnnxAsrTranscriber

            has_onnx_asr = True
            _LOGGER.debug("Onnx-ASR is available")
        except ImportError:
            has_onnx_asr = False
            _LOGGER.debug("Onnx-ASR is NOT available")

        # Select speech-to-text library
        if stt_library == SttLibrary.AUTO:
            # Default to faster-whisper
            stt_library = SttLibrary.FASTER_WHISPER

            if self.model is None:
                # auto: pick a better library for specific languages when its
                # dependencies are installed.
                if (language == "ru") and has_onnx_asr:
                    # Prefer GigaAM via onnx-asr
                    stt_library = SttLibrary.ONNX_ASR
                elif (language == "en") and has_sherpa:
                    # Prefer Parakeet via sherpa for English.
                    # The v3 Parakeet model claims to auto detect other
                    # languages, but it doesn't work.
                    stt_library = SttLibrary.SHERPA
        elif (
            ((stt_library == SttLibrary.TRANSFORMERS) and (not has_transformers))
            or ((stt_library == SttLibrary.SHERPA) and (not has_sherpa))
            or ((stt_library == SttLibrary.ONNX_ASR) and (not has_onnx_asr))
        ):
            # Fall back to faster-whisper
            stt_library = SttLibrary.FASTER_WHISPER
            _LOGGER.debug("Falling back to faster-whisper (missing dependencies)")

        # Select model
        model = self.model
        if model is None:
            # auto: smaller models are chosen on ARM machines (see guess_model)
            machine = platform.machine().lower()
            is_arm = ("arm" in machine) or ("aarch" in machine)
            model = guess_model(stt_library, language, is_arm)

        _LOGGER.debug(
            "Selected stt-library '%s' with model '%s'", stt_library.value, model
        )

        # Load transcriber
        assert stt_library != SttLibrary.AUTO
        assert model

        key = (stt_library, model)
        async with self._transcriber_lock[key]:
            transcriber = self._transcriber.get(key)
            if transcriber is not None:
                # Already loaded for this (library, model) pair
                return transcriber

            if stt_library == SttLibrary.SHERPA:
                from .sherpa_handler import SherpaTranscriber  # noqa: F811

                transcriber = SherpaTranscriber(
                    model, self.download_dir, cpu_threads=self.cpu_threads
                )
            elif stt_library == SttLibrary.ONNX_ASR:
                from .onnx_asr_handler import OnnxAsrTranscriber  # noqa: F811

                transcriber = OnnxAsrTranscriber(
                    model,
                    cache_dir=self.download_dir,
                    local_files_only=self.local_files_only,
                )
            elif stt_library == SttLibrary.TRANSFORMERS:
                from .transformers_whisper import TransformersTranscriber  # noqa: F811

                transcriber = TransformersTranscriber(
                    model,
                    cache_dir=self.download_dir,
                    local_files_only=self.local_files_only,
                )
            else:
                transcriber = FasterWhisperTranscriber(
                    model,
                    cache_dir=self.download_dir,
                    device=self.device,
                    compute_type=self.compute_type,
                    cpu_threads=self.cpu_threads,
                    vad_parameters=self.vad_parameters,
                )

            self._transcriber[key] = transcriber
            return transcriber

    async def transcribe(
        self, wav_path: Union[str, Path], language: Optional[str]
    ) -> str:
        """Transcribe WAV file using appropriate transcriber.

        Assume WAV file is 16Khz 16-bit mono PCM.
        """
        transcriber = await self.load_transcriber(language)
        # Transcription is blocking; run it off the event loop.
        text = await asyncio.to_thread(
            transcriber.transcribe,
            wav_path,
            language=language,
            beam_size=self.beam_size,
            initial_prompt=self.initial_prompt,
        )
        _LOGGER.debug("Transcribed audio: %s", text)
        return text


def guess_model(stt_library: SttLibrary, language: Optional[str], is_arm: bool) -> str:
    """Automatically guess STT model id.

    Smaller (tiny/int8) models are selected when is_arm is True.
    """
    if stt_library == SttLibrary.SHERPA:
        if language == "en":
            return "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8"

        # Non-English
        return "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"

    if stt_library == SttLibrary.TRANSFORMERS:
        if language == "en":
            if is_arm:
                return "openai/whisper-tiny.en"

            return "openai/whisper-base.en"

        # Non-English
        if is_arm:
            return "openai/whisper-tiny"

        return "openai/whisper-base"

    if stt_library == SttLibrary.ONNX_ASR:
        return "gigaam-v2-rnnt"

    # faster-whisper
    if is_arm:
        return "rhasspy/faster-whisper-tiny-int8"

    return "rhasspy/faster-whisper-base-int8"


================================================
FILE: wyoming_faster_whisper/onnx_asr_handler.py
================================================
"""Code for transcription using the onnx-asr library."""

import wave
from pathlib import Path
from typing import Optional, Union
from unittest.mock import patch

import numpy as np
import onnx_asr
from huggingface_hub import snapshot_download

from .const import Transcriber

_RATE = 16000


class OnnxAsrTranscriber(Transcriber):
    """Wrapper for onnx-asr model."""

    def __init__(
        self, model_id: str, cache_dir: Union[str, Path], local_files_only: bool
    ) -> None:
        """Initialize model."""

        # Force download to our cache dir by patching huggingface_hub's
        # snapshot_download for the duration of model loading.
        def snapshot_download_with_cache(*args, **kwargs) -> str:
            kwargs["cache_dir"] = str(Path(cache_dir).resolve())
            kwargs["local_files_only"] = local_files_only
            return snapshot_download(*args, **kwargs)

        with patch("huggingface_hub.snapshot_download", snapshot_download_with_cache):
            self.onnx_model = onnx_asr.load_model(model_id)

    def transcribe(
        self,
        wav_path: Union[str,
Path],
        language: Optional[str],
        beam_size: int = 5,
        initial_prompt: Optional[str] = None,
    ) -> str:
        """Returns transcription for WAV file.

        WAV file must be 16Khz 16-bit mono audio.
        """
        wav_file: wave.Wave_read = wave.open(str(wav_path), "rb")
        with wav_file:
            assert wav_file.getframerate() == _RATE, "Sample rate must be 16Khz"
            assert wav_file.getsampwidth() == 2, "Width must be 16-bit (2 bytes)"
            assert wav_file.getnchannels() == 1, "Audio must be mono"
            audio_bytes = wav_file.readframes(wav_file.getnframes())

        # Convert 16-bit PCM to float32 in [-1, 1]
        audio_array = (
            np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        )

        # Only pass language if one was requested
        recognize_kwargs = {}
        if language:
            recognize_kwargs["language"] = language

        text = self.onnx_model.recognize(  # type: ignore[call-overload]
            audio_array, sample_rate=_RATE, **recognize_kwargs
        )
        return text


================================================
FILE: wyoming_faster_whisper/sherpa_handler.py
================================================
"""Code for transcription using the sherpa-onnx library."""

import logging
import shutil
import tarfile
import urllib.request
import wave
from pathlib import Path
from typing import Optional, Union

import numpy as np
import sherpa_onnx as so

from .const import Transcriber

_LOGGER = logging.getLogger(__name__)

_RATE = 16000
_URL_FORMAT = "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/{model_id}.tar.bz2"


class SherpaTranscriber(Transcriber):
    """Wrapper for sherpa-onnx model."""

    def __init__(
        self,
        model_id: str,
        cache_dir: Union[str, Path],
        cpu_threads: int = 4,
    ) -> None:
        """Initialize model."""
        cache_dir = Path(cache_dir)
        model_dir = cache_dir / model_id
        _LOGGER.debug("Looking for sherpa model: %s", model_dir)

        if not model_dir.exists():
            url = _URL_FORMAT.format(model_id=model_id)
            _LOGGER.info("Downloading %s", url)
            cache_dir.mkdir(parents=True, exist_ok=True)
            try:
                # Download/extract to cache dir.
                # We assume that the .tar.bz2 contains a directory named after
                # the model id.
                with urllib.request.urlopen(url) as response:
                    # Streaming bz2 extraction directly from the HTTP response.
                    # NOTE(review): tarfile extraction without a filter is
                    # vulnerable to path traversal with a malicious archive;
                    # consider tar.extract(..., filter="data") (Python 3.12+).
                    with tarfile.open(fileobj=response, mode="r|bz2") as tar:
                        for member in tar:
                            tar.extract(member, path=cache_dir)
            except Exception:
                # Delete directory so we'll download again next time
                shutil.rmtree(model_dir, ignore_errors=True)
                raise

        # Load model
        self.recognizer = so.OfflineRecognizer.from_transducer(
            num_threads=cpu_threads,
            encoder=f"{model_dir}/encoder.int8.onnx",
            decoder=f"{model_dir}/decoder.int8.onnx",
            joiner=f"{model_dir}/joiner.int8.onnx",
            tokens=f"{model_dir}/tokens.txt",
            provider="cpu",
            model_type="nemo_transducer",
        )

        # Prime model so that the first transcription will be fast
        stream = self.recognizer.create_stream()
        stream.accept_waveform(_RATE, np.zeros(shape=(128), dtype=np.float32))
        self.recognizer.decode_stream(stream)

    def transcribe(
        self,
        wav_path: Union[str, Path],
        language: Optional[str],
        beam_size: int = 5,
        initial_prompt: Optional[str] = None,
    ) -> str:
        """Returns transcription for WAV file.

        WAV file must be 16Khz 16-bit mono audio.
        """
        wav_file: wave.Wave_read = wave.open(str(wav_path), "rb")
        with wav_file:
            assert wav_file.getframerate() == _RATE, "Sample rate must be 16Khz"
            assert wav_file.getsampwidth() == 2, "Width must be 16-bit (2 bytes)"
            assert wav_file.getnchannels() == 1, "Audio must be mono"
            audio_bytes = wav_file.readframes(wav_file.getnframes())

        # Convert 16-bit PCM to float32 in [-1, 1]
        audio_array = (
            np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        )

        # language/beam_size/initial_prompt are not used by this backend
        stream = self.recognizer.create_stream()
        stream.accept_waveform(_RATE, audio_array)
        self.recognizer.decode_stream(stream)

        return stream.result.text


================================================
FILE: wyoming_faster_whisper/transformers_whisper.py
================================================
"""Code for Whisper transcription using HuggingFace's transformers library."""

import wave
from pathlib import Path
from typing import Optional, Union

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

from .const import Transcriber

_RATE = 16000


class TransformersTranscriber(Transcriber):
    """Wrapper for HuggingFace transformers Whisper model."""

    def __init__(
        self,
        model_id: str,
        cache_dir: Optional[Union[str, Path]] = None,
        local_files_only: bool = False,
    ) -> None:
        """Initialize Whisper model."""
        self.processor = AutoProcessor.from_pretrained(
            model_id, cache_dir=cache_dir, local_files_only=local_files_only
        )
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, cache_dir=cache_dir, local_files_only=local_files_only
        )
        # Inference only
        self.model.eval()

    def transcribe(
        self,
        wav_path: Union[str, Path],
        language: Optional[str],
        beam_size: int = 5,
        initial_prompt: Optional[str] = None,
    ) -> str:
        """Returns transcription for WAV file.

        WAV file must be 16Khz 16-bit mono audio.
        """
        wav_file: wave.Wave_read = wave.open(str(wav_path), "rb")
        with wav_file:
            assert wav_file.getframerate() == _RATE, "Sample rate must be 16Khz"
            assert wav_file.getsampwidth() == 2, "Width must be 16-bit (2 bytes)"
            assert wav_file.getnchannels() == 1, "Audio must be mono"
            audio_bytes = wav_file.readframes(wav_file.getnframes())

        # Convert 16-bit PCM to float32 in [-1, 1]
        audio_tensor = (
            torch.frombuffer(audio_bytes, dtype=torch.int16).float() / 32768.0
        )

        inputs = self.processor(audio_tensor, sampling_rate=_RATE, return_tensors="pt")
        generate_args = {**inputs, "num_beams": beam_size}

        if initial_prompt:
            # Tokenize the prompt and pass it to generate()
            prompt_ids = (
                self.processor.tokenizer(
                    initial_prompt, return_tensors="pt", add_special_tokens=False
                )
                .input_ids[0]
                .to(self.model.device)
            )
            generate_args["prompt_ids"] = prompt_ids

        if language:
            # Force the decoder's language/task prefix tokens
            self.processor.tokenizer.set_prefix_tokens(
                language=language, task="transcribe"
            )

        with torch.no_grad():
            # Ignore warning about attention_mask because we're only doing a single utterance.
            generated_ids = self.model.generate(**generate_args)

        transcription = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        return transcription